diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-10-26 11:42:12 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-10-26 11:42:12 +1100 |
commit | b11ed312cedb905dec49dd2c9c262fabf64d13cd (patch) | |
tree | 5073a2976683cf61035598f8937d06ad1819f9b4 | |
parent | 9cf2f89d6d95e1de797f05ab1fef28695f8bae7b (diff) | |
parent | ef9a7983ea47cea78400a4472a3d4e46735385c5 (diff) | |
download | mongo-b11ed312cedb905dec49dd2c9c262fabf64d13cd.tar.gz |
Merge branch 'mongodb-3.4' into mongodb-3.2
165 files changed, 5215 insertions, 2804 deletions
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index 4c7b17f102a..5b14a4cdf68 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -28,15 +28,19 @@ #include "wtperf.h" -/* All options changeable on command line using -o or -O are listed here. */ -static CONFIG_OPT config_opts[] = { +static CONFIG_OPT config_opts_desc[] = { /* Option descriptions */ #define OPT_DEFINE_DESC #include "wtperf_opt.i" #undef OPT_DEFINE_DESC }; -static int config_opt(CONFIG *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *); -static void config_opt_usage(void); +static CONFIG_OPTS config_opts_default = { /* Option defaults */ +#define OPT_DEFINE_DEFAULT +#include "wtperf_opt.i" +#undef OPT_DEFINE_DEFAULT + + { NULL, NULL } /* config_head */ +}; /* * STRING_MATCH -- @@ -47,6 +51,72 @@ static void config_opt_usage(void); (strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') /* + * config_opt_init -- + * Initialize the global configuration options. + */ +void +config_opt_init(CONFIG_OPTS **retp) +{ + CONFIG_OPT *desc; + CONFIG_OPTS *opts; + size_t i; + char **strp; + void *valueloc; + + opts = dmalloc(sizeof(CONFIG_OPTS)); + *opts = config_opts_default; + + TAILQ_INIT(&opts->config_head); + + /* + * Option strings come-and-go as we configure them, so allocate copies + * of the default strings now so that we can always free the string as + * we allocate new versions. + */ + for (i = 0, desc = config_opts_desc; + i < WT_ELEMENTS(config_opts_desc); i++, ++desc) + if (desc->type == CONFIG_STRING_TYPE || + desc->type == STRING_TYPE) { + valueloc = ((uint8_t *)opts + desc->offset); + strp = (char **)valueloc; + *strp = dstrdup(*strp); + } + + *retp = opts; +} + +/* + * config_opt_cleanup -- + * Clean up the global configuration options. 
+ */ +void +config_opt_cleanup(CONFIG_OPTS *opts) +{ + CONFIG_OPT *desc; + CONFIG_QUEUE_ENTRY *config_line; + size_t i; + char **strp; + void *valueloc; + + for (i = 0, desc = config_opts_desc; + i < WT_ELEMENTS(config_opts_desc); i++, ++desc) + if (desc->type == CONFIG_STRING_TYPE || + desc->type == STRING_TYPE) { + valueloc = ((uint8_t *)opts + desc->offset); + strp = (char **)valueloc; + free(*strp); + } + + while ((config_line = TAILQ_FIRST(&opts->config_head)) != NULL) { + TAILQ_REMOVE(&opts->config_head, config_line, q); + free(config_line->string); + free(config_line); + } + + free(opts); +} + +/* * config_unescape -- * Modify a string in place, replacing any backslash escape sequences. * The modified string is always shorter. @@ -94,168 +164,11 @@ config_unescape(char *orig) } /* - * config_copy -- - * CONFIG structure initialization, based on a source configuration. - */ -void -config_copy(CONFIG *dest, const CONFIG *src) -{ - CONFIG_QUEUE_ENTRY *conf_line, *tmp_line; - size_t i; - char *newstr, **pstr; - - memcpy(dest, src, sizeof(CONFIG)); - - if (src->home != NULL) - dest->home = dstrdup(src->home); - if (src->monitor_dir != NULL) - dest->monitor_dir = dstrdup(src->monitor_dir); - if (src->partial_config != NULL) - dest->partial_config = dstrdup(src->partial_config); - if (src->reopen_config != NULL) - dest->reopen_config = dstrdup(src->reopen_config); - if (src->base_uri != NULL) - dest->base_uri = dstrdup(src->base_uri); - - if (src->uris != NULL) { - dest->uris = dcalloc(src->table_count, sizeof(char *)); - for (i = 0; i < src->table_count; i++) - dest->uris[i] = dstrdup(src->uris[i]); - } - - if (src->async_config != NULL) - dest->async_config = dstrdup(src->async_config); - - dest->ckptthreads = NULL; - dest->popthreads = NULL; - dest->workers = NULL; - - if (src->workload != NULL) { - dest->workload = dcalloc(WORKLOAD_MAX, sizeof(WORKLOAD)); - memcpy(dest->workload, - src->workload, WORKLOAD_MAX * sizeof(WORKLOAD)); - } - - for (i = 0; i < 
sizeof(config_opts) / sizeof(config_opts[0]); i++) - if (config_opts[i].type == STRING_TYPE || - config_opts[i].type == CONFIG_STRING_TYPE) { - pstr = (char **) - ((u_char *)dest + config_opts[i].offset); - if (*pstr != NULL) { - newstr = dstrdup(*pstr); - *pstr = newstr; - } - } - - TAILQ_INIT(&dest->stone_head); - TAILQ_INIT(&dest->config_head); - - /* Clone the config string information into the new cfg object */ - TAILQ_FOREACH(conf_line, &src->config_head, c) { - tmp_line = dcalloc(sizeof(CONFIG_QUEUE_ENTRY), 1); - tmp_line->string = dstrdup(conf_line->string); - TAILQ_INSERT_TAIL(&dest->config_head, tmp_line, c); - } -} - -/* - * config_free -- - * Free any storage allocated in the config struct. - */ -void -config_free(CONFIG *cfg) -{ - CONFIG_QUEUE_ENTRY *config_line; - size_t i; - char **pstr; - - free(cfg->home); - free(cfg->monitor_dir); - free(cfg->partial_config); - free(cfg->reopen_config); - - /* Free the various URIs */ - free(cfg->base_uri); - free(cfg->log_table_uri); - - if (cfg->uris != NULL) { - for (i = 0; i < cfg->table_count; i++) - free(cfg->uris[i]); - free(cfg->uris); - } - - free(cfg->async_config); - - free(cfg->ckptthreads); - free(cfg->popthreads); - - free(cfg->workers); - free(cfg->workload); - - cleanup_truncate_config(cfg); - - while (!TAILQ_EMPTY(&cfg->config_head)) { - config_line = TAILQ_FIRST(&cfg->config_head); - TAILQ_REMOVE(&cfg->config_head, config_line, c); - free(config_line->string); - free(config_line); - } - - for (i = 0; i < sizeof(config_opts) / sizeof(config_opts[0]); i++) - if (config_opts[i].type == STRING_TYPE || - config_opts[i].type == CONFIG_STRING_TYPE) { - pstr = (char **) - ((u_char *)cfg + config_opts[i].offset); - free(*pstr); - *pstr = NULL; - } -} - -/* - * config_compress -- - * Parse the compression configuration. 
- */ -int -config_compress(CONFIG *cfg) -{ - int ret; - const char *s; - - ret = 0; - s = cfg->compression; - if (strcmp(s, "none") == 0) { - cfg->compress_ext = NULL; - cfg->compress_table = NULL; - } else if (strcmp(s, "lz4") == 0) { -#ifndef HAVE_BUILTIN_EXTENSION_LZ4 - cfg->compress_ext = LZ4_EXT; -#endif - cfg->compress_table = LZ4_BLK; - } else if (strcmp(s, "snappy") == 0) { -#ifndef HAVE_BUILTIN_EXTENSION_SNAPPY - cfg->compress_ext = SNAPPY_EXT; -#endif - cfg->compress_table = SNAPPY_BLK; - } else if (strcmp(s, "zlib") == 0) { -#ifndef HAVE_BUILTIN_EXTENSION_ZLIB - cfg->compress_ext = ZLIB_EXT; -#endif - cfg->compress_table = ZLIB_BLK; - } else { - fprintf(stderr, - "invalid compression configuration: %s\n", s); - ret = EINVAL; - } - return (ret); - -} - -/* * config_threads -- * Parse the thread configuration. */ static int -config_threads(CONFIG *cfg, const char *config, size_t len) +config_threads(WTPERF *wtperf, const char *config, size_t len) { WORKLOAD *workp; WT_CONFIG_ITEM groupk, groupv, k, v; @@ -263,19 +176,19 @@ config_threads(CONFIG *cfg, const char *config, size_t len) int ret; group = scan = NULL; - if (cfg->workload != NULL) { + if (wtperf->workload != NULL) { /* * This call overrides an earlier call. Free and * reset everything. */ - free(cfg->workload); - cfg->workload = NULL; - cfg->workload_cnt = 0; - cfg->workers_cnt = 0; + free(wtperf->workload); + wtperf->workload = NULL; + wtperf->workload_cnt = 0; + wtperf->workers_cnt = 0; } /* Allocate the workload array. */ - cfg->workload = dcalloc(WORKLOAD_MAX, sizeof(WORKLOAD)); - cfg->workload_cnt = 0; + wtperf->workload = dcalloc(WORKLOAD_MAX, sizeof(WORKLOAD)); + wtperf->workload_cnt = 0; /* * The thread configuration may be in multiple groups, that is, we have @@ -294,14 +207,14 @@ config_threads(CONFIG *cfg, const char *config, size_t len) goto err; /* Move to the next workload slot. 
*/ - if (cfg->workload_cnt == WORKLOAD_MAX) { + if (wtperf->workload_cnt == WORKLOAD_MAX) { fprintf(stderr, "too many workloads configured, only %d workloads " "supported\n", WORKLOAD_MAX); return (EINVAL); } - workp = &cfg->workload[cfg->workload_cnt++]; + workp = &wtperf->workload[wtperf->workload_cnt++]; while ((ret = scan->next(scan, &k, &v)) == 0) { if (STRING_MATCH("count", k.str, k.len)) { @@ -334,9 +247,9 @@ config_threads(CONFIG *cfg, const char *config, size_t len) if ((workp->truncate = v.val) != 1) goto err; /* There can only be one Truncate thread. */ - if (F_ISSET(cfg, CFG_TRUNCATE)) + if (F_ISSET(wtperf, CFG_TRUNCATE)) goto err; - F_SET(cfg, CFG_TRUNCATE); + F_SET(wtperf, CFG_TRUNCATE); continue; } if (STRING_MATCH("truncate_pct", k.str, k.len)) { @@ -364,13 +277,13 @@ config_threads(CONFIG *cfg, const char *config, size_t len) goto err; /* Special random value */ workp->update_delta = INT64_MAX; - F_SET(cfg, CFG_GROW); + F_SET(wtperf, CFG_GROW); } else { workp->update_delta = v.val; if (v.val > 0) - F_SET(cfg, CFG_GROW); + F_SET(wtperf, CFG_GROW); if (v.val < 0) - F_SET(cfg, CFG_SHRINK); + F_SET(wtperf, CFG_SHRINK); } continue; } @@ -400,7 +313,7 @@ config_threads(CONFIG *cfg, const char *config, size_t len) if (workp->truncate != 0 && (workp->insert > 0 || workp->read > 0 || workp->update > 0)) goto err; - cfg->workers_cnt += (u_int)workp->threads; + wtperf->workers_cnt += (u_int)workp->threads; } ret = group->close(group); @@ -428,32 +341,34 @@ err: if (group != NULL) * value. 
*/ static int -config_opt(CONFIG *cfg, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) +config_opt(WTPERF *wtperf, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) { - CONFIG_OPT *popt; + CONFIG_OPTS *opts; + CONFIG_OPT *desc; char *begin, *newstr, **strp; int ret; - size_t i, newlen, nopt; + size_t i, newlen; void *valueloc; - popt = NULL; - nopt = sizeof(config_opts)/sizeof(config_opts[0]); - for (i = 0; i < nopt; i++) - if (strlen(config_opts[i].name) == k->len && - strncmp(config_opts[i].name, k->str, k->len) == 0) { - popt = &config_opts[i]; + opts = wtperf->opts; + + desc = NULL; + for (i = 0; i < WT_ELEMENTS(config_opts_desc); i++) + if (strlen(config_opts_desc[i].name) == k->len && + strncmp(config_opts_desc[i].name, k->str, k->len) == 0) { + desc = &config_opts_desc[i]; break; } - if (popt == NULL) { + if (desc == NULL) { fprintf(stderr, "wtperf: Error: " "unknown option \'%.*s\'\n", (int)k->len, k->str); fprintf(stderr, "Options:\n"); - for (i = 0; i < nopt; i++) - fprintf(stderr, "\t%s\n", config_opts[i].name); + for (i = 0; i < WT_ELEMENTS(config_opts_desc); i++) + fprintf(stderr, "\t%s\n", config_opts_desc[i].name); return (EINVAL); } - valueloc = ((u_char *)cfg + popt->offset); - switch (popt->type) { + valueloc = ((uint8_t *)opts + desc->offset); + switch (desc->type) { case BOOL_TYPE: if (v->type != WT_CONFIG_ITEM_BOOL) { fprintf(stderr, "wtperf: Error: " @@ -531,7 +446,7 @@ config_opt(CONFIG *cfg, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) */ if (v->type == WT_CONFIG_ITEM_STRUCT && STRING_MATCH("threads", k->str, k->len)) - return (config_threads(cfg, v->str, v->len)); + return (config_threads(wtperf, v->str, v->len)); if (v->type != WT_CONFIG_ITEM_STRING && v->type != WT_CONFIG_ITEM_ID) { @@ -559,7 +474,7 @@ config_opt(CONFIG *cfg, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) * via lines ending in '\'. 
*/ int -config_opt_file(CONFIG *cfg, const char *filename) +config_opt_file(WTPERF *wtperf, const char *filename) { FILE *fp; size_t linelen, optionpos; @@ -659,7 +574,7 @@ config_opt_file(CONFIG *cfg, const char *filename) if (contline) optionpos += linelen; else { - if ((ret = config_opt_line(cfg, option)) != 0) { + if ((ret = config_opt_str(wtperf, option)) != 0) { fprintf(stderr, "wtperf: %s: %d: parse error\n", filename, linenum); break; @@ -684,23 +599,26 @@ config_opt_file(CONFIG *cfg, const char *filename) } /* - * config_opt_line -- + * config_opt_str -- * Parse a single line of config options. Continued lines have already * been joined. */ int -config_opt_line(CONFIG *cfg, const char *optstr) +config_opt_str(WTPERF *wtperf, const char *optstr) { + CONFIG_OPTS *opts; CONFIG_QUEUE_ENTRY *config_line; WT_CONFIG_ITEM k, v; WT_CONFIG_PARSER *scan; size_t len; int ret, t_ret; + opts = wtperf->opts; + len = strlen(optstr); if ((ret = wiredtiger_config_parser_open( NULL, optstr, len, &scan)) != 0) { - lprintf(cfg, ret, 0, "Error in config_scan_begin"); + lprintf(wtperf, ret, 0, "Error in config_scan_begin"); return (ret); } @@ -712,7 +630,7 @@ config_opt_line(CONFIG *cfg, const char *optstr) */ config_line = dcalloc(sizeof(CONFIG_QUEUE_ENTRY), 1); config_line->string = dstrdup(optstr); - TAILQ_INSERT_TAIL(&cfg->config_head, config_line, c); + TAILQ_INSERT_TAIL(&opts->config_head, config_line, q); while (ret == 0) { if ((ret = scan->next(scan, &k, &v)) != 0) { @@ -721,10 +639,10 @@ config_opt_line(CONFIG *cfg, const char *optstr) ret = 0; break; } - ret = config_opt(cfg, &k, &v); + ret = config_opt(wtperf, &k, &v); } if ((t_ret = scan->close(scan)) != 0) { - lprintf(cfg, ret, 0, "Error in config_scan_end"); + lprintf(wtperf, ret, 0, "Error in config_scan_end"); if (ret == 0) ret = t_ret; } @@ -733,19 +651,20 @@ config_opt_line(CONFIG *cfg, const char *optstr) } /* - * config_opt_str -- - * Set a single string config option. 
+ * config_opt_name_value -- + * Set a name/value configuration pair. */ int -config_opt_str(CONFIG *cfg, const char *name, const char *value) +config_opt_name_value(WTPERF *wtperf, const char *name, const char *value) { + size_t len; int ret; char *optstr; - /* name="value" */ - optstr = dmalloc(strlen(name) + strlen(value) + 4); - sprintf(optstr, "%s=\"%s\"", name, value); - ret = config_opt_line(cfg, optstr); + len = strlen(name) + strlen(value) + 4; + optstr = dmalloc(len); + snprintf(optstr, len, "%s=\"%s\"", name, value); + ret = config_opt_str(wtperf, optstr); free(optstr); return (ret); } @@ -755,60 +674,63 @@ config_opt_str(CONFIG *cfg, const char *name, const char *value) * Configuration sanity checks. */ int -config_sanity(CONFIG *cfg) +config_sanity(WTPERF *wtperf) { + CONFIG_OPTS *opts; WORKLOAD *workp; u_int i; + opts = wtperf->opts; + /* Various intervals should be less than the run-time. */ - if (cfg->run_time > 0 && - ((cfg->checkpoint_threads != 0 && - cfg->checkpoint_interval > cfg->run_time) || - cfg->report_interval > cfg->run_time || - cfg->sample_interval > cfg->run_time)) { + if (opts->run_time > 0 && + ((opts->checkpoint_threads != 0 && + opts->checkpoint_interval > opts->run_time) || + opts->report_interval > opts->run_time || + opts->sample_interval > opts->run_time)) { fprintf(stderr, "interval value longer than the run-time\n"); return (EINVAL); } /* The maximum is here to keep file name construction simple. 
*/ - if (cfg->table_count < 1 || cfg->table_count > 99999) { + if (opts->table_count < 1 || opts->table_count > 99999) { fprintf(stderr, "invalid table count, less than 1 or greater than 99999\n"); return (EINVAL); } - if (cfg->database_count < 1 || cfg->database_count > 99) { + if (opts->database_count < 1 || opts->database_count > 99) { fprintf(stderr, "invalid database count, less than 1 or greater than 99\n"); return (EINVAL); } - if (cfg->pareto > 100) { + if (opts->pareto > 100) { fprintf(stderr, "Invalid pareto distribution - should be a percentage\n"); return (EINVAL); } - if (cfg->value_sz_max < cfg->value_sz) { - if (F_ISSET(cfg, CFG_GROW)) { + if (opts->value_sz_max < opts->value_sz) { + if (F_ISSET(wtperf, CFG_GROW)) { fprintf(stderr, "value_sz_max %" PRIu32 " must be greater than or equal to value_sz %" - PRIu32 "\n", cfg->value_sz_max, cfg->value_sz); + PRIu32 "\n", opts->value_sz_max, opts->value_sz); return (EINVAL); } else - cfg->value_sz_max = cfg->value_sz; + opts->value_sz_max = opts->value_sz; } - if (cfg->value_sz_min > cfg->value_sz) { - if (F_ISSET(cfg, CFG_SHRINK)) { + if (opts->value_sz_min > opts->value_sz) { + if (F_ISSET(wtperf, CFG_SHRINK)) { fprintf(stderr, "value_sz_min %" PRIu32 " must be less than or equal to value_sz %" - PRIu32 "\n", cfg->value_sz_min, cfg->value_sz); + PRIu32 "\n", opts->value_sz_min, opts->value_sz); return (EINVAL); } else - cfg->value_sz_min = cfg->value_sz; + opts->value_sz_min = opts->value_sz; } - if (cfg->readonly && cfg->workload != NULL) - for (i = 0, workp = cfg->workload; - i < cfg->workload_cnt; ++i, ++workp) + if (opts->readonly && wtperf->workload != NULL) + for (i = 0, workp = wtperf->workload; + i < wtperf->workload_cnt; ++i, ++workp) if (workp->insert != 0 || workp->update != 0 || workp->truncate != 0) { fprintf(stderr, @@ -824,21 +746,21 @@ config_sanity(CONFIG *cfg) * Consolidate repeated configuration settings so that it only appears * once in the configuration output file. 
*/ -void -config_consolidate(CONFIG *cfg) +static void +config_consolidate(CONFIG_OPTS *opts) { CONFIG_QUEUE_ENTRY *conf_line, *test_line, *tmp; char *string_key; /* - * This loop iterates over the config queue and for entry checks if an - * entry later in the queue has the same key. If a match is found then - * the current queue entry is removed and we continue. + * This loop iterates over the config queue and for each entry checks if + * a later queue entry has the same key. If there's a match, the current + * queue entry is removed and we continue. */ - conf_line = TAILQ_FIRST(&cfg->config_head); + conf_line = TAILQ_FIRST(&opts->config_head); while (conf_line != NULL) { string_key = strchr(conf_line->string, '='); - tmp = test_line = TAILQ_NEXT(conf_line, c); + tmp = test_line = TAILQ_NEXT(conf_line, q); while (test_line != NULL) { /* * The + 1 here forces the '=' sign to be matched @@ -849,89 +771,75 @@ config_consolidate(CONFIG *cfg) if (strncmp(conf_line->string, test_line->string, (size_t)((string_key - conf_line->string) + 1)) == 0) { - TAILQ_REMOVE(&cfg->config_head, conf_line, c); + TAILQ_REMOVE(&opts->config_head, conf_line, q); free(conf_line->string); free(conf_line); break; } - test_line = TAILQ_NEXT(test_line, c); + test_line = TAILQ_NEXT(test_line, q); } conf_line = tmp; } } /* - * config_to_file -- + * config_opt_log -- * Write the final config used in this execution to a file. 
*/ void -config_to_file(CONFIG *cfg) +config_opt_log(CONFIG_OPTS *opts, const char *path) { CONFIG_QUEUE_ENTRY *config_line; FILE *fp; - size_t req_len; - char *path; - fp = NULL; + testutil_checkfmt(((fp = fopen(path, "w")) == NULL), "%s", path); - /* Backup the config */ - req_len = strlen(cfg->home) + strlen("/CONFIG.wtperf") + 1; - path = dcalloc(req_len, 1); - snprintf(path, req_len, "%s/CONFIG.wtperf", cfg->home); - if ((fp = fopen(path, "w")) == NULL) { - lprintf(cfg, errno, 0, "%s", path); - goto err; - } + config_consolidate(opts); - /* Print the config dump */ - fprintf(fp,"# Warning. This config includes " + fprintf(fp,"# Warning: This config includes " "unwritten, implicit configuration defaults.\n" "# Changes to those values may cause differences in behavior.\n"); - config_consolidate(cfg); - config_line = TAILQ_FIRST(&cfg->config_head); - while (config_line != NULL) { + TAILQ_FOREACH(config_line, &opts->config_head, q) fprintf(fp, "%s\n", config_line->string); - config_line = TAILQ_NEXT(config_line, c); - } - -err: free(path); - if (fp != NULL) - (void)fclose(fp); + testutil_check(fclose(fp)); } /* - * config_print -- + * config_opt_print -- * Print out the configuration in verbose mode. */ void -config_print(CONFIG *cfg) +config_opt_print(WTPERF *wtperf) { + CONFIG_OPTS *opts; WORKLOAD *workp; u_int i; + opts = wtperf->opts; + printf("Workload configuration:\n"); - printf("\t" "Home: %s\n", cfg->home); - printf("\t" "Table name: %s\n", cfg->table_name); - printf("\t" "Connection configuration: %s\n", cfg->conn_config); - if (cfg->sess_config != NULL) - printf("\t" "Session configuration: %s\n", cfg->sess_config); + printf("\t" "Home: %s\n", wtperf->home); + printf("\t" "Table name: %s\n", opts->table_name); + printf("\t" "Connection configuration: %s\n", opts->conn_config); + if (opts->sess_config != NULL) + printf("\t" "Session configuration: %s\n", opts->sess_config); printf("\t%s table: %s\n", - cfg->create ? 
"Creating new" : "Using existing", - cfg->table_config); + opts->create ? "Creating new" : "Using existing", + opts->table_config); printf("\t" "Key size: %" PRIu32 ", value size: %" PRIu32 "\n", - cfg->key_sz, cfg->value_sz); - if (cfg->create) + opts->key_sz, opts->value_sz); + if (opts->create) printf("\t" "Populate threads: %" PRIu32 ", inserting %" PRIu32 " rows\n", - cfg->populate_threads, cfg->icount); + opts->populate_threads, opts->icount); printf("\t" "Workload seconds, operations: %" PRIu32 ", %" PRIu32 "\n", - cfg->run_time, cfg->run_ops); - if (cfg->workload != NULL) { + opts->run_time, opts->run_ops); + if (wtperf->workload != NULL) { printf("\t" "Workload configuration(s):\n"); - for (i = 0, workp = cfg->workload; - i < cfg->workload_cnt; ++i, ++workp) + for (i = 0, workp = wtperf->workload; + i < wtperf->workload_cnt; ++i, ++workp) printf("\t\t%" PRId64 " threads (inserts=%" PRId64 ", reads=%" PRId64 ", updates=%" PRId64 ", truncates=% " PRId64 ")\n", @@ -941,11 +849,11 @@ config_print(CONFIG *cfg) } printf("\t" "Checkpoint threads, interval: %" PRIu32 ", %" PRIu32 "\n", - cfg->checkpoint_threads, cfg->checkpoint_interval); - printf("\t" "Reporting interval: %" PRIu32 "\n", cfg->report_interval); - printf("\t" "Sampling interval: %" PRIu32 "\n", cfg->sample_interval); + opts->checkpoint_threads, opts->checkpoint_interval); + printf("\t" "Reporting interval: %" PRIu32 "\n", opts->report_interval); + printf("\t" "Sampling interval: %" PRIu32 "\n", opts->sample_interval); - printf("\t" "Verbosity: %" PRIu32 "\n", cfg->verbose); + printf("\t" "Verbosity: %" PRIu32 "\n", opts->verbose); } /* @@ -975,10 +883,10 @@ pretty_print(const char *p, const char *indent) * config_opt_usage -- * Configuration usage error message. 
*/ -static void +void config_opt_usage(void) { - size_t i, nopt; + size_t i; const char *defaultval, *typestr; pretty_print( @@ -988,11 +896,10 @@ config_opt_usage(void) "String values must be enclosed in \" quotes, boolean values must " "be either true or false.\n", NULL); - nopt = sizeof(config_opts)/sizeof(config_opts[0]); - for (i = 0; i < nopt; i++) { - defaultval = config_opts[i].defaultval; + for (i = 0; i < WT_ELEMENTS(config_opts_desc); i++) { + defaultval = config_opts_desc[i].defaultval; typestr = "string"; - switch (config_opts[i].type) { + switch (config_opts_desc[i].type) { case BOOL_TYPE: typestr = "boolean"; if (strcmp(defaultval, "0") == 0) @@ -1011,28 +918,7 @@ config_opt_usage(void) break; } printf("%s (%s, default=%s)\n", - config_opts[i].name, typestr, defaultval); - pretty_print(config_opts[i].description, "\t"); + config_opts_desc[i].name, typestr, defaultval); + pretty_print(config_opts_desc[i].description, "\t"); } } - -/* - * usage -- - * wtperf usage print, no error. - */ -void -usage(void) -{ - printf("wtperf [-C config] " - "[-H mount] [-h home] [-O file] [-o option] [-T config]\n"); - printf("\t-C <string> additional connection configuration\n"); - printf("\t (added to option conn_config)\n"); - printf("\t-H <mount> configure Helium volume mount point\n"); - printf("\t-h <string> Wired Tiger home must exist, default WT_TEST\n"); - printf("\t-O <file> file contains options as listed below\n"); - printf("\t-o option=val[,option=val,...] 
set options listed below\n"); - printf("\t-T <string> additional table configuration\n"); - printf("\t (added to option table_config)\n"); - printf("\n"); - config_opt_usage(); -} diff --git a/bench/wtperf/config_opt.h b/bench/wtperf/config_opt.h index b7eff8e143f..3f1ab642227 100644 --- a/bench/wtperf/config_opt.h +++ b/bench/wtperf/config_opt.h @@ -37,3 +37,17 @@ typedef struct { CONFIG_OPT_TYPE type; size_t offset; } CONFIG_OPT; + +typedef struct __config_queue_entry { + char *string; + TAILQ_ENTRY(__config_queue_entry) q; +} CONFIG_QUEUE_ENTRY; + +typedef struct { /* Option structure */ +#define OPT_DECLARE_STRUCT +#include "wtperf_opt.i" +#undef OPT_DECLARE_STRUCT + + /* Queue head to save a copy of the config to be output */ + TAILQ_HEAD(__config_qh, __config_queue_entry) config_head; +} CONFIG_OPTS; diff --git a/bench/wtperf/doxy.c b/bench/wtperf/doxy.c deleted file mode 100644 index 26d73168ef2..00000000000 --- a/bench/wtperf/doxy.c +++ /dev/null @@ -1,111 +0,0 @@ -/*- - * Public Domain 2014-2016 MongoDB, Inc. - * Public Domain 2008-2014 WiredTiger, Inc. - * - * This is free and unencumbered software released into the public domain. - * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include <string.h> -#include <stdio.h> - -#include "config_opt.h" - -static const CONFIG_OPT config_opts[] = { -#define OPT_DEFINE_DOXYGEN -#include "wtperf_opt.i" -#undef OPT_DEFINE_DOXYGEN -}; - -/* - * pretty_print -- - * Print out lines of text for a 80 character window. - */ -static void -pretty_print(const char *p, const char *indent) -{ - const char *t; - - for (;; p = t + 1) { - if (strlen(p) <= 70) - break; - for (t = p + 70; t > p && *t != ' '; --t) - ; - if (t == p) /* No spaces? */ - break; - printf("%s%.*s\n", - indent == NULL ? "" : indent, (int)(t - p), p); - } - if (*p != '\0') - printf("%s%s\n", indent == NULL ? "" : indent, p); -} - -/* - * config_doxygen -- - * Output the configuration information for doxgen. 
- */ -static void -config_doxygen(void) -{ - size_t i, nopt; - const char *defaultval, *typestr; - - nopt = sizeof(config_opts)/sizeof(config_opts[0]); - for (i = 0; i < nopt; i++) { - defaultval = config_opts[i].defaultval; - typestr = "string"; - switch (config_opts[i].type) { - case BOOL_TYPE: - typestr = "boolean"; - if (strcmp(defaultval, "0") == 0) - defaultval = "false"; - else - defaultval = "true"; - break; - case CONFIG_STRING_TYPE: - case STRING_TYPE: - break; - case INT_TYPE: - typestr = "int"; - break; - case UINT32_TYPE: - typestr = "unsigned int"; - break; - } - printf("@par %s (%s, default=%s)\n", - config_opts[i].name, typestr, defaultval); - pretty_print(config_opts[i].description, NULL); - } -} - -/* - * config_doxygen -- - * A standalone program to output the configuration options in a doxygen - * format. - */ -int -main() -{ - config_doxygen(); - return (0); -} diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c index 3c079bb560f..13fa55e86f5 100644 --- a/bench/wtperf/idle_table_cycle.c +++ b/bench/wtperf/idle_table_cycle.c @@ -29,28 +29,25 @@ #include "wtperf.h" static int -check_timing(CONFIG *cfg, +check_timing(WTPERF *wtperf, const char *name, struct timespec start, struct timespec *stop) { + CONFIG_OPTS *opts; uint64_t last_interval; - int ret; - if ((ret = __wt_epoch(NULL, stop)) != 0) { - lprintf(cfg, ret, 0, - "Get time failed in cycle_idle_tables."); - cfg->error = ret; - return (ret); - } + opts = wtperf->opts; + + __wt_epoch(NULL, stop); last_interval = (uint64_t)(WT_TIMEDIFF_SEC(*stop, start)); - if (last_interval > cfg->idle_table_cycle) { - lprintf(cfg, ret, 0, + if (last_interval > opts->idle_table_cycle) { + lprintf(wtperf, ETIMEDOUT, 0, "Cycling idle table failed because %s took %" PRIu64 " seconds which is longer than configured acceptable" " maximum of %" PRIu32 ".", - name, last_interval, cfg->idle_table_cycle); - cfg->error = ETIMEDOUT; + name, last_interval, opts->idle_table_cycle); + 
wtperf->error = true; return (ETIMEDOUT); } return (0); @@ -64,64 +61,62 @@ static void * cycle_idle_tables(void *arg) { struct timespec start, stop; - CONFIG *cfg; - WT_SESSION *session; + CONFIG_OPTS *opts; + WTPERF *wtperf; WT_CURSOR *cursor; + WT_SESSION *session; int cycle_count, ret; char uri[512]; - cfg = (CONFIG *)arg; + wtperf = (WTPERF *)arg; + opts = wtperf->opts; cycle_count = 0; - if ((ret = cfg->conn->open_session( - cfg->conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, - "Error opening a session on %s", cfg->home); + if ((ret = wtperf->conn->open_session( + wtperf->conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, + "Error opening a session on %s", wtperf->home); return (NULL); } - for (cycle_count = 0; cfg->idle_cycle_run; ++cycle_count) { - snprintf(uri, 512, "%s_cycle%07d", cfg->uris[0], cycle_count); + for (cycle_count = 0; wtperf->idle_cycle_run; ++cycle_count) { + snprintf(uri, sizeof(uri), + "%s_cycle%07d", wtperf->uris[0], cycle_count); /* Don't busy cycle in this loop. */ __wt_sleep(1, 0); /* Setup a start timer. */ - if ((ret = __wt_epoch(NULL, &start)) != 0) { - lprintf(cfg, ret, 0, - "Get time failed in cycle_idle_tables."); - cfg->error = ret; - return (NULL); - } + __wt_epoch(NULL, &start); /* Create a table. */ if ((ret = session->create( - session, uri, cfg->table_config)) != 0) { + session, uri, opts->table_config)) != 0) { if (ret == EBUSY) continue; - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Table create failed in cycle_idle_tables."); - cfg->error = ret; + wtperf->error = true; return (NULL); } - if (check_timing(cfg, "create", start, &stop) != 0) + if (check_timing(wtperf, "create", start, &stop) != 0) return (NULL); start = stop; /* Open and close cursor. 
*/ if ((ret = session->open_cursor( session, uri, NULL, NULL, &cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Cursor open failed in cycle_idle_tables."); - cfg->error = ret; + wtperf->error = true; return (NULL); } if ((ret = cursor->close(cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Cursor close failed in cycle_idle_tables."); - cfg->error = ret; + wtperf->error = true; return (NULL); } - if (check_timing(cfg, "cursor", start, &stop) != 0) + if (check_timing(wtperf, "cursor", start, &stop) != 0) return (NULL); start = stop; @@ -134,12 +129,12 @@ cycle_idle_tables(void *arg) __wt_sleep(1, 0); if (ret != 0 && ret != EBUSY) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Table drop failed in cycle_idle_tables."); - cfg->error = ret; + wtperf->error = true; return (NULL); } - if (check_timing(cfg, "drop", start, &stop) != 0) + if (check_timing(wtperf, "drop", start, &stop) != 0) return (NULL); } @@ -154,20 +149,23 @@ cycle_idle_tables(void *arg) * initialization isn't necessary. 
*/ int -start_idle_table_cycle(CONFIG *cfg, pthread_t *idle_table_cycle_thread) +start_idle_table_cycle(WTPERF *wtperf, pthread_t *idle_table_cycle_thread) { + CONFIG_OPTS *opts; pthread_t thread_id; int ret; - if (cfg->idle_table_cycle == 0) + opts = wtperf->opts; + + if (opts->idle_table_cycle == 0) return (0); - cfg->idle_cycle_run = true; + wtperf->idle_cycle_run = true; if ((ret = pthread_create( - &thread_id, NULL, cycle_idle_tables, cfg)) != 0) { - lprintf( - cfg, ret, 0, "Error creating idle table cycle thread."); - cfg->idle_cycle_run = false; + &thread_id, NULL, cycle_idle_tables, wtperf)) != 0) { + lprintf(wtperf, + ret, 0, "Error creating idle table cycle thread."); + wtperf->idle_cycle_run = false; return (ret); } *idle_table_cycle_thread = thread_id; @@ -176,17 +174,20 @@ start_idle_table_cycle(CONFIG *cfg, pthread_t *idle_table_cycle_thread) } int -stop_idle_table_cycle(CONFIG *cfg, pthread_t idle_table_cycle_thread) +stop_idle_table_cycle(WTPERF *wtperf, pthread_t idle_table_cycle_thread) { + CONFIG_OPTS *opts; int ret; - if (cfg->idle_table_cycle == 0 || !cfg->idle_cycle_run) + opts = wtperf->opts; + + if (opts->idle_table_cycle == 0 || !wtperf->idle_cycle_run) return (0); - cfg->idle_cycle_run = false; + wtperf->idle_cycle_run = false; if ((ret = pthread_join(idle_table_cycle_thread, NULL)) != 0) { lprintf( - cfg, ret, 0, "Error joining idle table cycle thread."); + wtperf, ret, 0, "Error joining idle table cycle thread."); return (ret); } return (0); diff --git a/bench/wtperf/misc.c b/bench/wtperf/misc.c index 2821216f240..24b3323a49a 100644 --- a/bench/wtperf/misc.c +++ b/bench/wtperf/misc.c @@ -30,31 +30,34 @@ /* Setup the logging output mechanism. 
*/ int -setup_log_file(CONFIG *cfg) +setup_log_file(WTPERF *wtperf) { + CONFIG_OPTS *opts; + size_t len; int ret; char *fname; + opts = wtperf->opts; ret = 0; - if (cfg->verbose < 1) + if (opts->verbose < 1) return (0); - fname = dcalloc(strlen(cfg->monitor_dir) + - strlen(cfg->table_name) + strlen(".stat") + 2, 1); - - sprintf(fname, "%s/%s.stat", cfg->monitor_dir, cfg->table_name); - cfg->logf = fopen(fname, "w"); - if (cfg->logf == NULL) { + len = strlen(wtperf->monitor_dir) + + strlen(opts->table_name) + strlen(".stat") + 2; + fname = dmalloc(len); + snprintf(fname, len, + "%s/%s.stat", wtperf->monitor_dir, opts->table_name); + if ((wtperf->logf = fopen(fname, "w")) == NULL) { ret = errno; fprintf(stderr, "%s: %s\n", fname, strerror(ret)); } free(fname); - if (cfg->logf == NULL) + if (wtperf->logf == NULL) return (ret); /* Use line buffering for the log file. */ - __wt_stream_set_line_buffer(cfg->logf); + __wt_stream_set_line_buffer(wtperf->logf); return (0); } @@ -62,17 +65,20 @@ setup_log_file(CONFIG *cfg) * Log printf - output a log message. */ void -lprintf(const CONFIG *cfg, int err, uint32_t level, const char *fmt, ...) +lprintf(const WTPERF *wtperf, int err, uint32_t level, const char *fmt, ...) { + CONFIG_OPTS *opts; va_list ap; - if (err == 0 && level <= cfg->verbose) { + opts = wtperf->opts; + + if (err == 0 && level <= opts->verbose) { va_start(ap, fmt); - vfprintf(cfg->logf, fmt, ap); + vfprintf(wtperf->logf, fmt, ap); va_end(ap); - fprintf(cfg->logf, "\n"); + fprintf(wtperf->logf, "\n"); - if (level < cfg->verbose) { + if (level < opts->verbose) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); @@ -87,11 +93,11 @@ lprintf(const CONFIG *cfg, int err, uint32_t level, const char *fmt, ...) 
vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, " Error: %s\n", wiredtiger_strerror(err)); - if (cfg->logf != NULL) { + if (wtperf->logf != NULL) { va_start(ap, fmt); - vfprintf(cfg->logf, fmt, ap); + vfprintf(wtperf->logf, fmt, ap); va_end(ap); - fprintf(cfg->logf, " Error: %s\n", wiredtiger_strerror(err)); + fprintf(wtperf->logf, " Error: %s\n", wiredtiger_strerror(err)); } /* Never attempt to continue if we got a panic from WiredTiger. */ diff --git a/bench/wtperf/runners/500m-btree-50r50u.wtperf b/bench/wtperf/runners/500m-btree-50r50u.wtperf index 06745bf7cca..536127f0dd8 100644 --- a/bench/wtperf/runners/500m-btree-50r50u.wtperf +++ b/bench/wtperf/runners/500m-btree-50r50u.wtperf @@ -10,6 +10,9 @@ create=false compression="snappy" sess_config="isolation=snapshot" table_count=2 +# close_conn as false allows this test to close/finish faster, but if running +# as the set, the next test will need to run recovery. +close_conn=false key_sz=40 value_sz=120 max_latency=2000 diff --git a/bench/wtperf/runners/500m-btree-80r20u.wtperf b/bench/wtperf/runners/500m-btree-80r20u.wtperf index 77edbfb4941..d6218c44af0 100644 --- a/bench/wtperf/runners/500m-btree-80r20u.wtperf +++ b/bench/wtperf/runners/500m-btree-80r20u.wtperf @@ -8,6 +8,9 @@ conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" create=false compression="snappy" +# close_conn as false allows this test to close/finish faster, but if running +# as the set, the next test will need to run recovery. 
+close_conn=false sess_config="isolation=snapshot table_count=2 key_sz=40 diff --git a/bench/wtperf/runners/checkpoint-stress.wtperf b/bench/wtperf/runners/checkpoint-stress.wtperf index 0c98a0c2db0..bbd3a3ba5ed 100644 --- a/bench/wtperf/runners/checkpoint-stress.wtperf +++ b/bench/wtperf/runners/checkpoint-stress.wtperf @@ -4,6 +4,7 @@ conn_config="cache_size=16GB,eviction=(threads_max=4),log=(enabled=false)" table_config="leaf_page_max=32k,internal_page_max=16k,allocation_size=4k,split_pct=90,type=file" # Enough data to fill the cache. 150 million 1k records results in two ~11GB # tables +close_conn=false icount=150000000 create=true compression="snappy" diff --git a/bench/wtperf/runners/evict-btree-stress-multi.wtperf b/bench/wtperf/runners/evict-btree-stress-multi.wtperf index 9699b9ae3bb..a5a29f66fa0 100644 --- a/bench/wtperf/runners/evict-btree-stress-multi.wtperf +++ b/bench/wtperf/runners/evict-btree-stress-multi.wtperf @@ -1,6 +1,7 @@ conn_config="cache_size=1G,eviction=(threads_max=4),session_max=2000" table_config="type=file" table_count=100 +close_conn=false icount=100000000 report_interval=5 run_time=600 diff --git a/bench/wtperf/track.c b/bench/wtperf/track.c index b3f4847d9d0..822bdaa4b4a 100644 --- a/bench/wtperf/track.c +++ b/bench/wtperf/track.c @@ -32,16 +32,18 @@ * Return total insert operations for the populate phase. */ uint64_t -sum_pop_ops(CONFIG *cfg) +sum_pop_ops(WTPERF *wtperf) { - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; + WTPERF_THREAD *thread; uint64_t total; u_int i; + opts = wtperf->opts; total = 0; - for (i = 0, thread = cfg->popthreads; - thread != NULL && i < cfg->populate_threads; ++i, ++thread) + for (i = 0, thread = wtperf->popthreads; + thread != NULL && i < opts->populate_threads; ++i, ++thread) total += thread->insert.ops; return (total); } @@ -50,16 +52,18 @@ sum_pop_ops(CONFIG *cfg) * Return total checkpoint operations. 
*/ uint64_t -sum_ckpt_ops(CONFIG *cfg) +sum_ckpt_ops(WTPERF *wtperf) { - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; + WTPERF_THREAD *thread; uint64_t total; u_int i; + opts = wtperf->opts; total = 0; - for (i = 0, thread = cfg->ckptthreads; - thread != NULL && i < cfg->checkpoint_threads; ++i, ++thread) + for (i = 0, thread = wtperf->ckptthreads; + thread != NULL && i < opts->checkpoint_threads; ++i, ++thread) total += thread->ckpt.ops; return (total); } @@ -68,19 +72,22 @@ sum_ckpt_ops(CONFIG *cfg) * Return total operations count for the worker threads. */ static uint64_t -sum_ops(CONFIG *cfg, size_t field_offset) +sum_ops(WTPERF *wtperf, size_t field_offset) { - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; + WTPERF_THREAD *thread; uint64_t total; int64_t i, th_cnt; + opts = wtperf->opts; total = 0; - if (cfg->popthreads == NULL) { - thread = cfg->workers; - th_cnt = cfg->workers_cnt; + + if (wtperf->popthreads == NULL) { + thread = wtperf->workers; + th_cnt = wtperf->workers_cnt; } else { - thread = cfg->popthreads; - th_cnt = cfg->populate_threads; + thread = wtperf->popthreads; + th_cnt = opts->populate_threads; } for (i = 0; thread != NULL && i < th_cnt; ++i, ++thread) total += ((TRACK *)((uint8_t *)thread + field_offset))->ops; @@ -88,24 +95,24 @@ sum_ops(CONFIG *cfg, size_t field_offset) return (total); } uint64_t -sum_insert_ops(CONFIG *cfg) +sum_insert_ops(WTPERF *wtperf) { - return (sum_ops(cfg, offsetof(CONFIG_THREAD, insert))); + return (sum_ops(wtperf, offsetof(WTPERF_THREAD, insert))); } uint64_t -sum_read_ops(CONFIG *cfg) +sum_read_ops(WTPERF *wtperf) { - return (sum_ops(cfg, offsetof(CONFIG_THREAD, read))); + return (sum_ops(wtperf, offsetof(WTPERF_THREAD, read))); } uint64_t -sum_truncate_ops(CONFIG *cfg) +sum_truncate_ops(WTPERF *wtperf) { - return (sum_ops(cfg, offsetof(CONFIG_THREAD, truncate))); + return (sum_ops(wtperf, offsetof(WTPERF_THREAD, truncate))); } uint64_t -sum_update_ops(CONFIG *cfg) +sum_update_ops(WTPERF *wtperf) { - return 
(sum_ops(cfg, offsetof(CONFIG_THREAD, update))); + return (sum_ops(wtperf, offsetof(WTPERF_THREAD, update))); } /* @@ -114,25 +121,27 @@ sum_update_ops(CONFIG *cfg) * particular operation. */ static void -latency_op(CONFIG *cfg, +latency_op(WTPERF *wtperf, size_t field_offset, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) { - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; TRACK *track; + WTPERF_THREAD *thread; uint64_t ops, latency, tmp; int64_t i, th_cnt; uint32_t max, min; + opts = wtperf->opts; ops = latency = 0; max = 0; min = UINT32_MAX; - if (cfg->popthreads == NULL) { - thread = cfg->workers; - th_cnt = cfg->workers_cnt; + if (wtperf->popthreads == NULL) { + thread = wtperf->workers; + th_cnt = wtperf->workers_cnt; } else { - thread = cfg->popthreads; - th_cnt = cfg->populate_threads; + thread = wtperf->popthreads; + th_cnt = opts->populate_threads; } for (i = 0; thread != NULL && i < th_cnt; ++i, ++thread) { track = (TRACK *)((uint8_t *)thread + field_offset); @@ -160,11 +169,11 @@ latency_op(CONFIG *cfg, } } void -latency_read(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) +latency_read(WTPERF *wtperf, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) { static uint32_t last_avg = 0, last_max = 0, last_min = 0; - latency_op(cfg, offsetof(CONFIG_THREAD, read), avgp, minp, maxp); + latency_op(wtperf, offsetof(WTPERF_THREAD, read), avgp, minp, maxp); /* * If nothing happened, graph the average, minimum and maximum as they @@ -181,11 +190,11 @@ latency_read(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) } } void -latency_insert(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) +latency_insert(WTPERF *wtperf, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) { static uint32_t last_avg = 0, last_max = 0, last_min = 0; - latency_op(cfg, offsetof(CONFIG_THREAD, insert), avgp, minp, maxp); + latency_op(wtperf, offsetof(WTPERF_THREAD, insert), avgp, minp, maxp); /* * If nothing happened, graph the average, minimum and maximum as they 
@@ -202,11 +211,11 @@ latency_insert(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) } } void -latency_update(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) +latency_update(WTPERF *wtperf, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) { static uint32_t last_avg = 0, last_max = 0, last_min = 0; - latency_op(cfg, offsetof(CONFIG_THREAD, update), avgp, minp, maxp); + latency_op(wtperf, offsetof(WTPERF_THREAD, update), avgp, minp, maxp); /* * If nothing happened, graph the average, minimum and maximum as they @@ -228,17 +237,17 @@ latency_update(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp) * Sum latency for a set of threads. */ static void -sum_latency(CONFIG *cfg, size_t field_offset, TRACK *total) +sum_latency(WTPERF *wtperf, size_t field_offset, TRACK *total) { - CONFIG_THREAD *thread; + WTPERF_THREAD *thread; TRACK *trk; int64_t i; u_int j; memset(total, 0, sizeof(*total)); - for (i = 0, thread = cfg->workers; - thread != NULL && i < cfg->workers_cnt; ++i, ++thread) { + for (i = 0, thread = wtperf->workers; + thread != NULL && i < wtperf->workers_cnt; ++i, ++thread) { trk = (TRACK *)((uint8_t *)thread + field_offset); for (j = 0; j < ELEMENTS(trk->us); ++j) { @@ -256,32 +265,33 @@ sum_latency(CONFIG *cfg, size_t field_offset, TRACK *total) } } static void -sum_insert_latency(CONFIG *cfg, TRACK *total) +sum_insert_latency(WTPERF *wtperf, TRACK *total) { - sum_latency(cfg, offsetof(CONFIG_THREAD, insert), total); + sum_latency(wtperf, offsetof(WTPERF_THREAD, insert), total); } static void -sum_read_latency(CONFIG *cfg, TRACK *total) +sum_read_latency(WTPERF *wtperf, TRACK *total) { - sum_latency(cfg, offsetof(CONFIG_THREAD, read), total); + sum_latency(wtperf, offsetof(WTPERF_THREAD, read), total); } static void -sum_update_latency(CONFIG *cfg, TRACK *total) +sum_update_latency(WTPERF *wtperf, TRACK *total) { - sum_latency(cfg, offsetof(CONFIG_THREAD, update), total); + sum_latency(wtperf, offsetof(WTPERF_THREAD, update), 
total); } static void -latency_print_single(CONFIG *cfg, TRACK *total, const char *name) +latency_print_single(WTPERF *wtperf, TRACK *total, const char *name) { FILE *fp; u_int i; uint64_t cumops; char path[1024]; - snprintf(path, sizeof(path), "%s/latency.%s", cfg->monitor_dir, name); + snprintf(path, sizeof(path), + "%s/latency.%s", wtperf->monitor_dir, name); if ((fp = fopen(path, "w")) == NULL) { - lprintf(cfg, errno, 0, "%s", path); + lprintf(wtperf, errno, 0, "%s", path); return; } @@ -317,14 +327,14 @@ latency_print_single(CONFIG *cfg, TRACK *total, const char *name) } void -latency_print(CONFIG *cfg) +latency_print(WTPERF *wtperf) { TRACK total; - sum_insert_latency(cfg, &total); - latency_print_single(cfg, &total, "insert"); - sum_read_latency(cfg, &total); - latency_print_single(cfg, &total, "read"); - sum_update_latency(cfg, &total); - latency_print_single(cfg, &total, "update"); + sum_insert_latency(wtperf, &total); + latency_print_single(wtperf, &total, "insert"); + sum_read_latency(wtperf, &total); + latency_print_single(wtperf, &total, "read"); + sum_update_latency(wtperf, &total); + latency_print_single(wtperf, &total, "update"); } diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index bf6b156bb69..8c7f0053388 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -31,78 +31,38 @@ /* Default values. 
*/ #define DEFAULT_HOME "WT_TEST" #define DEFAULT_MONITOR_DIR "WT_TEST" -static const CONFIG default_cfg = { - NULL, /* home */ - NULL, /* monitor dir */ - NULL, /* partial logging */ - NULL, /* reopen config */ - NULL, /* base_uri */ - NULL, /* log_table_uri */ - NULL, /* uris */ - NULL, /* conn */ - NULL, /* logf */ - NULL, /* async */ - NULL, NULL, /* compressor ext, blk */ - NULL, NULL, /* populate, checkpoint threads */ - - NULL, /* worker threads */ - 0, /* worker thread count */ - NULL, /* workloads */ - 0, /* workload count */ - 0, /* use_asyncops */ - 0, /* checkpoint operations */ - 0, /* insert operations */ - 0, /* read operations */ - 0, /* truncate operations */ - 0, /* update operations */ - 0, /* insert key */ - 0, /* log like table key */ - 0, /* checkpoint in progress */ - 0, /* thread error */ - 0, /* notify threads to stop */ - 0, /* in warmup phase */ - false, /* Signal for idle cycle thread */ - 0, /* total seconds running */ - 0, /* flags */ - {NULL, NULL}, /* the truncate queue */ - {NULL, NULL}, /* the config queue */ - -#define OPT_DEFINE_DEFAULT -#include "wtperf_opt.i" -#undef OPT_DEFINE_DEFAULT -}; static const char * const debug_cconfig = ""; static const char * const debug_tconfig = ""; static void *checkpoint_worker(void *); -static int drop_all_tables(CONFIG *); -static int execute_populate(CONFIG *); -static int execute_workload(CONFIG *); -static int find_table_count(CONFIG *); +static int drop_all_tables(WTPERF *); +static int execute_populate(WTPERF *); +static int execute_workload(WTPERF *); +static int find_table_count(WTPERF *); static void *monitor(void *); static void *populate_thread(void *); -static void randomize_value(CONFIG_THREAD *, char *); +static void randomize_value(WTPERF_THREAD *, char *); static void recreate_dir(const char *); -static int start_all_runs(CONFIG *); -static int start_run(CONFIG *); -static int start_threads(CONFIG *, - WORKLOAD *, CONFIG_THREAD *, u_int, void *(*)(void *)); -static int 
stop_threads(CONFIG *, u_int, CONFIG_THREAD *); +static int start_all_runs(WTPERF *); +static int start_run(WTPERF *); +static int start_threads(WTPERF *, + WORKLOAD *, WTPERF_THREAD *, u_int, void *(*)(void *)); +static int stop_threads(WTPERF *, u_int, WTPERF_THREAD *); static void *thread_run_wtperf(void *); -static void update_value_delta(CONFIG_THREAD *); +static void update_value_delta(WTPERF_THREAD *); static void *worker(void *); -static uint64_t wtperf_rand(CONFIG_THREAD *); -static uint64_t wtperf_value_range(CONFIG *); +static uint64_t wtperf_rand(WTPERF_THREAD *); +static uint64_t wtperf_value_range(WTPERF *); -#define INDEX_COL_NAMES ",columns=(key,val)" +#define INDEX_COL_NAMES "columns=(key,val)" /* Retrieve an ID for the next insert operation. */ static inline uint64_t -get_next_incr(CONFIG *cfg) +get_next_incr(WTPERF *wtperf) { - return (__wt_atomic_add64(&cfg->insert_key, 1)); + return (__wt_atomic_add64(&wtperf->insert_key, 1)); } /* @@ -110,11 +70,14 @@ get_next_incr(CONFIG *cfg) * other element in the value buffer. */ static void -randomize_value(CONFIG_THREAD *thread, char *value_buf) +randomize_value(WTPERF_THREAD *thread, char *value_buf) { + CONFIG_OPTS *opts; uint8_t *vb; uint32_t i, max_range, rand_val; + opts = thread->wtperf->opts; + /* * Limit how much of the buffer we validate for length, this means * that only threads that do growing updates will ever make changes to @@ -123,11 +86,11 @@ randomize_value(CONFIG_THREAD *thread, char *value_buf) * in this performance sensitive function. */ if (thread->workload == NULL || thread->workload->update_delta == 0) - max_range = thread->cfg->value_sz; + max_range = opts->value_sz; else if (thread->workload->update_delta > 0) - max_range = thread->cfg->value_sz_max; + max_range = opts->value_sz_max; else - max_range = thread->cfg->value_sz_min; + max_range = opts->value_sz_min; /* * Generate a single random value and re-use it. 
We generally only @@ -157,17 +120,17 @@ randomize_value(CONFIG_THREAD *thread, char *value_buf) * Partition data by key ranges. */ static uint32_t -map_key_to_table(CONFIG *cfg, uint64_t k) +map_key_to_table(CONFIG_OPTS *opts, uint64_t k) { - if (cfg->range_partition) { + if (opts->range_partition) { /* Take care to return a result in [0..table_count-1]. */ - if (k > cfg->icount + cfg->random_range) + if (k > opts->icount + opts->random_range) return (0); return ((uint32_t)((k - 1) / - ((cfg->icount + cfg->random_range + cfg->table_count - 1) / - cfg->table_count))); + ((opts->icount + opts->random_range + + opts->table_count - 1) / opts->table_count))); } else - return ((uint32_t)(k % cfg->table_count)); + return ((uint32_t)(k % opts->table_count)); } /* @@ -176,26 +139,28 @@ map_key_to_table(CONFIG *cfg, uint64_t k) * scratch buffer. */ static inline void -update_value_delta(CONFIG_THREAD *thread) +update_value_delta(WTPERF_THREAD *thread) { - CONFIG *cfg; + CONFIG_OPTS *opts; + WTPERF *wtperf; char * value; int64_t delta, len, new_len; - cfg = thread->cfg; + wtperf = thread->wtperf; + opts = wtperf->opts; value = thread->value_buf; delta = thread->workload->update_delta; len = (int64_t)strlen(value); if (delta == INT64_MAX) delta = __wt_random(&thread->rnd) % - (cfg->value_sz_max - cfg->value_sz); + (opts->value_sz_max - opts->value_sz); /* Ensure we aren't changing across boundaries */ - if (delta > 0 && len + delta > cfg->value_sz_max) - delta = cfg->value_sz_max - len; - else if (delta < 0 && len + delta < cfg->value_sz_min) - delta = cfg->value_sz_min - len; + if (delta > 0 && len + delta > opts->value_sz_max) + delta = opts->value_sz_max - len; + else if (delta < 0 && len + delta < opts->value_sz_min) + delta = opts->value_sz_min - len; /* Bail if there isn't anything to do */ if (delta == 0) @@ -206,7 +171,7 @@ update_value_delta(CONFIG_THREAD *thread) else { /* Extend the value by the configured amount. 
*/ for (new_len = len; - new_len < cfg->value_sz_max && new_len - len < delta; + new_len < opts->value_sz_max && new_len - len < delta; new_len++) value[new_len] = 'a'; } @@ -215,24 +180,24 @@ update_value_delta(CONFIG_THREAD *thread) static int cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags) { - CONFIG *cfg; - CONFIG_THREAD *thread; TRACK *trk; + WTPERF *wtperf; + WTPERF_THREAD *thread; WT_ASYNC_OPTYPE type; - char *value; uint32_t *tables; int t_ret; + char *value; (void)cb; (void)flags; - cfg = NULL; /* -Wconditional-uninitialized */ + wtperf = NULL; /* -Wconditional-uninitialized */ thread = NULL; /* -Wconditional-uninitialized */ type = op->get_type(op); if (type != WT_AOP_COMPACT) { - thread = (CONFIG_THREAD *)op->app_private; - cfg = thread->cfg; + thread = (WTPERF_THREAD *)op->app_private; + wtperf = thread->wtperf; } trk = NULL; @@ -249,7 +214,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags) if (ret == 0 && (t_ret = op->get_value(op, &value)) != 0) { ret = t_ret; - lprintf(cfg, ret, 0, "get_value in read."); + lprintf(wtperf, ret, 0, "get_value in read."); goto err; } break; @@ -259,7 +224,8 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags) case WT_AOP_NONE: case WT_AOP_REMOVE: /* We never expect this type. 
*/ - lprintf(cfg, ret, 0, "No type in op %" PRIu64, op->get_id(op)); + lprintf(wtperf, + ret, 0, "No type in op %" PRIu64, op->get_id(op)); goto err; } @@ -273,15 +239,14 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags) if (type == WT_AOP_COMPACT) return (0); if (ret == 0 || (ret == WT_NOTFOUND && type != WT_AOP_INSERT)) { - if (!cfg->in_warmup) + if (!wtperf->in_warmup) (void)__wt_atomic_add64(&trk->ops, 1); return (0); } err: /* Panic if error */ - lprintf(cfg, ret, 0, "Error in op %" PRIu64, - op->get_id(op)); - cfg->error = cfg->stop = 1; + lprintf(wtperf, ret, 0, "Error in op %" PRIu64, op->get_id(op)); + wtperf->error = wtperf->stop = true; return (1); } @@ -353,8 +318,9 @@ op_name(uint8_t *op) static void * worker_async(void *arg) { - CONFIG *cfg; - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; + WTPERF *wtperf; + WTPERF_THREAD *thread; WT_ASYNC_OP *asyncop; WT_CONNECTION *conn; uint64_t next_val; @@ -362,9 +328,10 @@ worker_async(void *arg) int ret; char *key_buf, *value_buf; - thread = (CONFIG_THREAD *)arg; - cfg = thread->cfg; - conn = cfg->conn; + thread = (WTPERF_THREAD *)arg; + wtperf = thread->wtperf; + opts = wtperf->opts; + conn = wtperf->conn; key_buf = thread->key_buf; value_buf = thread->value_buf; @@ -372,7 +339,7 @@ worker_async(void *arg) op = thread->workload->ops; op_end = op + sizeof(thread->workload->ops); - while (!cfg->stop) { + while (!wtperf->stop) { /* * Generate the next key and setup operation specific * statistics tracking objects. @@ -380,10 +347,10 @@ worker_async(void *arg) switch (*op) { case WORKER_INSERT: case WORKER_INSERT_RMW: - if (cfg->random_range) + if (opts->random_range) next_val = wtperf_rand(thread); else - next_val = cfg->icount + get_next_incr(cfg); + next_val = opts->icount + get_next_incr(wtperf); break; case WORKER_READ: case WORKER_UPDATE: @@ -394,22 +361,22 @@ worker_async(void *arg) * we rely on at least one insert to get a valid item * id. 
*/ - if (wtperf_value_range(cfg) < next_val) + if (wtperf_value_range(wtperf) < next_val) continue; break; default: goto err; /* can't happen */ } - generate_key(cfg, key_buf, next_val); + generate_key(opts, key_buf, next_val); /* * Spread the data out around the multiple databases. * Sleep to allow workers a chance to run and process async ops. * Then retry to get an async op. */ - while ((ret = conn->async_new_op( - conn, cfg->uris[map_key_to_table(cfg, next_val)], + while ((ret = conn->async_new_op(conn, + wtperf->uris[map_key_to_table(wtperf->opts, next_val)], NULL, &cb, &asyncop)) == EBUSY) (void)usleep(10000); if (ret != 0) @@ -424,23 +391,23 @@ worker_async(void *arg) break; goto op_err; case WORKER_INSERT: - if (cfg->random_value) + if (opts->random_value) randomize_value(thread, value_buf); asyncop->set_value(asyncop, value_buf); if ((ret = asyncop->insert(asyncop)) == 0) break; goto op_err; case WORKER_UPDATE: - if (cfg->random_value) + if (opts->random_value) randomize_value(thread, value_buf); asyncop->set_value(asyncop, value_buf); if ((ret = asyncop->update(asyncop)) == 0) break; goto op_err; default: -op_err: lprintf(cfg, ret, 0, +op_err: lprintf(wtperf, ret, 0, "%s failed for: %s, range: %"PRIu64, - op_name(op), key_buf, wtperf_value_range(cfg)); + op_name(op), key_buf, wtperf_value_range(wtperf)); goto err; /* can't happen */ } @@ -454,7 +421,7 @@ op_err: lprintf(cfg, ret, 0, /* Notify our caller we failed and shut the system down. */ if (0) { -err: cfg->error = cfg->stop = 1; +err: wtperf->error = wtperf->stop = true; } return (NULL); } @@ -465,17 +432,19 @@ err: cfg->error = cfg->stop = 1; * search do them. Ensuring the keys we see are always in order. 
*/ static int -do_range_reads(CONFIG *cfg, WT_CURSOR *cursor) +do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) { + CONFIG_OPTS *opts; size_t range; uint64_t next_val, prev_val; char *range_key_buf; char buf[512]; int ret; + opts = wtperf->opts; ret = 0; - if (cfg->read_range == 0) + if (opts->read_range == 0) return (0); memset(&buf[0], 0, 512 * sizeof(char)); @@ -485,7 +454,7 @@ do_range_reads(CONFIG *cfg, WT_CURSOR *cursor) testutil_check(cursor->get_key(cursor, &range_key_buf)); extract_key(range_key_buf, &next_val); - for (range = 0; range < cfg->read_range; ++range) { + for (range = 0; range < opts->read_range; ++range) { prev_val = next_val; ret = cursor->next(cursor); /* We are done if we reach the end. */ @@ -496,7 +465,7 @@ do_range_reads(CONFIG *cfg, WT_CURSOR *cursor) testutil_check(cursor->get_key(cursor, &range_key_buf)); extract_key(range_key_buf, &next_val); if (next_val < prev_val) { - lprintf(cfg, EINVAL, 0, + lprintf(wtperf, EINVAL, 0, "Out of order keys %" PRIu64 " came before %" PRIu64, prev_val, next_val); @@ -510,9 +479,10 @@ static void * worker(void *arg) { struct timespec start, stop; - CONFIG *cfg; - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; TRACK *trk; + WTPERF *wtperf; + WTPERF_THREAD *thread; WT_CONNECTION *conn; WT_CURSOR **cursors, *cursor, *log_table_cursor, *tmp_cursor; WT_SESSION *session; @@ -524,9 +494,10 @@ worker(void *arg) char *value_buf, *key_buf, *value; char buf[512]; - thread = (CONFIG_THREAD *)arg; - cfg = thread->cfg; - conn = cfg->conn; + thread = (WTPERF_THREAD *)arg; + wtperf = thread->wtperf; + opts = wtperf->opts; + conn = wtperf->conn; cursors = NULL; log_table_cursor = NULL; /* -Wconditional-initialized */ ops = 0; @@ -535,42 +506,40 @@ worker(void *arg) trk = NULL; if ((ret = conn->open_session( - conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, "worker: WT_CONNECTION.open_session"); + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "worker: 
WT_CONNECTION.open_session"); goto err; } - cursors = dcalloc(cfg->table_count, sizeof(WT_CURSOR *)); - for (i = 0; i < cfg->table_count_idle; i++) { - snprintf(buf, 512, "%s_idle%05d", cfg->uris[0], (int)i); + cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < opts->table_count_idle; i++) { + snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); if ((ret = session->open_cursor( session, buf, NULL, NULL, &tmp_cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Error opening idle table %s", buf); goto err; } if ((ret = tmp_cursor->close(tmp_cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Error closing idle table %s", buf); goto err; } } - for (i = 0; i < cfg->table_count; i++) { + for (i = 0; i < opts->table_count; i++) { if ((ret = session->open_cursor(session, - cfg->uris[i], NULL, NULL, &cursors[i])) != 0) { - lprintf(cfg, ret, 0, + wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + lprintf(wtperf, ret, 0, "worker: WT_SESSION.open_cursor: %s", - cfg->uris[i]); + wtperf->uris[i]); goto err; } } - if (cfg->log_like_table) { - if ((ret = session->open_cursor(session, - cfg->log_table_uri, NULL, NULL, &log_table_cursor)) != 0) { - lprintf(cfg, ret, 0, - "worker: WT_SESSION.open_cursor: %s", - cfg->log_table_uri); - goto err; - } + if (opts->log_like_table && (ret = session->open_cursor(session, + wtperf->log_table_uri, NULL, NULL, &log_table_cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->log_table_uri); + goto err; } /* Setup the timer for throttling. 
*/ @@ -579,7 +548,7 @@ worker(void *arg) /* Setup for truncate */ if (thread->workload->truncate != 0) - if ((ret = setup_truncate(cfg, thread, session)) != 0) + if ((ret = setup_truncate(wtperf, thread, session)) != 0) goto err; key_buf = thread->key_buf; @@ -588,13 +557,13 @@ worker(void *arg) op = thread->workload->ops; op_end = op + sizeof(thread->workload->ops); - if ((ops_per_txn != 0 || cfg->log_like_table) && + if ((ops_per_txn != 0 || opts->log_like_table) && (ret = session->begin_transaction(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "First transaction begin failed"); + lprintf(wtperf, ret, 0, "First transaction begin failed"); goto err; } - while (!cfg->stop) { + while (!wtperf->stop) { /* * Generate the next key and setup operation specific * statistics tracking objects. @@ -603,10 +572,10 @@ worker(void *arg) case WORKER_INSERT: case WORKER_INSERT_RMW: trk = &thread->insert; - if (cfg->random_range) + if (opts->random_range) next_val = wtperf_rand(thread); else - next_val = cfg->icount + get_next_incr(cfg); + next_val = opts->icount + get_next_incr(wtperf); break; case WORKER_READ: trk = &thread->read; @@ -621,7 +590,7 @@ worker(void *arg) * we rely on at least one insert to get a valid item * id. */ - if (wtperf_value_range(cfg) < next_val) + if (wtperf_value_range(wtperf) < next_val) continue; break; case WORKER_TRUNCATE: @@ -632,24 +601,22 @@ worker(void *arg) goto err; /* can't happen */ } - generate_key(cfg, key_buf, next_val); + generate_key(opts, key_buf, next_val); /* * Spread the data out around the multiple databases. */ - cursor = cursors[map_key_to_table(cfg, next_val)]; + cursor = cursors[map_key_to_table(wtperf->opts, next_val)]; /* * Skip the first time we do an operation, when trk->ops * is 0, to avoid first time latency spikes. 
*/ measure_latency = - cfg->sample_interval != 0 && trk != NULL && - trk->ops != 0 && (trk->ops % cfg->sample_rate == 0); - if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { - lprintf(cfg, ret, 0, "Get time call failed"); - goto err; - } + opts->sample_interval != 0 && trk != NULL && + trk->ops != 0 && (trk->ops % opts->sample_rate == 0); + if (measure_latency) + __wt_epoch(NULL, &start); cursor->set_key(cursor, key_buf); @@ -666,7 +633,7 @@ worker(void *arg) if (ret == 0) { if ((ret = cursor->get_value( cursor, &value)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "get_value in read."); goto err; } @@ -675,7 +642,7 @@ worker(void *arg) * for several operations, confirming that the * next key is in the correct order. */ - ret = do_range_reads(cfg, cursor); + ret = do_range_reads(wtperf, cursor); } if (ret == 0 || ret == WT_NOTFOUND) @@ -690,15 +657,15 @@ worker(void *arg) /* FALLTHROUGH */ case WORKER_INSERT: - if (cfg->random_value) + if (opts->random_value) randomize_value(thread, value_buf); cursor->set_value(cursor, value_buf); if ((ret = cursor->insert(cursor)) == 0) break; goto op_err; case WORKER_TRUNCATE: - if ((ret = run_truncate( - cfg, thread, cursor, session, &truncated)) == 0) { + if ((ret = run_truncate(wtperf, + thread, cursor, session, &truncated)) == 0) { if (truncated) trk = &thread->truncate; else @@ -712,7 +679,7 @@ worker(void *arg) if ((ret = cursor->search(cursor)) == 0) { if ((ret = cursor->get_value( cursor, &value)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "get_value in update."); goto err; } @@ -721,14 +688,14 @@ worker(void *arg) * safe, and be sure to NUL-terminate. 
*/ strncpy(value_buf, - value, cfg->value_sz_max - 1); + value, opts->value_sz_max - 1); if (thread->workload->update_delta != 0) update_value_delta(thread); if (value_buf[0] == 'a') value_buf[0] = 'b'; else value_buf[0] = 'a'; - if (cfg->random_value) + if (opts->random_value) randomize_value(thread, value_buf); cursor->set_value(cursor, value_buf); if ((ret = cursor->update(cursor)) == 0) @@ -756,62 +723,59 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { * order in cases of ordered inserts, as we * aren't retrying here. */ - lprintf(cfg, ret, 1, + lprintf(wtperf, ret, 1, "%s for: %s, range: %"PRIu64, op_name(op), - key_buf, wtperf_value_range(cfg)); + key_buf, wtperf_value_range(wtperf)); if ((ret = session->rollback_transaction( session, NULL)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Failed rollback_transaction"); goto err; } if ((ret = session->begin_transaction( session, NULL)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Worker begin transaction failed"); goto err; } break; } - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "%s failed for: %s, range: %"PRIu64, - op_name(op), key_buf, wtperf_value_range(cfg)); + op_name(op), key_buf, wtperf_value_range(wtperf)); goto err; default: goto err; /* can't happen */ } /* Update the log-like table. */ - if (cfg->log_like_table && + if (opts->log_like_table && (*op != WORKER_READ && *op != WORKER_TRUNCATE)) { - log_id = __wt_atomic_add64(&cfg->log_like_table_key, 1); + log_id = + __wt_atomic_add64(&wtperf->log_like_table_key, 1); log_table_cursor->set_key(log_table_cursor, log_id); log_table_cursor->set_value( log_table_cursor, value_buf); if ((ret = log_table_cursor->insert(log_table_cursor)) != 0) { - lprintf(cfg, ret, 0, "Cursor insert failed"); + lprintf(wtperf, ret, 0, "Cursor insert failed"); goto err; } } /* Release the cursor, if we have multiple tables. 
*/ - if (cfg->table_count > 1 && ret == 0 && + if (opts->table_count > 1 && ret == 0 && *op != WORKER_INSERT && *op != WORKER_INSERT_RMW) { if ((ret = cursor->reset(cursor)) != 0) { - lprintf(cfg, ret, 0, "Cursor reset failed"); + lprintf(wtperf, ret, 0, "Cursor reset failed"); goto err; } } /* Gather statistics */ - if (!cfg->in_warmup) { + if (!wtperf->in_warmup) { if (measure_latency) { - if ((ret = __wt_epoch(NULL, &stop)) != 0) { - lprintf(cfg, ret, 0, - "Get time call failed"); - goto err; - } + __wt_epoch(NULL, &stop); ++trk->latency_ops; usecs = WT_TIMEDIFF_US(stop, start); track_operation(trk, usecs); @@ -824,17 +788,17 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { * Commit the transaction if grouping operations together * or tracking changes in our log table. */ - if ((cfg->log_like_table && ops_per_txn == 0) || + if ((opts->log_like_table && ops_per_txn == 0) || (ops_per_txn != 0 && ops++ % ops_per_txn == 0)) { if ((ret = session->commit_transaction( session, NULL)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Worker transaction commit failed"); goto err; } if ((ret = session->begin_transaction( session, NULL)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Worker begin transaction failed"); goto err; } @@ -854,13 +818,13 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { } if ((ret = session->close(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "Session close in worker failed"); + lprintf(wtperf, ret, 0, "Session close in worker failed"); goto err; } /* Notify our caller we failed and shut the system down. */ if (0) { -err: cfg->error = cfg->stop = 1; +err: wtperf->error = wtperf->stop = true; } free(cursors); @@ -913,14 +877,17 @@ run_mix_schedule_op(WORKLOAD *workp, int op, int64_t op_cnt) * Schedule the mixed-run operations. 
*/ static int -run_mix_schedule(CONFIG *cfg, WORKLOAD *workp) +run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) { + CONFIG_OPTS *opts; int64_t pct; + opts = wtperf->opts; + /* Confirm reads, inserts, truncates and updates cannot all be zero. */ if (workp->insert == 0 && workp->read == 0 && workp->truncate == 0 && workp->update == 0) { - lprintf(cfg, EINVAL, 0, "no operations scheduled"); + lprintf(wtperf, EINVAL, 0, "no operations scheduled"); return (EINVAL); } @@ -931,7 +898,7 @@ run_mix_schedule(CONFIG *cfg, WORKLOAD *workp) if (workp->truncate != 0) { if (workp->insert != 0 || workp->read != 0 || workp->update != 0) { - lprintf(cfg, EINVAL, 0, + lprintf(wtperf, EINVAL, 0, "Can't configure truncate in a mixed workload"); return (EINVAL); } @@ -947,7 +914,7 @@ run_mix_schedule(CONFIG *cfg, WORKLOAD *workp) */ if (workp->insert != 0 && workp->read == 0 && workp->update == 0) { memset(workp->ops, - cfg->insert_rmw ? WORKER_INSERT_RMW : WORKER_INSERT, + opts->insert_rmw ? WORKER_INSERT_RMW : WORKER_INSERT, sizeof(workp->ops)); return (0); } @@ -979,7 +946,7 @@ run_mix_schedule(CONFIG *cfg, WORKLOAD *workp) (workp->insert + workp->read + workp->update); if (pct != 0) run_mix_schedule_op(workp, - cfg->insert_rmw ? WORKER_INSERT_RMW : WORKER_INSERT, pct); + opts->insert_rmw ? 
WORKER_INSERT_RMW : WORKER_INSERT, pct); pct = (workp->update * 100) / (workp->insert + workp->read + workp->update); if (pct != 0) @@ -991,9 +958,10 @@ static void * populate_thread(void *arg) { struct timespec start, stop; - CONFIG *cfg; - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; TRACK *trk; + WTPERF *wtperf; + WTPERF_THREAD *thread; WT_CONNECTION *conn; WT_CURSOR **cursors, *cursor; WT_SESSION *session; @@ -1004,9 +972,10 @@ populate_thread(void *arg) char *value_buf, *key_buf; const char *cursor_config; - thread = (CONFIG_THREAD *)arg; - cfg = thread->cfg; - conn = cfg->conn; + thread = (WTPERF_THREAD *)arg; + wtperf = thread->wtperf; + opts = wtperf->opts; + conn = wtperf->conn; session = NULL; cursors = NULL; ret = stress_checkpoint_due = 0; @@ -1016,37 +985,37 @@ populate_thread(void *arg) value_buf = thread->value_buf; if ((ret = conn->open_session( - conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, "populate: WT_CONNECTION.open_session"); + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "populate: WT_CONNECTION.open_session"); goto err; } /* Do bulk loads if populate is single-threaded. */ cursor_config = - (cfg->populate_threads == 1 && !cfg->index) ? "bulk" : NULL; + (opts->populate_threads == 1 && !opts->index) ? "bulk" : NULL; /* Create the cursors. */ - cursors = dcalloc(cfg->table_count, sizeof(WT_CURSOR *)); - for (i = 0; i < cfg->table_count; i++) { + cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < opts->table_count; i++) { if ((ret = session->open_cursor( - session, cfg->uris[i], NULL, + session, wtperf->uris[i], NULL, cursor_config, &cursors[i])) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "populate: WT_SESSION.open_cursor: %s", - cfg->uris[i]); + wtperf->uris[i]); goto err; } } /* Populate the databases. 
*/ for (intxn = 0, opcount = 0;;) { - op = get_next_incr(cfg); - if (op > cfg->icount) + op = get_next_incr(wtperf); + if (op > opts->icount) break; - if (cfg->populate_ops_per_txn != 0 && !intxn) { + if (opts->populate_ops_per_txn != 0 && !intxn) { if ((ret = session->begin_transaction( - session, cfg->transaction_config)) != 0) { - lprintf(cfg, ret, 0, + session, opts->transaction_config)) != 0) { + lprintf(wtperf, ret, 0, "Failed starting transaction."); goto err; } @@ -1055,31 +1024,29 @@ populate_thread(void *arg) /* * Figure out which table this op belongs to. */ - cursor = cursors[map_key_to_table(cfg, op)]; - generate_key(cfg, key_buf, op); + cursor = cursors[map_key_to_table(wtperf->opts, op)]; + generate_key(opts, key_buf, op); measure_latency = - cfg->sample_interval != 0 && - trk->ops != 0 && (trk->ops % cfg->sample_rate == 0); - if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { - lprintf(cfg, ret, 0, "Get time call failed"); - goto err; - } + opts->sample_interval != 0 && + trk->ops != 0 && (trk->ops % opts->sample_rate == 0); + if (measure_latency) + __wt_epoch(NULL, &start); cursor->set_key(cursor, key_buf); - if (cfg->random_value) + if (opts->random_value) randomize_value(thread, value_buf); cursor->set_value(cursor, value_buf); if ((ret = cursor->insert(cursor)) == WT_ROLLBACK) { - lprintf(cfg, ret, 0, "insert retrying"); + lprintf(wtperf, ret, 0, "insert retrying"); if ((ret = session->rollback_transaction( session, NULL)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Failed rollback_transaction"); goto err; } intxn = 0; continue; } else if (ret != 0) { - lprintf(cfg, ret, 0, "Failed inserting"); + lprintf(wtperf, ret, 0, "Failed inserting"); goto err; } /* @@ -1089,28 +1056,25 @@ populate_thread(void *arg) * of them. 
*/ if (measure_latency) { - if ((ret = __wt_epoch(NULL, &stop)) != 0) { - lprintf(cfg, ret, 0, "Get time call failed"); - goto err; - } + __wt_epoch(NULL, &stop); ++trk->latency_ops; usecs = WT_TIMEDIFF_US(stop, start); track_operation(trk, usecs); } ++thread->insert.ops; /* Same as trk->ops */ - if (cfg->checkpoint_stress_rate != 0 && - (op % cfg->checkpoint_stress_rate) == 0) + if (opts->checkpoint_stress_rate != 0 && + (op % opts->checkpoint_stress_rate) == 0) stress_checkpoint_due = 1; - if (cfg->populate_ops_per_txn != 0) { - if (++opcount < cfg->populate_ops_per_txn) + if (opts->populate_ops_per_txn != 0) { + if (++opcount < opts->populate_ops_per_txn) continue; opcount = 0; if ((ret = session->commit_transaction( session, NULL)) != 0) - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Fail committing, transaction was aborted"); intxn = 0; } @@ -1118,24 +1082,24 @@ populate_thread(void *arg) if (stress_checkpoint_due && intxn == 0) { stress_checkpoint_due = 0; if ((ret = session->checkpoint(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "Checkpoint failed"); + lprintf(wtperf, ret, 0, "Checkpoint failed"); goto err; } } } if (intxn && (ret = session->commit_transaction(session, NULL)) != 0) - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Fail committing, transaction was aborted"); if ((ret = session->close(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "Error closing session in populate"); + lprintf(wtperf, ret, 0, "Error closing session in populate"); goto err; } /* Notify our caller we failed and shut the system down. 
*/ if (0) { -err: cfg->error = cfg->stop = 1; +err: wtperf->error = wtperf->stop = true; } free(cursors); @@ -1146,9 +1110,10 @@ static void * populate_async(void *arg) { struct timespec start, stop; - CONFIG *cfg; - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; TRACK *trk; + WTPERF *wtperf; + WTPERF_THREAD *thread; WT_ASYNC_OP *asyncop; WT_CONNECTION *conn; WT_SESSION *session; @@ -1156,9 +1121,10 @@ populate_async(void *arg) int measure_latency, ret; char *value_buf, *key_buf; - thread = (CONFIG_THREAD *)arg; - cfg = thread->cfg; - conn = cfg->conn; + thread = (WTPERF_THREAD *)arg; + wtperf = thread->wtperf; + opts = wtperf->opts; + conn = wtperf->conn; session = NULL; ret = 0; trk = &thread->insert; @@ -1167,8 +1133,8 @@ populate_async(void *arg) value_buf = thread->value_buf; if ((ret = conn->open_session( - conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, "populate: WT_CONNECTION.open_session"); + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "populate: WT_CONNECTION.open_session"); goto err; } @@ -1178,38 +1144,38 @@ populate_async(void *arg) * the time to process by workers. */ measure_latency = - cfg->sample_interval != 0 && - trk->ops != 0 && (trk->ops % cfg->sample_rate == 0); - if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { - lprintf(cfg, ret, 0, "Get time call failed"); - goto err; - } + opts->sample_interval != 0 && + trk->ops != 0 && (trk->ops % opts->sample_rate == 0); + if (measure_latency) + __wt_epoch(NULL, &start); + /* Populate the databases. */ for (;;) { - op = get_next_incr(cfg); - if (op > cfg->icount) + op = get_next_incr(wtperf); + if (op > opts->icount) break; /* * Allocate an async op for whichever table. 
*/ while ((ret = conn->async_new_op( - conn, cfg->uris[map_key_to_table(cfg, op)], + conn, wtperf->uris[map_key_to_table(wtperf->opts, op)], NULL, &cb, &asyncop)) == EBUSY) (void)usleep(10000); if (ret != 0) goto err; asyncop->app_private = thread; - generate_key(cfg, key_buf, op); + generate_key(opts, key_buf, op); asyncop->set_key(asyncop, key_buf); - if (cfg->random_value) + if (opts->random_value) randomize_value(thread, value_buf); asyncop->set_value(asyncop, value_buf); if ((ret = asyncop->insert(asyncop)) != 0) { - lprintf(cfg, ret, 0, "Failed inserting"); + lprintf(wtperf, ret, 0, "Failed inserting"); goto err; } } + /* * Gather statistics. * We measure the latency of inserting a single key. If there @@ -1221,22 +1187,19 @@ populate_async(void *arg) if (conn->async_flush(conn) != 0) goto err; if (measure_latency) { - if ((ret = __wt_epoch(NULL, &stop)) != 0) { - lprintf(cfg, ret, 0, "Get time call failed"); - goto err; - } + __wt_epoch(NULL, &stop); ++trk->latency_ops; usecs = WT_TIMEDIFF_US(stop, start); track_operation(trk, usecs); } if ((ret = session->close(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "Error closing session in populate"); + lprintf(wtperf, ret, 0, "Error closing session in populate"); goto err; } /* Notify our caller we failed and shut the system down. 
*/ if (0) { -err: cfg->error = cfg->stop = 1; +err: wtperf->error = wtperf->stop = true; } return (NULL); } @@ -1246,8 +1209,9 @@ monitor(void *arg) { struct timespec t; struct tm *tm, _tm; - CONFIG *cfg; + CONFIG_OPTS *opts; FILE *fp; + WTPERF *wtperf; size_t len; uint64_t min_thr, reads, inserts, updates; uint64_t cur_reads, cur_inserts, cur_updates; @@ -1257,24 +1221,26 @@ monitor(void *arg) uint32_t update_avg, update_min, update_max; uint32_t latency_max, level; u_int i; - int msg_err, ret; + int msg_err; const char *str; char buf[64], *path; - cfg = (CONFIG *)arg; - assert(cfg->sample_interval != 0); + wtperf = (WTPERF *)arg; + opts = wtperf->opts; + assert(opts->sample_interval != 0); + fp = NULL; path = NULL; - min_thr = (uint64_t)cfg->min_throughput; - latency_max = (uint32_t)ms_to_us(cfg->max_latency); + min_thr = (uint64_t)opts->min_throughput; + latency_max = (uint32_t)ms_to_us(opts->max_latency); /* Open the logging file. */ - len = strlen(cfg->monitor_dir) + 100; + len = strlen(wtperf->monitor_dir) + 100; path = dmalloc(len); - snprintf(path, len, "%s/monitor", cfg->monitor_dir); + snprintf(path, len, "%s/monitor", wtperf->monitor_dir); if ((fp = fopen(path, "w")) == NULL) { - lprintf(cfg, errno, 0, "%s", path); + lprintf(wtperf, errno, 0, "%s", path); goto err; } /* Set line buffering for monitor file. */ @@ -1297,34 +1263,31 @@ monitor(void *arg) "update maximum latency(uS)" "\n"); last_reads = last_inserts = last_updates = 0; - while (!cfg->stop) { - for (i = 0; i < cfg->sample_interval; i++) { + while (!wtperf->stop) { + for (i = 0; i < opts->sample_interval; i++) { sleep(1); - if (cfg->stop) + if (wtperf->stop) break; } /* If the workers are done, don't bother with a final call. 
*/ - if (cfg->stop) + if (wtperf->stop) break; - if (cfg->in_warmup) + if (wtperf->in_warmup) continue; - if ((ret = __wt_epoch(NULL, &t)) != 0) { - lprintf(cfg, ret, 0, "Get time call failed"); - goto err; - } + __wt_epoch(NULL, &t); tm = localtime_r(&t.tv_sec, &_tm); (void)strftime(buf, sizeof(buf), "%b %d %H:%M:%S", tm); - reads = sum_read_ops(cfg); - inserts = sum_insert_ops(cfg); - updates = sum_update_ops(cfg); - latency_read(cfg, &read_avg, &read_min, &read_max); - latency_insert(cfg, &insert_avg, &insert_min, &insert_max); - latency_update(cfg, &update_avg, &update_min, &update_max); + reads = sum_read_ops(wtperf); + inserts = sum_insert_ops(wtperf); + updates = sum_update_ops(wtperf); + latency_read(wtperf, &read_avg, &read_min, &read_max); + latency_insert(wtperf, &insert_avg, &insert_min, &insert_max); + latency_update(wtperf, &update_avg, &update_min, &update_max); - cur_reads = (reads - last_reads) / cfg->sample_interval; - cur_updates = (updates - last_updates) / cfg->sample_interval; + cur_reads = (reads - last_reads) / opts->sample_interval; + cur_updates = (updates - last_updates) / opts->sample_interval; /* * For now the only item we need to worry about changing is * inserts when we transition from the populate phase to @@ -1334,7 +1297,7 @@ monitor(void *arg) cur_inserts = 0; else cur_inserts = - (inserts - last_inserts) / cfg->sample_interval; + (inserts - last_inserts) / opts->sample_interval; (void)fprintf(fp, "%s,%" PRIu32 @@ -1344,9 +1307,9 @@ monitor(void *arg) ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 "\n", - buf, cfg->totalsec, + buf, wtperf->totalsec, cur_reads, cur_inserts, cur_updates, - cfg->ckpt ? 'Y' : 'N', + wtperf->ckpt ? 
'Y' : 'N', read_avg, read_min, read_max, insert_avg, insert_min, insert_max, update_avg, update_min, update_max); @@ -1354,7 +1317,7 @@ monitor(void *arg) if (latency_max != 0 && (read_max > latency_max || insert_max > latency_max || update_max > latency_max)) { - if (cfg->max_latency_fatal) { + if (opts->max_latency_fatal) { level = 1; msg_err = WT_PANIC; str = "ERROR"; @@ -1363,7 +1326,7 @@ monitor(void *arg) msg_err = 0; str = "WARNING"; } - lprintf(cfg, msg_err, level, + lprintf(wtperf, msg_err, level, "%s: max latency exceeded: threshold %" PRIu32 " read max %" PRIu32 " insert max %" PRIu32 " update max %" PRIu32, str, latency_max, @@ -1373,7 +1336,7 @@ monitor(void *arg) ((cur_reads != 0 && cur_reads < min_thr) || (cur_inserts != 0 && cur_inserts < min_thr) || (cur_updates != 0 && cur_updates < min_thr))) { - if (cfg->min_throughput_fatal) { + if (opts->min_throughput_fatal) { level = 1; msg_err = WT_PANIC; str = "ERROR"; @@ -1382,7 +1345,7 @@ monitor(void *arg) msg_err = 0; str = "WARNING"; } - lprintf(cfg, msg_err, level, + lprintf(wtperf, msg_err, level, "%s: minimum throughput not met: threshold %" PRIu64 " reads %" PRIu64 " inserts %" PRIu64 " updates %" PRIu64, str, min_thr, cur_reads, @@ -1395,7 +1358,7 @@ monitor(void *arg) /* Notify our caller we failed and shut the system down. 
*/ if (0) { -err: cfg->error = cfg->stop = 1; +err: wtperf->error = wtperf->stop = true; } if (fp != NULL) @@ -1408,75 +1371,73 @@ err: cfg->error = cfg->stop = 1; static void * checkpoint_worker(void *arg) { - CONFIG *cfg; - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; + WTPERF *wtperf; + WTPERF_THREAD *thread; WT_CONNECTION *conn; WT_SESSION *session; struct timespec e, s; uint32_t i; int ret; - thread = (CONFIG_THREAD *)arg; - cfg = thread->cfg; - conn = cfg->conn; + thread = (WTPERF_THREAD *)arg; + wtperf = thread->wtperf; + opts = wtperf->opts; + conn = wtperf->conn; session = NULL; if ((ret = conn->open_session( - conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "open_session failed in checkpoint thread."); goto err; } - while (!cfg->stop) { + while (!wtperf->stop) { /* Break the sleep up, so we notice interrupts faster. */ - for (i = 0; i < cfg->checkpoint_interval; i++) { + for (i = 0; i < opts->checkpoint_interval; i++) { sleep(1); - if (cfg->stop) + if (wtperf->stop) break; } /* If the workers are done, don't bother with a final call. */ - if (cfg->stop) + if (wtperf->stop) break; - if ((ret = __wt_epoch(NULL, &s)) != 0) { - lprintf(cfg, ret, 0, "Get time failed in checkpoint."); - goto err; - } - cfg->ckpt = 1; + __wt_epoch(NULL, &s); + + wtperf->ckpt = true; if ((ret = session->checkpoint(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "Checkpoint failed."); + lprintf(wtperf, ret, 0, "Checkpoint failed."); goto err; } - cfg->ckpt = 0; + wtperf->ckpt = false; ++thread->ckpt.ops; - if ((ret = __wt_epoch(NULL, &e)) != 0) { - lprintf(cfg, ret, 0, "Get time failed in checkpoint."); - goto err; - } + __wt_epoch(NULL, &e); } if (session != NULL && ((ret = session->close(session, NULL)) != 0)) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Error closing session in checkpoint worker."); goto err; } /* Notify our caller we failed and shut the system down. 
*/ if (0) { -err: cfg->error = cfg->stop = 1; +err: wtperf->error = wtperf->stop = true; } return (NULL); } static int -execute_populate(CONFIG *cfg) +execute_populate(WTPERF *wtperf) { struct timespec start, stop; - CONFIG_THREAD *popth; + CONFIG_OPTS *opts; + WTPERF_THREAD *popth; WT_ASYNC_OP *asyncop; pthread_t idle_table_cycle_thread; size_t i; @@ -1486,59 +1447,57 @@ execute_populate(CONFIG *cfg) int elapsed, ret; void *(*pfunc)(void *); - lprintf(cfg, 0, 1, + opts = wtperf->opts; + + lprintf(wtperf, 0, 1, "Starting %" PRIu32 " populate thread(s) for %" PRIu32 " items", - cfg->populate_threads, cfg->icount); + opts->populate_threads, opts->icount); /* Start cycling idle tables if configured. */ - if ((ret = start_idle_table_cycle(cfg, &idle_table_cycle_thread)) != 0) + if ((ret = + start_idle_table_cycle(wtperf, &idle_table_cycle_thread)) != 0) return (ret); - cfg->insert_key = 0; + wtperf->insert_key = 0; - cfg->popthreads = dcalloc(cfg->populate_threads, sizeof(CONFIG_THREAD)); - if (cfg->use_asyncops > 0) { - lprintf(cfg, 0, 1, "Starting %" PRIu32 " async thread(s)", - cfg->async_threads); + wtperf->popthreads = + dcalloc(opts->populate_threads, sizeof(WTPERF_THREAD)); + if (wtperf->use_asyncops) { + lprintf(wtperf, 0, 1, "Starting %" PRIu32 " async thread(s)", + opts->async_threads); pfunc = populate_async; } else pfunc = populate_thread; - if ((ret = start_threads(cfg, NULL, - cfg->popthreads, cfg->populate_threads, pfunc)) != 0) + if ((ret = start_threads(wtperf, NULL, + wtperf->popthreads, opts->populate_threads, pfunc)) != 0) return (ret); - if ((ret = __wt_epoch(NULL, &start)) != 0) { - lprintf(cfg, ret, 0, "Get time failed in populate."); - return (ret); - } + __wt_epoch(NULL, &start); for (elapsed = 0, interval = 0, last_ops = 0; - cfg->insert_key < cfg->icount && cfg->error == 0;) { + wtperf->insert_key < opts->icount && !wtperf->error;) { /* * Sleep for 100th of a second, report_interval is in second * granularity, each 100th increment of elapsed 
is a single * increment of interval. */ (void)usleep(10000); - if (cfg->report_interval == 0 || ++elapsed < 100) + if (opts->report_interval == 0 || ++elapsed < 100) continue; elapsed = 0; - if (++interval < cfg->report_interval) + if (++interval < opts->report_interval) continue; interval = 0; - cfg->totalsec += cfg->report_interval; - cfg->insert_ops = sum_pop_ops(cfg); - lprintf(cfg, 0, 1, + wtperf->totalsec += opts->report_interval; + wtperf->insert_ops = sum_pop_ops(wtperf); + lprintf(wtperf, 0, 1, "%" PRIu64 " populate inserts (%" PRIu64 " of %" PRIu32 ") in %" PRIu32 " secs (%" PRIu32 " total secs)", - cfg->insert_ops - last_ops, cfg->insert_ops, - cfg->icount, cfg->report_interval, cfg->totalsec); - last_ops = cfg->insert_ops; - } - if ((ret = __wt_epoch(NULL, &stop)) != 0) { - lprintf(cfg, ret, 0, "Get time failed in populate."); - return (ret); + wtperf->insert_ops - last_ops, wtperf->insert_ops, + opts->icount, opts->report_interval, wtperf->totalsec); + last_ops = wtperf->insert_ops; } + __wt_epoch(NULL, &stop); /* * Move popthreads aside to narrow possible race with the monitor @@ -1546,21 +1505,22 @@ execute_populate(CONFIG *cfg) * NULL when the populate phase is finished, to know that the workload * phase has started. */ - popth = cfg->popthreads; - cfg->popthreads = NULL; - ret = stop_threads(cfg, cfg->populate_threads, popth); + popth = wtperf->popthreads; + wtperf->popthreads = NULL; + ret = stop_threads(wtperf, opts->populate_threads, popth); free(popth); if (ret != 0) return (ret); /* Report if any worker threads didn't finish. 
*/ - if (cfg->error != 0) { - lprintf(cfg, WT_ERROR, 0, + if (wtperf->error) { + lprintf(wtperf, WT_ERROR, 0, "Populate thread(s) exited without finishing."); return (WT_ERROR); } - lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount); + lprintf(wtperf, + 0, 1, "Finished load of %" PRIu32 " items", opts->icount); msecs = WT_TIMEDIFF_MS(stop, start); /* @@ -1572,9 +1532,9 @@ execute_populate(CONFIG *cfg) print_ops_sec = 0; } else { print_secs = (double)msecs / (double)MSEC_PER_SEC; - print_ops_sec = (uint64_t)(cfg->icount / print_secs); + print_ops_sec = (uint64_t)(opts->icount / print_secs); } - lprintf(cfg, 0, 1, + lprintf(wtperf, 0, 1, "Load time: %.2f\n" "load ops/sec: %" PRIu64, print_secs, print_ops_sec); @@ -1583,58 +1543,57 @@ execute_populate(CONFIG *cfg) * set an unlimited timeout because if we close the connection * then any in-progress compact/merge is aborted. */ - if (cfg->compact) { - assert(cfg->async_threads > 0); - lprintf(cfg, 0, 1, "Compact after populate"); - if ((ret = __wt_epoch(NULL, &start)) != 0) { - lprintf(cfg, ret, 0, "Get time failed in populate."); - return (ret); - } - tables = cfg->table_count; - for (i = 0; i < cfg->table_count; i++) { + if (opts->compact) { + assert(opts->async_threads > 0); + lprintf(wtperf, 0, 1, "Compact after populate"); + __wt_epoch(NULL, &start); + tables = opts->table_count; + for (i = 0; i < opts->table_count; i++) { /* * If no ops are available, retry. Any other error, * return. 
*/ - while ((ret = cfg->conn->async_new_op(cfg->conn, - cfg->uris[i], "timeout=0", &cb, &asyncop)) == EBUSY) + while ((ret = wtperf->conn->async_new_op( + wtperf->conn, wtperf->uris[i], + "timeout=0", &cb, &asyncop)) == EBUSY) (void)usleep(10000); if (ret != 0) return (ret); asyncop->app_private = &tables; if ((ret = asyncop->compact(asyncop)) != 0) { - lprintf(cfg, ret, 0, "Async compact failed."); + lprintf(wtperf, + ret, 0, "Async compact failed."); return (ret); } } - if ((ret = cfg->conn->async_flush(cfg->conn)) != 0) { - lprintf(cfg, ret, 0, "Populate async flush failed."); - return (ret); - } - if ((ret = __wt_epoch(NULL, &stop)) != 0) { - lprintf(cfg, ret, 0, "Get time failed in populate."); + if ((ret = wtperf->conn->async_flush(wtperf->conn)) != 0) { + lprintf(wtperf, ret, 0, "Populate async flush failed."); return (ret); } - lprintf(cfg, 0, 1, + __wt_epoch(NULL, &stop); + lprintf(wtperf, 0, 1, "Compact completed in %" PRIu64 " seconds", (uint64_t)(WT_TIMEDIFF_SEC(stop, start))); assert(tables == 0); } /* Stop cycling idle tables. */ - if ((ret = stop_idle_table_cycle(cfg, idle_table_cycle_thread)) != 0) + if ((ret = stop_idle_table_cycle(wtperf, idle_table_cycle_thread)) != 0) return (ret); return (0); } static int -close_reopen(CONFIG *cfg) +close_reopen(WTPERF *wtperf) { + CONFIG_OPTS *opts; int ret; - if (!cfg->readonly && !cfg->reopen_connection) + opts = wtperf->opts; + + if (!opts->readonly && !opts->reopen_connection) return (0); /* * Reopen the connection. We do this so that the workload phase always @@ -1642,16 +1601,16 @@ close_reopen(CONFIG *cfg) * be identified. This is particularly important for LSM, where the * merge algorithm is more aggressive for read-only trees. */ - /* cfg->conn is released no matter the return value from close(). */ - ret = cfg->conn->close(cfg->conn, NULL); - cfg->conn = NULL; + /* wtperf->conn is released no matter the return value from close(). 
*/ + ret = wtperf->conn->close(wtperf->conn, NULL); + wtperf->conn = NULL; if (ret != 0) { - lprintf(cfg, ret, 0, "Closing the connection failed"); + lprintf(wtperf, ret, 0, "Closing the connection failed"); return (ret); } if ((ret = wiredtiger_open( - cfg->home, NULL, cfg->reopen_config, &cfg->conn)) != 0) { - lprintf(cfg, ret, 0, "Re-opening the connection failed"); + wtperf->home, NULL, wtperf->reopen_config, &wtperf->conn)) != 0) { + lprintf(wtperf, ret, 0, "Re-opening the connection failed"); return (ret); } /* @@ -1660,10 +1619,10 @@ close_reopen(CONFIG *cfg) * threads looking for work that will never arrive don't affect * performance. */ - if (cfg->compact && cfg->use_asyncops == 0) { - if ((ret = cfg->conn->reconfigure( - cfg->conn, "async=(enabled=false)")) != 0) { - lprintf(cfg, ret, 0, "Reconfigure async off failed"); + if (opts->compact && !wtperf->use_asyncops) { + if ((ret = wtperf->conn->reconfigure( + wtperf->conn, "async=(enabled=false)")) != 0) { + lprintf(wtperf, ret, 0, "Reconfigure async off failed"); return (ret); } } @@ -1671,10 +1630,11 @@ close_reopen(CONFIG *cfg) } static int -execute_workload(CONFIG *cfg) +execute_workload(WTPERF *wtperf) { - CONFIG_THREAD *threads; + CONFIG_OPTS *opts; WORKLOAD *workp; + WTPERF_THREAD *threads; WT_CONNECTION *conn; WT_SESSION **sessions; pthread_t idle_table_cycle_thread; @@ -1685,9 +1645,11 @@ execute_workload(CONFIG *cfg) int ret, t_ret; void *(*pfunc)(void *); - cfg->insert_key = 0; - cfg->insert_ops = cfg->read_ops = cfg->truncate_ops = 0; - cfg->update_ops = 0; + opts = wtperf->opts; + + wtperf->insert_key = 0; + wtperf->insert_ops = wtperf->read_ops = wtperf->truncate_ops = 0; + wtperf->update_ops = 0; last_ckpts = last_inserts = last_reads = last_truncates = 0; last_updates = 0; @@ -1696,38 +1658,40 @@ execute_workload(CONFIG *cfg) sessions = NULL; /* Start cycling idle tables. 
*/ - if ((ret = start_idle_table_cycle(cfg, &idle_table_cycle_thread)) != 0) + if ((ret = + start_idle_table_cycle(wtperf, &idle_table_cycle_thread)) != 0) return (ret); - if (cfg->warmup != 0) - cfg->in_warmup = 1; + if (opts->warmup != 0) + wtperf->in_warmup = true; /* Allocate memory for the worker threads. */ - cfg->workers = dcalloc((size_t)cfg->workers_cnt, sizeof(CONFIG_THREAD)); + wtperf->workers = + dcalloc((size_t)wtperf->workers_cnt, sizeof(WTPERF_THREAD)); - if (cfg->use_asyncops > 0) { - lprintf(cfg, 0, 1, "Starting %" PRIu32 " async thread(s)", - cfg->async_threads); + if (wtperf->use_asyncops) { + lprintf(wtperf, 0, 1, "Starting %" PRIu32 " async thread(s)", + opts->async_threads); pfunc = worker_async; } else pfunc = worker; - if (cfg->session_count_idle != 0) { - sessions = dcalloc((size_t)cfg->session_count_idle, + if (opts->session_count_idle != 0) { + sessions = dcalloc((size_t)opts->session_count_idle, sizeof(WT_SESSION *)); - conn = cfg->conn; - for (i = 0; i < cfg->session_count_idle; ++i) - if ((ret = conn->open_session( - conn, NULL, cfg->sess_config, &sessions[i])) != 0) { - lprintf(cfg, ret, 0, + conn = wtperf->conn; + for (i = 0; i < opts->session_count_idle; ++i) + if ((ret = conn->open_session(conn, + NULL, opts->sess_config, &sessions[i])) != 0) { + lprintf(wtperf, ret, 0, "execute_workload: idle open_session"); goto err; } } /* Start each workload. */ - for (threads = cfg->workers, i = 0, - workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) { - lprintf(cfg, 0, 1, + for (threads = wtperf->workers, i = 0, + workp = wtperf->workload; i < wtperf->workload_cnt; ++i, ++workp) { + lprintf(wtperf, 0, 1, "Starting workload #%u: %" PRId64 " threads, inserts=%" PRId64 ", reads=%" PRId64 ", updates=%" PRId64 ", truncate=%" PRId64 ", throttle=%" PRId64, @@ -1736,25 +1700,26 @@ execute_workload(CONFIG *cfg) workp->throttle); /* Figure out the workload's schedule. 
*/ - if ((ret = run_mix_schedule(cfg, workp)) != 0) + if ((ret = run_mix_schedule(wtperf, workp)) != 0) goto err; /* Start the workload's threads. */ if ((ret = start_threads( - cfg, workp, threads, (u_int)workp->threads, pfunc)) != 0) + wtperf, workp, threads, (u_int)workp->threads, pfunc)) != 0) goto err; threads += workp->threads; } - if (cfg->warmup != 0) { - lprintf(cfg, 0, 1, - "Waiting for warmup duration of %" PRIu32, cfg->warmup); - sleep(cfg->warmup); - cfg->in_warmup = 0; + if (opts->warmup != 0) { + lprintf(wtperf, 0, 1, + "Waiting for warmup duration of %" PRIu32, opts->warmup); + sleep(opts->warmup); + wtperf->in_warmup = false; } - for (interval = cfg->report_interval, run_time = cfg->run_time, - run_ops = cfg->run_ops; cfg->error == 0;) { + for (interval = opts->report_interval, + run_time = opts->run_time, run_ops = opts->run_ops; + !wtperf->error;) { /* * Sleep for one second at a time. * If we are tracking run time, check to see if we're done, and @@ -1769,59 +1734,60 @@ execute_workload(CONFIG *cfg) } /* Sum the operations we've done. */ - cfg->ckpt_ops = sum_ckpt_ops(cfg); - cfg->insert_ops = sum_insert_ops(cfg); - cfg->read_ops = sum_read_ops(cfg); - cfg->update_ops = sum_update_ops(cfg); - cfg->truncate_ops = sum_truncate_ops(cfg); + wtperf->ckpt_ops = sum_ckpt_ops(wtperf); + wtperf->insert_ops = sum_insert_ops(wtperf); + wtperf->read_ops = sum_read_ops(wtperf); + wtperf->update_ops = sum_update_ops(wtperf); + wtperf->truncate_ops = sum_truncate_ops(wtperf); /* If we're checking total operations, see if we're done. */ if (run_ops != 0 && run_ops <= - cfg->insert_ops + cfg->read_ops + cfg->update_ops) + wtperf->insert_ops + wtperf->read_ops + wtperf->update_ops) break; /* If writing out throughput information, see if it's time. 
*/ if (interval == 0 || --interval > 0) continue; - interval = cfg->report_interval; - cfg->totalsec += cfg->report_interval; + interval = opts->report_interval; + wtperf->totalsec += opts->report_interval; - lprintf(cfg, 0, 1, + lprintf(wtperf, 0, 1, "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64 " updates, %" PRIu64 " truncates, %" PRIu64 " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)", - cfg->read_ops - last_reads, - cfg->insert_ops - last_inserts, - cfg->update_ops - last_updates, - cfg->truncate_ops - last_truncates, - cfg->ckpt_ops - last_ckpts, - cfg->report_interval, cfg->totalsec); - last_reads = cfg->read_ops; - last_inserts = cfg->insert_ops; - last_updates = cfg->update_ops; - last_truncates = cfg->truncate_ops; - last_ckpts = cfg->ckpt_ops; + wtperf->read_ops - last_reads, + wtperf->insert_ops - last_inserts, + wtperf->update_ops - last_updates, + wtperf->truncate_ops - last_truncates, + wtperf->ckpt_ops - last_ckpts, + opts->report_interval, wtperf->totalsec); + last_reads = wtperf->read_ops; + last_inserts = wtperf->insert_ops; + last_updates = wtperf->update_ops; + last_truncates = wtperf->truncate_ops; + last_ckpts = wtperf->ckpt_ops; } /* Notify the worker threads they are done. */ -err: cfg->stop = 1; +err: wtperf->stop = true; /* Stop cycling idle tables. 
*/ - if ((ret = stop_idle_table_cycle(cfg, idle_table_cycle_thread)) != 0) + if ((ret = stop_idle_table_cycle(wtperf, idle_table_cycle_thread)) != 0) return (ret); - if ((t_ret = stop_threads( - cfg, (u_int)cfg->workers_cnt, cfg->workers)) != 0 && ret == 0) + if ((t_ret = stop_threads(wtperf, + (u_int)wtperf->workers_cnt, wtperf->workers)) != 0 && ret == 0) ret = t_ret; /* Drop tables if configured to and this isn't an error path */ - if (ret == 0 && cfg->drop_tables && (ret = drop_all_tables(cfg)) != 0) - lprintf(cfg, ret, 0, "Drop tables failed."); + if (ret == 0 && + opts->drop_tables && (ret = drop_all_tables(wtperf)) != 0) + lprintf(wtperf, ret, 0, "Drop tables failed."); free(sessions); /* Report if any worker threads didn't finish. */ - if (cfg->error != 0) { - lprintf(cfg, WT_ERROR, 0, + if (wtperf->error) { + lprintf(wtperf, WT_ERROR, 0, "Worker thread(s) exited without finishing."); if (ret == 0) ret = WT_ERROR; @@ -1834,8 +1800,9 @@ err: cfg->stop = 1; * existing table. */ static int -find_table_count(CONFIG *cfg) +find_table_count(WTPERF *wtperf) { + CONFIG_OPTS *opts; WT_CONNECTION *conn; WT_CURSOR *cursor; WT_SESSION *session; @@ -1843,29 +1810,30 @@ find_table_count(CONFIG *cfg) int ret, t_ret; char *key; - conn = cfg->conn; + opts = wtperf->opts; + conn = wtperf->conn; max_icount = 0; if ((ret = conn->open_session( - conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "find_table_count: open_session failed"); goto out; } - for (i = 0; i < cfg->table_count; i++) { - if ((ret = session->open_cursor(session, cfg->uris[i], + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, wtperf->uris[i], NULL, NULL, &cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "find_table_count: open_cursor failed"); goto err; } if ((ret = cursor->prev(cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, 
"find_table_count: cursor prev failed"); goto err; } if ((ret = cursor->get_key(cursor, &key)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "find_table_count: cursor get_key failed"); goto err; } @@ -1874,7 +1842,7 @@ find_table_count(CONFIG *cfg) max_icount = table_icount; if ((ret = cursor->close(cursor)) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "find_table_count: cursor close failed"); goto err; } @@ -1882,91 +1850,99 @@ find_table_count(CONFIG *cfg) err: if ((t_ret = session->close(session, NULL)) != 0) { if (ret == 0) ret = t_ret; - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "find_table_count: session close failed"); } - cfg->icount = max_icount; + opts->icount = max_icount; out: return (ret); } /* - * Populate the uri array if more than one table is being used. + * Populate the uri array. */ static void -create_uris(CONFIG *cfg) +create_uris(WTPERF *wtperf) { - size_t base_uri_len; + CONFIG_OPTS *opts; + size_t len; uint32_t i; - char *uri; - base_uri_len = strlen(cfg->base_uri); - cfg->uris = dcalloc(cfg->table_count, sizeof(char *)); - for (i = 0; i < cfg->table_count; i++) { - uri = cfg->uris[i] = dcalloc(base_uri_len + 6, 1); - /* - * If there is only one table, just use base name. - */ - if (cfg->table_count == 1) - memcpy(uri, cfg->base_uri, base_uri_len); + opts = wtperf->opts; + + wtperf->uris = dcalloc(opts->table_count, sizeof(char *)); + len = strlen("table:") + strlen(opts->table_name) + 20; + for (i = 0; i < opts->table_count; i++) { + /* If there is only one table, just use the base name. */ + wtperf->uris[i] = dmalloc(len); + if (opts->table_count == 1) + snprintf(wtperf->uris[i], + len, "table:%s", opts->table_name); else - sprintf(uri, "%s%05d", cfg->base_uri, i); + snprintf(wtperf->uris[i], + len, "table:%s%05d", opts->table_name, i); } /* Create the log-like-table URI. 
*/ - cfg->log_table_uri = dcalloc(base_uri_len + 11, 1); - sprintf(cfg->log_table_uri, "%s_log_table", cfg->base_uri); + len = strlen("table:") + + strlen(opts->table_name) + strlen("_log_table") + 1; + wtperf->log_table_uri = dmalloc(len); + snprintf( + wtperf->log_table_uri, len, "table:%s_log_table", opts->table_name); } static int -create_tables(CONFIG *cfg) +create_tables(WTPERF *wtperf) { + CONFIG_OPTS *opts; WT_SESSION *session; size_t i; int ret; char buf[512]; - if ((ret = cfg->conn->open_session( - cfg->conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, - "Error opening a session on %s", cfg->home); + opts = wtperf->opts; + + if ((ret = wtperf->conn->open_session( + wtperf->conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, + "Error opening a session on %s", wtperf->home); return (ret); } - for (i = 0; i < cfg->table_count_idle; i++) { - snprintf(buf, 512, "%s_idle%05d", cfg->uris[0], (int)i); + for (i = 0; i < opts->table_count_idle; i++) { + snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); if ((ret = session->create( - session, buf, cfg->table_config)) != 0) { - lprintf(cfg, ret, 0, + session, buf, opts->table_config)) != 0) { + lprintf(wtperf, ret, 0, "Error creating idle table %s", buf); return (ret); } } - if (cfg->log_like_table && (ret = session->create(session, - cfg->log_table_uri, "key_format=Q,value_format=S")) != 0) { - lprintf(cfg, ret, 0, "Error creating log table %s", buf); + if (opts->log_like_table && (ret = session->create(session, + wtperf->log_table_uri, "key_format=Q,value_format=S")) != 0) { + lprintf(wtperf, ret, 0, "Error creating log table %s", buf); return (ret); } - for (i = 0; i < cfg->table_count; i++) { - if (cfg->log_partial && i > 0) { + for (i = 0; i < opts->table_count; i++) { + if (opts->log_partial && i > 0) { if (((ret = session->create(session, - cfg->uris[i], cfg->partial_config)) != 0)) { - lprintf(cfg, ret, 0, - "Error creating table %s", cfg->uris[i]); + 
wtperf->uris[i], wtperf->partial_config)) != 0)) { + lprintf(wtperf, ret, 0, + "Error creating table %s", wtperf->uris[i]); return (ret); } } else if ((ret = session->create( - session, cfg->uris[i], cfg->table_config)) != 0) { - lprintf(cfg, ret, 0, - "Error creating table %s", cfg->uris[i]); + session, wtperf->uris[i], opts->table_config)) != 0) { + lprintf(wtperf, ret, 0, + "Error creating table %s", wtperf->uris[i]); return (ret); } - if (cfg->index) { + if (opts->index) { snprintf(buf, 512, "index:%s:val_idx", - cfg->uris[i] + strlen("table:")); + wtperf->uris[i] + strlen("table:")); if ((ret = session->create( session, buf, "columns=(val)")) != 0) { - lprintf(cfg, ret, 0, + lprintf(wtperf, ret, 0, "Error creating index %s", buf); return (ret); } @@ -1974,76 +1950,213 @@ create_tables(CONFIG *cfg) } if ((ret = session->close(session, NULL)) != 0) { - lprintf(cfg, ret, 0, "Error closing session"); + lprintf(wtperf, ret, 0, "Error closing session"); return (ret); } return (0); } +/* + * wtperf_copy -- + * Create a new WTPERF structure as a duplicate of a previous one. + */ +static void +wtperf_copy(const WTPERF *src, WTPERF **retp) +{ + CONFIG_OPTS *opts; + WTPERF *dest; + size_t i; + + opts = src->opts; + + dest = dcalloc(1, sizeof(WTPERF)); + + /* + * Don't copy the home and monitor directories, they are filled in by + * our caller, explicitly. 
+ */ + + if (src->partial_config != NULL) + dest->partial_config = dstrdup(src->partial_config); + if (src->reopen_config != NULL) + dest->reopen_config = dstrdup(src->reopen_config); + + if (src->uris != NULL) { + dest->uris = dcalloc(opts->table_count, sizeof(char *)); + for (i = 0; i < opts->table_count; i++) + dest->uris[i] = dstrdup(src->uris[i]); + } + + if (src->async_config != NULL) + dest->async_config = dstrdup(src->async_config); + + dest->ckptthreads = NULL; + dest->popthreads = NULL; + + dest->workers = NULL; + dest->workers_cnt = src->workers_cnt; + if (src->workload_cnt != 0) { + dest->workload_cnt = src->workload_cnt; + dest->workload = dcalloc(src->workload_cnt, sizeof(WORKLOAD)); + memcpy(dest->workload, + src->workload, src->workload_cnt * sizeof(WORKLOAD)); + } + + TAILQ_INIT(&dest->stone_head); + + dest->opts = src->opts; + + *retp = dest; +} + +/* + * wtperf_free -- + * Free any storage allocated in the WTPERF structure. + */ +static void +wtperf_free(WTPERF *wtperf) +{ + CONFIG_OPTS *opts; + size_t i; + + opts = wtperf->opts; + + free(wtperf->home); + free(wtperf->monitor_dir); + free(wtperf->partial_config); + free(wtperf->reopen_config); + free(wtperf->log_table_uri); + + if (wtperf->uris != NULL) { + for (i = 0; i < opts->table_count; i++) + free(wtperf->uris[i]); + free(wtperf->uris); + } + + free(wtperf->async_config); + + free(wtperf->ckptthreads); + free(wtperf->popthreads); + + free(wtperf->workers); + free(wtperf->workload); + + cleanup_truncate_config(wtperf); +} + +/* + * config_compress -- + * Parse the compression configuration. 
+ */ +static int +config_compress(WTPERF *wtperf) +{ + CONFIG_OPTS *opts; + int ret; + const char *s; + + opts = wtperf->opts; + ret = 0; + + s = opts->compression; + if (strcmp(s, "none") == 0) { + wtperf->compress_ext = NULL; + wtperf->compress_table = NULL; + } else if (strcmp(s, "lz4") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_LZ4 + wtperf->compress_ext = LZ4_EXT; +#endif + wtperf->compress_table = LZ4_BLK; + } else if (strcmp(s, "snappy") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_SNAPPY + wtperf->compress_ext = SNAPPY_EXT; +#endif + wtperf->compress_table = SNAPPY_BLK; + } else if (strcmp(s, "zlib") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_ZLIB + wtperf->compress_ext = ZLIB_EXT; +#endif + wtperf->compress_table = ZLIB_BLK; + } else if (strcmp(s, "zstd") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_ZSTD + wtperf->compress_ext = ZSTD_EXT; +#endif + wtperf->compress_table = ZSTD_BLK; + } else { + fprintf(stderr, + "invalid compression configuration: %s\n", s); + ret = EINVAL; + } + return (ret); + +} + static int -start_all_runs(CONFIG *cfg) +start_all_runs(WTPERF *wtperf) { - CONFIG *next_cfg, **configs; + CONFIG_OPTS *opts; + WTPERF *next_wtperf, **wtperfs; pthread_t *threads; - size_t home_len, i; + size_t i, len; int ret, t_ret; - char *new_home; + opts = wtperf->opts; + wtperfs = NULL; ret = 0; - configs = NULL; - if (cfg->database_count == 1) - return (start_run(cfg)); + if (opts->database_count == 1) + return (start_run(wtperf)); - /* Allocate an array to hold our config struct copies. */ - configs = dcalloc(cfg->database_count, sizeof(CONFIG *)); + /* Allocate an array to hold our WTPERF copies. */ + wtperfs = dcalloc(opts->database_count, sizeof(WTPERF *)); /* Allocate an array to hold our thread IDs. 
*/ - threads = dcalloc(cfg->database_count, sizeof(pthread_t)); - - home_len = strlen(cfg->home); - for (i = 0; i < cfg->database_count; i++) { - next_cfg = dcalloc(1, sizeof(CONFIG)); - configs[i] = next_cfg; - config_copy(next_cfg, cfg); - - /* Setup a unique home directory for each database. */ - new_home = dmalloc(home_len + 5); - snprintf(new_home, home_len + 5, "%s/D%02d", cfg->home, (int)i); - free(next_cfg->home); - next_cfg->home = new_home; - - /* If the monitor dir is default, update it too. */ - if (strcmp(cfg->monitor_dir, cfg->home) == 0) { - free(next_cfg->monitor_dir); - next_cfg->monitor_dir = dstrdup(new_home); - } + threads = dcalloc(opts->database_count, sizeof(pthread_t)); - /* If creating the sub-database, recreate its home */ - if (cfg->create != 0) - recreate_dir(next_cfg->home); + for (i = 0; i < opts->database_count; i++) { + wtperf_copy(wtperf, &next_wtperf); + wtperfs[i] = next_wtperf; + + /* + * Set up unique home/monitor directories for each database. + * Re-create the directories if creating the databases. + */ + len = strlen(wtperf->home) + 5; + next_wtperf->home = dmalloc(len); + snprintf( + next_wtperf->home, len, "%s/D%02d", wtperf->home, (int)i); + if (opts->create != 0) + recreate_dir(next_wtperf->home); + + len = strlen(wtperf->monitor_dir) + 5; + next_wtperf->monitor_dir = dmalloc(len); + snprintf(next_wtperf->monitor_dir, + len, "%s/D%02d", wtperf->monitor_dir, (int)i); + if (opts->create != 0 && + strcmp(next_wtperf->home, next_wtperf->monitor_dir) != 0) + recreate_dir(next_wtperf->monitor_dir); if ((ret = pthread_create( - &threads[i], NULL, thread_run_wtperf, next_cfg)) != 0) { - lprintf(cfg, ret, 0, "Error creating thread"); + &threads[i], NULL, thread_run_wtperf, next_wtperf)) != 0) { + lprintf(wtperf, ret, 0, "Error creating thread"); goto err; } } /* Wait for threads to finish. 
*/ - for (i = 0; i < cfg->database_count; i++) + for (i = 0; i < opts->database_count; i++) if ((t_ret = pthread_join(threads[i], NULL)) != 0) { - lprintf(cfg, ret, 0, "Error joining thread"); + lprintf(wtperf, ret, 0, "Error joining thread"); if (ret == 0) ret = t_ret; } -err: for (i = 0; i < cfg->database_count && configs[i] != NULL; i++) { - config_free(configs[i]); - free(configs[i]); +err: for (i = 0; i < opts->database_count && wtperfs[i] != NULL; i++) { + wtperf_free(wtperfs[i]); + free(wtperfs[i]); } - free(configs); + free(wtperfs); free(threads); return (ret); @@ -2053,120 +2166,124 @@ err: for (i = 0; i < cfg->database_count && configs[i] != NULL; i++) { static void * thread_run_wtperf(void *arg) { - CONFIG *cfg; + WTPERF *wtperf; int ret; - cfg = (CONFIG *)arg; - if ((ret = start_run(cfg)) != 0) - lprintf(cfg, ret, 0, "Run failed for: %s.", cfg->home); + wtperf = (WTPERF *)arg; + if ((ret = start_run(wtperf)) != 0) + lprintf(wtperf, ret, 0, "Run failed for: %s.", wtperf->home); return (NULL); } static int -start_run(CONFIG *cfg) +start_run(WTPERF *wtperf) { + CONFIG_OPTS *opts; pthread_t monitor_thread; uint64_t total_ops; uint32_t run_time; int monitor_created, ret, t_ret; + opts = wtperf->opts; monitor_created = ret = 0; /* [-Wconditional-uninitialized] */ memset(&monitor_thread, 0, sizeof(monitor_thread)); - if ((ret = setup_log_file(cfg)) != 0) + if ((ret = setup_log_file(wtperf)) != 0) goto err; if ((ret = wiredtiger_open( /* Open the real connection. */ - cfg->home, NULL, cfg->conn_config, &cfg->conn)) != 0) { - lprintf(cfg, ret, 0, "Error connecting to %s", cfg->home); + wtperf->home, NULL, opts->conn_config, &wtperf->conn)) != 0) { + lprintf(wtperf, ret, 0, "Error connecting to %s", wtperf->home); goto err; } - create_uris(cfg); + create_uris(wtperf); /* If creating, create the tables. 
*/ - if (cfg->create != 0 && (ret = create_tables(cfg)) != 0) + if (opts->create != 0 && (ret = create_tables(wtperf)) != 0) goto err; /* Start the monitor thread. */ - if (cfg->sample_interval != 0) { + if (opts->sample_interval != 0) { if ((ret = pthread_create( - &monitor_thread, NULL, monitor, cfg)) != 0) { - lprintf( - cfg, ret, 0, "Error creating monitor thread."); + &monitor_thread, NULL, monitor, wtperf)) != 0) { + lprintf(wtperf, + ret, 0, "Error creating monitor thread."); goto err; } monitor_created = 1; } /* If creating, populate the table. */ - if (cfg->create != 0 && execute_populate(cfg) != 0) + if (opts->create != 0 && execute_populate(wtperf) != 0) goto err; /* Optional workload. */ - if (cfg->workers_cnt != 0 && - (cfg->run_time != 0 || cfg->run_ops != 0)) { + if (wtperf->workers_cnt != 0 && + (opts->run_time != 0 || opts->run_ops != 0)) { /* * If we have a workload, close and reopen the connection so * that LSM can detect read-only workloads. */ - if (close_reopen(cfg) != 0) + if (close_reopen(wtperf) != 0) goto err; /* Didn't create, set insert count. */ - if (cfg->create == 0 && cfg->random_range == 0 && - find_table_count(cfg) != 0) + if (opts->create == 0 && + opts->random_range == 0 && find_table_count(wtperf) != 0) goto err; /* Start the checkpoint thread. */ - if (cfg->checkpoint_threads != 0) { - lprintf(cfg, 0, 1, + if (opts->checkpoint_threads != 0) { + lprintf(wtperf, 0, 1, "Starting %" PRIu32 " checkpoint thread(s)", - cfg->checkpoint_threads); - cfg->ckptthreads = dcalloc( - cfg->checkpoint_threads, sizeof(CONFIG_THREAD)); - if (start_threads(cfg, NULL, cfg->ckptthreads, - cfg->checkpoint_threads, checkpoint_worker) != 0) + opts->checkpoint_threads); + wtperf->ckptthreads = dcalloc( + opts->checkpoint_threads, sizeof(WTPERF_THREAD)); + if (start_threads(wtperf, NULL, wtperf->ckptthreads, + opts->checkpoint_threads, checkpoint_worker) != 0) goto err; } /* Execute the workload. 
*/ - if ((ret = execute_workload(cfg)) != 0) + if ((ret = execute_workload(wtperf)) != 0) goto err; /* One final summation of the operations we've completed. */ - cfg->read_ops = sum_read_ops(cfg); - cfg->insert_ops = sum_insert_ops(cfg); - cfg->truncate_ops = sum_truncate_ops(cfg); - cfg->update_ops = sum_update_ops(cfg); - cfg->ckpt_ops = sum_ckpt_ops(cfg); - total_ops = cfg->read_ops + cfg->insert_ops + cfg->update_ops; - - run_time = cfg->run_time == 0 ? 1 : cfg->run_time; - lprintf(cfg, 0, 1, + wtperf->read_ops = sum_read_ops(wtperf); + wtperf->insert_ops = sum_insert_ops(wtperf); + wtperf->truncate_ops = sum_truncate_ops(wtperf); + wtperf->update_ops = sum_update_ops(wtperf); + wtperf->ckpt_ops = sum_ckpt_ops(wtperf); + total_ops = + wtperf->read_ops + wtperf->insert_ops + wtperf->update_ops; + + run_time = opts->run_time == 0 ? 1 : opts->run_time; + lprintf(wtperf, 0, 1, "Executed %" PRIu64 " read operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - cfg->read_ops, (cfg->read_ops * 100) / total_ops, - cfg->read_ops / run_time); - lprintf(cfg, 0, 1, + wtperf->read_ops, (wtperf->read_ops * 100) / total_ops, + wtperf->read_ops / run_time); + lprintf(wtperf, 0, 1, "Executed %" PRIu64 " insert operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - cfg->insert_ops, (cfg->insert_ops * 100) / total_ops, - cfg->insert_ops / run_time); - lprintf(cfg, 0, 1, + wtperf->insert_ops, (wtperf->insert_ops * 100) / total_ops, + wtperf->insert_ops / run_time); + lprintf(wtperf, 0, 1, "Executed %" PRIu64 " truncate operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - cfg->truncate_ops, (cfg->truncate_ops * 100) / total_ops, - cfg->truncate_ops / run_time); - lprintf(cfg, 0, 1, + wtperf->truncate_ops, + (wtperf->truncate_ops * 100) / total_ops, + wtperf->truncate_ops / run_time); + lprintf(wtperf, 0, 1, "Executed %" PRIu64 " update operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - cfg->update_ops, (cfg->update_ops * 100) / total_ops, - cfg->update_ops / run_time); - lprintf(cfg, 0, 1, + 
wtperf->update_ops, (wtperf->update_ops * 100) / total_ops, + wtperf->update_ops / run_time); + lprintf(wtperf, 0, 1, "Executed %" PRIu64 " checkpoint operations", - cfg->ckpt_ops); + wtperf->ckpt_ops); - latency_print(cfg); + latency_print(wtperf); } if (0) { @@ -2175,40 +2292,41 @@ err: if (ret == 0) } /* Notify the worker threads they are done. */ - cfg->stop = 1; + wtperf->stop = true; - if ((t_ret = stop_threads(cfg, 1, cfg->ckptthreads)) != 0) + if ((t_ret = stop_threads(wtperf, 1, wtperf->ckptthreads)) != 0) if (ret == 0) ret = t_ret; if (monitor_created != 0 && (t_ret = pthread_join(monitor_thread, NULL)) != 0) { - lprintf(cfg, ret, 0, "Error joining monitor thread."); + lprintf(wtperf, ret, 0, "Error joining monitor thread."); if (ret == 0) ret = t_ret; } - if (cfg->conn != NULL && - (t_ret = cfg->conn->close(cfg->conn, NULL)) != 0) { - lprintf(cfg, t_ret, 0, - "Error closing connection to %s", cfg->home); + if (wtperf->conn != NULL && opts->close_conn && + (t_ret = wtperf->conn->close(wtperf->conn, NULL)) != 0) { + lprintf(wtperf, t_ret, 0, + "Error closing connection to %s", wtperf->home); if (ret == 0) ret = t_ret; } if (ret == 0) { - if (cfg->run_time == 0 && cfg->run_ops == 0) - lprintf(cfg, 0, 1, "Run completed"); + if (opts->run_time == 0 && opts->run_ops == 0) + lprintf(wtperf, 0, 1, "Run completed"); else - lprintf(cfg, 0, 1, "Run completed: %" PRIu32 " %s", - cfg->run_time == 0 ? cfg->run_ops : cfg->run_time, - cfg->run_time == 0 ? "operations" : "seconds"); + lprintf(wtperf, 0, 1, "Run completed: %" PRIu32 " %s", + opts->run_time == 0 ? + opts->run_ops : opts->run_time, + opts->run_time == 0 ? 
"operations" : "seconds"); } - if (cfg->logf != NULL) { - if ((t_ret = fflush(cfg->logf)) != 0 && ret == 0) + if (wtperf->logf != NULL) { + if ((t_ret = fflush(wtperf->logf)) != 0 && ret == 0) ret = t_ret; - if ((t_ret = fclose(cfg->logf)) != 0 && ret == 0) + if ((t_ret = fclose(wtperf->logf)) != 0 && ret == 0) ret = t_ret; } return (ret); @@ -2216,33 +2334,56 @@ err: if (ret == 0) extern int __wt_optind, __wt_optreset; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; + +/* + * usage -- + * wtperf usage print, no error. + */ +static void +usage(void) +{ + printf("wtperf [-C config] " + "[-H mount] [-h home] [-O file] [-o option] [-T config]\n"); + printf("\t-C <string> additional connection configuration\n"); + printf("\t (added to option conn_config)\n"); + printf("\t-H <mount> configure Helium volume mount point\n"); + printf("\t-h <string> Wired Tiger home must exist, default WT_TEST\n"); + printf("\t-O <file> file contains options as listed below\n"); + printf("\t-o option=val[,option=val,...] set options listed below\n"); + printf("\t-T <string> additional table configuration\n"); + printf("\t (added to option table_config)\n"); + printf("\n"); + config_opt_usage(); +} int main(int argc, char *argv[]) { - CONFIG *cfg, _cfg; + CONFIG_OPTS *opts; + WTPERF *wtperf, _wtperf; size_t req_len, sreq_len; bool monitor_set; int ch, ret; - const char *opts = "C:h:m:O:o:T:"; + const char *cmdflags = "C:h:m:O:o:T:"; const char *config_opts; - char *cc_buf, *sess_cfg, *tc_buf, *user_cconfig, *user_tconfig; + char *cc_buf, *path, *sess_cfg, *tc_buf, *user_cconfig, *user_tconfig; + + /* The first WTPERF structure (from which all others are derived). 
*/ + wtperf = &_wtperf; + memset(wtperf, 0, sizeof(*wtperf)); + wtperf->home = dstrdup(DEFAULT_HOME); + wtperf->monitor_dir = dstrdup(DEFAULT_MONITOR_DIR); + TAILQ_INIT(&wtperf->stone_head); + config_opt_init(&wtperf->opts); + opts = wtperf->opts; monitor_set = false; ret = 0; config_opts = NULL; cc_buf = sess_cfg = tc_buf = user_cconfig = user_tconfig = NULL; - /* Setup the default configuration values. */ - cfg = &_cfg; - memset(cfg, 0, sizeof(*cfg)); - config_copy(cfg, &default_cfg); - cfg->home = dstrdup(DEFAULT_HOME); - cfg->monitor_dir = dstrdup(DEFAULT_MONITOR_DIR); - /* Do a basic validation of options, and home is needed before open. */ - while ((ch = __wt_getopt("wtperf", argc, argv, opts)) != EOF) + while ((ch = __wt_getopt("wtperf", argc, argv, cmdflags)) != EOF) switch (ch) { case 'C': if (user_cconfig == NULL) @@ -2256,12 +2397,12 @@ main(int argc, char *argv[]) } break; case 'h': - free(cfg->home); - cfg->home = dstrdup(__wt_optarg); + free(wtperf->home); + wtperf->home = dstrdup(__wt_optarg); break; case 'm': - free(cfg->monitor_dir); - cfg->monitor_dir = dstrdup(__wt_optarg); + free(wtperf->monitor_dir); + wtperf->monitor_dir = dstrdup(__wt_optarg); monitor_set = true; break; case 'O': @@ -2288,47 +2429,48 @@ main(int argc, char *argv[]) * monitor directory to the home dir. */ if (!monitor_set) { - free(cfg->monitor_dir); - cfg->monitor_dir = dstrdup(cfg->home); + free(wtperf->monitor_dir); + wtperf->monitor_dir = dstrdup(wtperf->home); } /* Parse configuration settings from configuration file. */ - if (config_opts != NULL && config_opt_file(cfg, config_opts) != 0) + if (config_opts != NULL && config_opt_file(wtperf, config_opts) != 0) goto einval; /* Parse options that override values set via a configuration file. 
*/ __wt_optreset = __wt_optind = 1; - while ((ch = __wt_getopt("wtperf", argc, argv, opts)) != EOF) + while ((ch = __wt_getopt("wtperf", argc, argv, cmdflags)) != EOF) switch (ch) { case 'o': /* Allow -o key=value */ - if (config_opt_line(cfg, __wt_optarg) != 0) + if (config_opt_str(wtperf, __wt_optarg) != 0) goto einval; break; } - if (cfg->populate_threads == 0 && cfg->icount != 0) { - lprintf(cfg, 1, 0, + if (opts->populate_threads == 0 && opts->icount != 0) { + lprintf(wtperf, 1, 0, "Cannot have 0 populate threads when icount is set\n"); goto err; } - cfg->async_config = NULL; + wtperf->async_config = NULL; /* * If the user specified async_threads we use async for all ops. * If the user wants compaction, then we also enable async for * the compact operation, but not for the workloads. */ - if (cfg->async_threads > 0) { - if (F_ISSET(cfg, CFG_TRUNCATE)) { - lprintf(cfg, 1, 0, "Cannot run truncate and async\n"); + if (opts->async_threads > 0) { + if (F_ISSET(wtperf, CFG_TRUNCATE)) { + lprintf(wtperf, + 1, 0, "Cannot run truncate and async\n"); goto err; } - cfg->use_asyncops = 1; + wtperf->use_asyncops = true; } - if (cfg->compact && cfg->async_threads == 0) - cfg->async_threads = 2; - if (cfg->async_threads > 0) { + if (opts->compact && opts->async_threads == 0) + opts->async_threads = 2; + if (opts->async_threads > 0) { /* * The maximum number of async threads is two digits, so just * use that to compute the space we need. Assume the default @@ -2336,145 +2478,133 @@ main(int argc, char *argv[]) * to 4096 if needed. 
*/ req_len = strlen(",async=(enabled=true,threads=)") + 4; - cfg->async_config = dmalloc(req_len); - snprintf(cfg->async_config, req_len, + wtperf->async_config = dmalloc(req_len); + snprintf(wtperf->async_config, req_len, ",async=(enabled=true,threads=%" PRIu32 ")", - cfg->async_threads); + opts->async_threads); } - if ((ret = config_compress(cfg)) != 0) + if ((ret = config_compress(wtperf)) != 0) goto err; /* You can't have truncate on a random collection. */ - if (F_ISSET(cfg, CFG_TRUNCATE) && cfg->random_range) { - lprintf(cfg, 1, 0, "Cannot run truncate and random_range\n"); + if (F_ISSET(wtperf, CFG_TRUNCATE) && opts->random_range) { + lprintf(wtperf, 1, 0, "Cannot run truncate and random_range\n"); goto err; } /* We can't run truncate with more than one table. */ - if (F_ISSET(cfg, CFG_TRUNCATE) && cfg->table_count > 1) { - lprintf(cfg, 1, 0, "Cannot truncate more than 1 table\n"); + if (F_ISSET(wtperf, CFG_TRUNCATE) && opts->table_count > 1) { + lprintf(wtperf, 1, 0, "Cannot truncate more than 1 table\n"); goto err; } - /* Build the URI from the table name. */ - req_len = strlen("table:") + strlen(cfg->table_name) + 2; - cfg->base_uri = dmalloc(req_len); - snprintf(cfg->base_uri, req_len, "table:%s", cfg->table_name); - /* Make stdout line buffered, so verbose output appears quickly. */ __wt_stream_set_line_buffer(stdout); /* Concatenate non-default configuration strings. 
*/ - if (cfg->verbose > 1 || user_cconfig != NULL || - cfg->session_count_idle > 0 || cfg->compress_ext != NULL || - cfg->async_config != NULL) { - req_len = strlen(debug_cconfig) + 3; + if (opts->verbose > 1 || user_cconfig != NULL || + opts->session_count_idle > 0 || wtperf->compress_ext != NULL || + wtperf->async_config != NULL) { + req_len = strlen(debug_cconfig) + 20; if (user_cconfig != NULL) req_len += strlen(user_cconfig); - if (cfg->async_config != NULL) - req_len += strlen(cfg->async_config); - if (cfg->compress_ext != NULL) - req_len += strlen(cfg->compress_ext); - if (cfg->session_count_idle > 0) { + if (wtperf->async_config != NULL) + req_len += strlen(wtperf->async_config); + if (wtperf->compress_ext != NULL) + req_len += strlen(wtperf->compress_ext); + if (opts->session_count_idle > 0) { sreq_len = strlen(",session_max=") + 6; req_len += sreq_len; sess_cfg = dmalloc(sreq_len); snprintf(sess_cfg, sreq_len, ",session_max=%" PRIu32, - cfg->session_count_idle + cfg->workers_cnt + - cfg->populate_threads + 10); + opts->session_count_idle + + wtperf->workers_cnt + opts->populate_threads + 10); } cc_buf = dmalloc(req_len); - /* - * This is getting hard to parse. - */ - snprintf(cc_buf, req_len, "%s%s%s%s%s%s%s", - cfg->async_config ? cfg->async_config : "", - cfg->compress_ext ? cfg->compress_ext : "", - cfg->verbose > 1 && strlen(debug_cconfig) ? ",": "", - cfg->verbose > 1 && - strlen(debug_cconfig) ? debug_cconfig : "", - sess_cfg ? sess_cfg : "", - user_cconfig ? ",": "", - user_cconfig ? user_cconfig : ""); - if (strlen(cc_buf)) - if ((ret = config_opt_str( - cfg, "conn_config", cc_buf)) != 0) - goto err; + snprintf(cc_buf, req_len, "%s,%s,%s,%s,%s", + wtperf->async_config ? wtperf->async_config : "", + wtperf->compress_ext ? wtperf->compress_ext : "", + opts->verbose > 1 ? debug_cconfig : "", + sess_cfg != NULL ? sess_cfg : "", + user_cconfig != NULL ? 
user_cconfig : ""); + if (strlen(cc_buf) && (ret = + config_opt_name_value(wtperf, "conn_config", cc_buf)) != 0) + goto err; } - if (cfg->verbose > 1 || cfg->index || - user_tconfig != NULL || cfg->compress_table != NULL) { - req_len = strlen(debug_tconfig) + 3; + if (opts->verbose > 1 || opts->index || + user_tconfig != NULL || wtperf->compress_table != NULL) { + req_len = strlen(debug_tconfig) + 20; if (user_tconfig != NULL) req_len += strlen(user_tconfig); - if (cfg->compress_table != NULL) - req_len += strlen(cfg->compress_table); - if (cfg->index) + if (wtperf->compress_table != NULL) + req_len += strlen(wtperf->compress_table); + if (opts->index) req_len += strlen(INDEX_COL_NAMES); tc_buf = dmalloc(req_len); - /* - * This is getting hard to parse. - */ - snprintf(tc_buf, req_len, "%s%s%s%s%s%s", - cfg->index ? INDEX_COL_NAMES : "", - cfg->compress_table ? cfg->compress_table : "", - cfg->verbose > 1 && strlen(debug_tconfig) ? ",": "", - cfg->verbose > 1 && - strlen(debug_tconfig) ? debug_tconfig : "", - user_tconfig ? ",": "", + snprintf(tc_buf, req_len, "%s,%s,%s,%s", + opts->index ? INDEX_COL_NAMES : "", + wtperf->compress_table != NULL ? + wtperf->compress_table : "", + opts->verbose > 1 ? debug_tconfig : "", user_tconfig ? 
user_tconfig : ""); - if (strlen(tc_buf)) - if ((ret = config_opt_str( - cfg, "table_config", tc_buf)) != 0) - goto err; + if (strlen(tc_buf) && (ret = + config_opt_name_value(wtperf, "table_config", tc_buf)) != 0) + goto err; } - if (cfg->log_partial && cfg->table_count > 1) { - req_len = strlen(cfg->table_config) + + if (opts->log_partial && opts->table_count > 1) { + req_len = strlen(opts->table_config) + strlen(LOG_PARTIAL_CONFIG) + 1; - cfg->partial_config = dmalloc(req_len); - snprintf(cfg->partial_config, req_len, "%s%s", - cfg->table_config, LOG_PARTIAL_CONFIG); + wtperf->partial_config = dmalloc(req_len); + snprintf(wtperf->partial_config, req_len, "%s%s", + opts->table_config, LOG_PARTIAL_CONFIG); } /* * Set the config for reopen. If readonly add in that string. * If not readonly then just copy the original conn_config. */ - if (cfg->readonly) - req_len = strlen(cfg->conn_config) + + if (opts->readonly) + req_len = strlen(opts->conn_config) + strlen(READONLY_CONFIG) + 1; else - req_len = strlen(cfg->conn_config) + 1; - cfg->reopen_config = dmalloc(req_len); - if (cfg->readonly) - snprintf(cfg->reopen_config, req_len, "%s%s", - cfg->conn_config, READONLY_CONFIG); + req_len = strlen(opts->conn_config) + 1; + wtperf->reopen_config = dmalloc(req_len); + if (opts->readonly) + snprintf(wtperf->reopen_config, req_len, "%s%s", + opts->conn_config, READONLY_CONFIG); else - snprintf(cfg->reopen_config, req_len, "%s", - cfg->conn_config); + snprintf(wtperf->reopen_config, + req_len, "%s", opts->conn_config); /* Sanity-check the configuration. */ - if ((ret = config_sanity(cfg)) != 0) + if ((ret = config_sanity(wtperf)) != 0) goto err; /* If creating, remove and re-create the home directory. */ - if (cfg->create != 0) - recreate_dir(cfg->home); + if (opts->create != 0) + recreate_dir(wtperf->home); /* Write a copy of the config. 
*/ - config_to_file(cfg); + req_len = strlen(wtperf->home) + strlen("/CONFIG.wtperf") + 1; + path = dmalloc(req_len); + snprintf(path, req_len, "%s/CONFIG.wtperf", wtperf->home); + config_opt_log(opts, path); + free(path); /* Display the configuration. */ - if (cfg->verbose > 1) - config_print(cfg); + if (opts->verbose > 1) + config_opt_print(wtperf); - if ((ret = start_all_runs(cfg)) != 0) + if ((ret = start_all_runs(wtperf)) != 0) goto err; if (0) { einval: ret = EINVAL; } -err: config_free(cfg); +err: wtperf_free(wtperf); + config_opt_cleanup(opts); + free(cc_buf); free(sess_cfg); free(tc_buf); @@ -2485,26 +2615,26 @@ err: config_free(cfg); } static int -start_threads(CONFIG *cfg, - WORKLOAD *workp, CONFIG_THREAD *base, u_int num, void *(*func)(void *)) +start_threads(WTPERF *wtperf, + WORKLOAD *workp, WTPERF_THREAD *base, u_int num, void *(*func)(void *)) { - CONFIG_THREAD *thread; + CONFIG_OPTS *opts; + WTPERF_THREAD *thread; u_int i; int ret; + opts = wtperf->opts; + /* Initialize the threads. */ for (i = 0, thread = base; i < num; ++i, ++thread) { - thread->cfg = cfg; + thread->wtperf = wtperf; thread->workload = workp; /* * We don't want the threads executing in lock-step, seed each * one differently. */ - if ((ret = __wt_random_init_seed(NULL, &thread->rnd)) != 0) { - lprintf(cfg, ret, 0, "Error initializing RNG"); - return (ret); - } + __wt_random_init_seed(NULL, &thread->rnd); /* * Every thread gets a key/data buffer because we don't bother @@ -2512,14 +2642,14 @@ start_threads(CONFIG *cfg, * don't, it's not enough memory to bother. These buffers hold * strings: trailing NUL is included in the size. */ - thread->key_buf = dcalloc(cfg->key_sz, 1); - thread->value_buf = dcalloc(cfg->value_sz_max, 1); + thread->key_buf = dcalloc(opts->key_sz, 1); + thread->value_buf = dcalloc(opts->value_sz_max, 1); /* * Initialize and then toss in a bit of random values if needed. 
*/ - memset(thread->value_buf, 'a', cfg->value_sz - 1); - if (cfg->random_value) + memset(thread->value_buf, 'a', opts->value_sz - 1); + if (opts->random_value) randomize_value(thread, thread->value_buf); /* @@ -2537,7 +2667,7 @@ start_threads(CONFIG *cfg, for (i = 0, thread = base; i < num; ++i, ++thread) if ((ret = pthread_create( &thread->handle, NULL, func, thread)) != 0) { - lprintf(cfg, ret, 0, "Error creating thread"); + lprintf(wtperf, ret, 0, "Error creating thread"); return (ret); } @@ -2545,7 +2675,7 @@ start_threads(CONFIG *cfg, } static int -stop_threads(CONFIG *cfg, u_int num, CONFIG_THREAD *threads) +stop_threads(WTPERF *wtperf, u_int num, WTPERF_THREAD *threads) { u_int i; int ret; @@ -2555,7 +2685,7 @@ stop_threads(CONFIG *cfg, u_int num, CONFIG_THREAD *threads) for (i = 0; i < num; ++i, ++threads) { if ((ret = pthread_join(threads->handle, NULL)) != 0) { - lprintf(cfg, ret, 0, "Error joining thread"); + lprintf(wtperf, ret, 0, "Error joining thread"); return (ret); } @@ -2588,35 +2718,38 @@ recreate_dir(const char *name) } static int -drop_all_tables(CONFIG *cfg) +drop_all_tables(WTPERF *wtperf) { struct timespec start, stop; + CONFIG_OPTS *opts; WT_SESSION *session; size_t i; uint64_t msecs; int ret, t_ret; + opts = wtperf->opts; + /* Drop any tables. 
*/ - if ((ret = cfg->conn->open_session( - cfg->conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, - "Error opening a session on %s", cfg->home); + if ((ret = wtperf->conn->open_session( + wtperf->conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, + "Error opening a session on %s", wtperf->home); return (ret); } - testutil_check(__wt_epoch(NULL, &start)); - for (i = 0; i < cfg->table_count; i++) { - if ((ret = session->drop( - session, cfg->uris[i], NULL)) != 0) { - lprintf(cfg, ret, 0, - "Error dropping table %s", cfg->uris[i]); + __wt_epoch(NULL, &start); + for (i = 0; i < opts->table_count; i++) { + if ((ret = + session->drop(session, wtperf->uris[i], NULL)) != 0) { + lprintf(wtperf, ret, 0, + "Error dropping table %s", wtperf->uris[i]); goto err; } } - testutil_check(__wt_epoch(NULL, &stop)); + __wt_epoch(NULL, &stop); msecs = WT_TIMEDIFF_MS(stop, start); - lprintf(cfg, 0, 1, + lprintf(wtperf, 0, 1, "Executed %" PRIu32 " drop operations average time %" PRIu64 "ms", - cfg->table_count, msecs / cfg->table_count); + opts->table_count, msecs / opts->table_count); err: if ((t_ret = session->close(session, NULL)) != 0 && ret == 0) ret = t_ret; @@ -2624,27 +2757,34 @@ err: if ((t_ret = session->close(session, NULL)) != 0 && ret == 0) } static uint64_t -wtperf_value_range(CONFIG *cfg) +wtperf_value_range(WTPERF *wtperf) { - if (cfg->random_range) - return (cfg->icount + cfg->random_range); + CONFIG_OPTS *opts; + + opts = wtperf->opts; + + if (opts->random_range) + return (opts->icount + opts->random_range); /* * It is legal to configure a zero size populate phase, hide that * from other code by pretending the range is 1 in that case. 
*/ - if (cfg->icount + cfg->insert_key == 0) + if (opts->icount + wtperf->insert_key == 0) return (1); - return (cfg->icount + cfg->insert_key - (u_int)(cfg->workers_cnt + 1)); + return (opts->icount + + wtperf->insert_key - (u_int)(wtperf->workers_cnt + 1)); } static uint64_t -wtperf_rand(CONFIG_THREAD *thread) +wtperf_rand(WTPERF_THREAD *thread) { - CONFIG *cfg; + CONFIG_OPTS *opts; + WTPERF *wtperf; double S1, S2, U; uint64_t rval; - cfg = thread->cfg; + wtperf = thread->wtperf; + opts = wtperf->opts; /* * Use WiredTiger's random number routine: it's lock-free and fairly @@ -2653,11 +2793,11 @@ wtperf_rand(CONFIG_THREAD *thread) rval = __wt_random(&thread->rnd); /* Use Pareto distribution to give 80/20 hot/cold values. */ - if (cfg->pareto != 0) { + if (opts->pareto != 0) { #define PARETO_SHAPE 1.5 S1 = (-1 / PARETO_SHAPE); - S2 = wtperf_value_range(cfg) * - (cfg->pareto / 100.0) * (PARETO_SHAPE - 1); + S2 = wtperf_value_range(wtperf) * + (opts->pareto / 100.0) * (PARETO_SHAPE - 1); U = 1 - (double)rval / (double)UINT32_MAX; rval = (uint64_t)((pow(U, S1) - 1) * S2); /* @@ -2665,13 +2805,13 @@ wtperf_rand(CONFIG_THREAD *thread) * 2% of the time, from my testing. That will lead to the * first item in the table being "hot". */ - if (rval > wtperf_value_range(cfg)) + if (rval > wtperf_value_range(wtperf)) rval = 0; } /* * Wrap the key to within the expected range and avoid zero: we never * insert that key. 
*/ - rval = (rval % wtperf_value_range(cfg)) + 1; + rval = (rval % wtperf_value_range(wtperf)) + 1; return (rval); } diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index 1bb94db2634..81d74e134f6 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -36,8 +36,8 @@ #include "config_opt.h" -typedef struct __config CONFIG; -typedef struct __config_thread CONFIG_THREAD; +typedef struct __wtperf WTPERF; +typedef struct __wtperf_thread WTPERF_THREAD; typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY; #define EXT_PFX ",extensions=(" @@ -54,6 +54,9 @@ typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY; #define ZLIB_BLK BLKCMP_PFX "zlib" #define ZLIB_EXT \ EXT_PFX EXTPATH "zlib/.libs/libwiredtiger_zlib.so" EXT_SFX +#define ZSTD_BLK BLKCMP_PFX "zstd" +#define ZSTD_EXT \ + EXT_PFX EXTPATH "zstd/.libs/libwiredtiger_zstd.so" EXT_SFX typedef struct { int64_t threads; /* Thread count */ @@ -95,12 +98,6 @@ struct __truncate_queue_entry { TAILQ_ENTRY(__truncate_queue_entry) q; }; -struct __config_queue_entry { - char *string; - TAILQ_ENTRY(__config_queue_entry) c; -}; -typedef struct __config_queue_entry CONFIG_QUEUE_ENTRY; - /* Steering for the throttle configuration */ typedef struct { struct timespec last_increment; /* Time that we last added more ops */ @@ -111,40 +108,35 @@ typedef struct { #define LOG_PARTIAL_CONFIG ",log=(enabled=false)" #define READONLY_CONFIG ",readonly=true" -/* - * NOTE: If you add any fields to this structure here, you must also add - * an initialization in wtperf.c in the default_cfg. 
- */ -struct __config { /* Configuration structure */ +struct __wtperf { /* Per-database structure */ char *home; /* WiredTiger home */ char *monitor_dir; /* Monitor output dir */ char *partial_config; /* Config string for partial logging */ char *reopen_config; /* Config string for conn reopen */ - char *base_uri; /* Object URI */ - char *log_table_uri; /* URI for log table */ - char **uris; /* URIs if multiple tables */ + char *log_table_uri; /* URI for log table */ + char **uris; /* URIs */ WT_CONNECTION *conn; /* Database connection */ FILE *logf; /* Logging handle */ - char *async_config; /* Config string for async */ + char *async_config; /* Config string for async */ + bool use_asyncops; /* Use async operations */ const char *compress_ext; /* Compression extension for conn */ const char *compress_table; /* Compression arg to table create */ - CONFIG_THREAD *ckptthreads, *popthreads; + WTPERF_THREAD *ckptthreads; /* Checkpoint threads */ + WTPERF_THREAD *popthreads; /* Populate threads */ #define WORKLOAD_MAX 50 - CONFIG_THREAD *workers; /* Worker threads */ + WTPERF_THREAD *workers; /* Worker threads */ u_int workers_cnt; WORKLOAD *workload; /* Workloads */ u_int workload_cnt; - uint32_t use_asyncops; /* Use async operations */ /* State tracking variables. 
*/ - uint64_t ckpt_ops; /* checkpoint operations */ uint64_t insert_ops; /* insert operations */ uint64_t read_ops; /* read operations */ @@ -154,10 +146,10 @@ struct __config { /* Configuration structure */ uint64_t insert_key; /* insert key */ uint64_t log_like_table_key; /* used to allocate IDs for log table */ - volatile int ckpt; /* checkpoint in progress */ - volatile int error; /* thread error */ - volatile int stop; /* notify threads to stop */ - volatile int in_warmup; /* Running warmup phase */ + volatile bool ckpt; /* checkpoint in progress */ + volatile bool error; /* thread error */ + volatile bool stop; /* notify threads to stop */ + volatile bool in_warmup; /* running warmup phase */ volatile bool idle_cycle_run; /* Signal for idle cycle thread */ @@ -171,13 +163,7 @@ struct __config { /* Configuration structure */ /* Queue head for use with the Truncate Logic */ TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head; - /* Queue head to save a copy of the config to be output */ - TAILQ_HEAD(__config_qh, __config_queue_entry) config_head; - - /* Fields changeable on command line are listed in wtperf_opt.i */ -#define OPT_DECLARE_STRUCT -#include "wtperf_opt.i" -#undef OPT_DECLARE_STRUCT + CONFIG_OPTS *opts; /* Global configuration */ }; #define ELEMENTS(a) (sizeof(a) / sizeof(a[0])) @@ -237,8 +223,8 @@ typedef struct { uint32_t sec[100]; /* < 1s 2s ... 
100s */ } TRACK; -struct __config_thread { /* Per-thread structure */ - CONFIG *cfg; /* Enclosing configuration */ +struct __wtperf_thread { /* Per-thread structure */ + WTPERF *wtperf; /* Enclosing configuration */ WT_RAND_STATE rnd; /* Random number generation state */ @@ -260,50 +246,45 @@ struct __config_thread { /* Per-thread structure */ TRACK truncate_sleep; /* Truncate sleep operations */ }; -void cleanup_truncate_config(CONFIG *); -int config_compress(CONFIG *); -void config_free(CONFIG *); -void config_copy(CONFIG *, const CONFIG *); -int config_opt_file(CONFIG *, const char *); -int config_opt_line(CONFIG *, const char *); -int config_opt_str(CONFIG *, const char *, const char *); -void config_to_file(CONFIG *); -void config_consolidate(CONFIG *); -void config_print(CONFIG *); -int config_sanity(CONFIG *); -void latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *); -void latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *); -void latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *); -void latency_print(CONFIG *); +void cleanup_truncate_config(WTPERF *); +int config_opt_file(WTPERF *, const char *); +void config_opt_cleanup(CONFIG_OPTS *); +void config_opt_init(CONFIG_OPTS **); +void config_opt_log(CONFIG_OPTS *, const char *); +int config_opt_name_value(WTPERF *, const char *, const char *); +void config_opt_print(WTPERF *); +int config_opt_str(WTPERF *, const char *); +void config_opt_usage(void); +int config_sanity(WTPERF *); +void latency_insert(WTPERF *, uint32_t *, uint32_t *, uint32_t *); +void latency_print(WTPERF *); +void latency_read(WTPERF *, uint32_t *, uint32_t *, uint32_t *); +void latency_update(WTPERF *, uint32_t *, uint32_t *, uint32_t *); int run_truncate( - CONFIG *, CONFIG_THREAD *, WT_CURSOR *, WT_SESSION *, int *); -int setup_log_file(CONFIG *); -void setup_throttle(CONFIG_THREAD*); -int setup_truncate(CONFIG *, CONFIG_THREAD *, WT_SESSION *); -int start_idle_table_cycle(CONFIG *, pthread_t *); -int 
stop_idle_table_cycle(CONFIG *, pthread_t); -uint64_t sum_ckpt_ops(CONFIG *); -uint64_t sum_insert_ops(CONFIG *); -uint64_t sum_pop_ops(CONFIG *); -uint64_t sum_read_ops(CONFIG *); -uint64_t sum_truncate_ops(CONFIG *); -uint64_t sum_update_ops(CONFIG *); -void usage(void); -void worker_throttle(CONFIG_THREAD*); - -void lprintf(const CONFIG *, int err, uint32_t, const char *, ...) + WTPERF *, WTPERF_THREAD *, WT_CURSOR *, WT_SESSION *, int *); +int setup_log_file(WTPERF *); +void setup_throttle(WTPERF_THREAD *); +int setup_truncate(WTPERF *, WTPERF_THREAD *, WT_SESSION *); +int start_idle_table_cycle(WTPERF *, pthread_t *); +int stop_idle_table_cycle(WTPERF *, pthread_t); +void worker_throttle(WTPERF_THREAD *); +uint64_t sum_ckpt_ops(WTPERF *); +uint64_t sum_insert_ops(WTPERF *); +uint64_t sum_pop_ops(WTPERF *); +uint64_t sum_read_ops(WTPERF *); +uint64_t sum_truncate_ops(WTPERF *); +uint64_t sum_update_ops(WTPERF *); + +void lprintf(const WTPERF *, int err, uint32_t, const char *, ...) #if defined(__GNUC__) __attribute__((format (printf, 4, 5))) #endif ; static inline void -generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno) +generate_key(CONFIG_OPTS *opts, char *key_buf, uint64_t keyno) { - /* - * Don't change to snprintf, sprintf is faster in some tests. 
- */ - sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno); + u64_to_string_zf(keyno, key_buf, opts->key_sz); } static inline void diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index 17517ffe477..680eb53a90e 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -38,14 +38,14 @@ #ifdef OPT_DEFINE_DESC #define DEF_OPT_AS_BOOL(name, initval, desc) \ - { #name, desc, #initval, BOOL_TYPE, offsetof(CONFIG, name) }, + { #name, desc, #initval, BOOL_TYPE, offsetof(CONFIG_OPTS, name) }, #define DEF_OPT_AS_CONFIG_STRING(name, initval, desc) \ { #name, desc, initval, CONFIG_STRING_TYPE, \ - offsetof(CONFIG, name) }, + offsetof(CONFIG_OPTS, name) }, #define DEF_OPT_AS_STRING(name, initval, desc) \ - { #name, desc, initval, STRING_TYPE, offsetof(CONFIG, name) }, + { #name, desc, initval, STRING_TYPE, offsetof(CONFIG_OPTS, name) }, #define DEF_OPT_AS_UINT32(name, initval, desc) \ - { #name, desc, #initval, UINT32_TYPE, offsetof(CONFIG, name) }, + { #name, desc, #initval, UINT32_TYPE, offsetof(CONFIG_OPTS, name) }, #endif #ifdef OPT_DEFINE_DEFAULT @@ -57,13 +57,13 @@ #ifdef OPT_DEFINE_DOXYGEN #define DEF_OPT_AS_BOOL(name, initval, desc) \ - { #name, desc, #initval, BOOL_TYPE, 0 }, + OPTION #name, desc, #initval, boolean #define DEF_OPT_AS_CONFIG_STRING(name, initval, desc) \ - { #name, desc, initval, CONFIG_STRING_TYPE, 0 }, + OPTION #name, desc, initval, string #define DEF_OPT_AS_STRING(name, initval, desc) \ - { #name, desc, initval, STRING_TYPE, 0 }, + OPTION #name, desc, initval, string #define DEF_OPT_AS_UINT32(name, initval, desc) \ - { #name, desc, #initval, UINT32_TYPE, 0 }, + OPTION #name, desc, #initval, unsigned int #endif /* @@ -94,17 +94,20 @@ DEF_OPT_AS_UINT32(checkpoint_stress_rate, 0, DEF_OPT_AS_UINT32(checkpoint_threads, 0, "number of checkpoint threads") DEF_OPT_AS_CONFIG_STRING(conn_config, "create", "connection configuration string") +DEF_OPT_AS_BOOL(close_conn, 1, "properly close connection at end of test. 
" + "Setting to false does not sync data to disk and can result in lost " + "data after test exits.") DEF_OPT_AS_BOOL(compact, 0, "post-populate compact for LSM merging activity") DEF_OPT_AS_STRING(compression, "none", "compression extension. Allowed configuration values are: " - "'none', 'lz4', 'snappy', 'zlib'") + "'none', 'lz4', 'snappy', 'zlib', 'zstd'") DEF_OPT_AS_BOOL(create, 1, "do population phase; false to use existing database") DEF_OPT_AS_UINT32(database_count, 1, "number of WiredTiger databases to use. Each database will execute the" " workload using a separate home directory and complete set of worker" " threads") -DEF_OPT_AS_UINT32(drop_tables, 0, +DEF_OPT_AS_BOOL(drop_tables, 0, "Whether to drop all tables at the end of the run, and report time taken" " to do the drop.") DEF_OPT_AS_UINT32(icount, 5000, @@ -193,8 +196,8 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are " "also behavior modifiers, supported modifiers are 'ops_per_txn'") DEF_OPT_AS_CONFIG_STRING(transaction_config, "", - "transaction configuration string, relevant when populate_opts_per_txn " - "is nonzero") + "WT_SESSION.begin_transaction configuration string, applied during the " + "populate phase when populate_ops_per_txn is nonzero") DEF_OPT_AS_STRING(table_name, "test", "table name") DEF_OPT_AS_BOOL(truncate_single_ops, 0, "Implement truncate via cursor remove instead of session API") diff --git a/bench/wtperf/wtperf_throttle.c b/bench/wtperf/wtperf_throttle.c index e49bca00d07..d104a68175d 100644 --- a/bench/wtperf/wtperf_throttle.c +++ b/bench/wtperf/wtperf_throttle.c @@ -32,7 +32,7 @@ * Put the initial config together for running a throttled workload. 
*/ void -setup_throttle(CONFIG_THREAD *thread) +setup_throttle(WTPERF_THREAD *thread) { THROTTLE_CONFIG *throttle_cfg; @@ -70,7 +70,7 @@ setup_throttle(CONFIG_THREAD *thread) throttle_cfg->ops_count = throttle_cfg->ops_per_increment; /* Set the first timestamp of when we incremented */ - testutil_check(__wt_epoch(NULL, &throttle_cfg->last_increment)); + __wt_epoch(NULL, &throttle_cfg->last_increment); } /* @@ -78,7 +78,7 @@ setup_throttle(CONFIG_THREAD *thread) * counter to perform more operations. */ void -worker_throttle(CONFIG_THREAD *thread) +worker_throttle(WTPERF_THREAD *thread) { THROTTLE_CONFIG *throttle_cfg; struct timespec now; @@ -86,7 +86,7 @@ worker_throttle(CONFIG_THREAD *thread) throttle_cfg = &thread->throttle_cfg; - testutil_check(__wt_epoch(NULL, &now)); + __wt_epoch(NULL, &now); /* * If we did enough operations in the current interval, sleep for @@ -101,7 +101,7 @@ worker_throttle(CONFIG_THREAD *thread) /* * After sleeping, set the interval to the current time. */ - testutil_check(__wt_epoch(NULL, &throttle_cfg->last_increment)); + __wt_epoch(NULL, &throttle_cfg->last_increment); } else { throttle_cfg->ops_count = (usecs_delta * throttle_cfg->ops_per_increment) / diff --git a/bench/wtperf/wtperf_truncate.c b/bench/wtperf/wtperf_truncate.c index e6ebc83c681..3fbb740d2c8 100644 --- a/bench/wtperf/wtperf_truncate.c +++ b/bench/wtperf/wtperf_truncate.c @@ -35,8 +35,9 @@ decode_key(char *key_buf) } int -setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) { - +setup_truncate(WTPERF *wtperf, WTPERF_THREAD *thread, WT_SESSION *session) +{ + CONFIG_OPTS *opts; TRUNCATE_CONFIG *trunc_cfg; TRUNCATE_QUEUE_ENTRY *truncate_item; WORKLOAD *workload; @@ -45,13 +46,14 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) { int ret; uint64_t end_point, final_stone_gap, i, start_point; + opts = wtperf->opts; end_point = final_stone_gap = start_point = 0; trunc_cfg = &thread->trunc_cfg; workload = thread->workload; /* We are 
limited to only one table when running truncate. */ if ((ret = session->open_cursor( - session, cfg->uris[0], NULL, NULL, &cursor)) != 0) + session, wtperf->uris[0], NULL, NULL, &cursor)) != 0) goto err; /* @@ -79,14 +81,14 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) { */ if ((ret = cursor->next(cursor)) != 0 || (ret = cursor->get_key(cursor, &key)) != 0) { - lprintf(cfg, ret, 0, "truncate setup start: failed"); + lprintf(wtperf, ret, 0, "truncate setup start: failed"); goto err; } start_point = decode_key(key); if ((cursor->reset(cursor)) != 0 || (ret = cursor->prev(cursor)) != 0 || (ret = cursor->get_key(cursor, &key)) != 0) { - lprintf(cfg, ret, 0, "truncate setup end: failed"); + lprintf(wtperf, ret, 0, "truncate setup end: failed"); goto err; } end_point = decode_key(key); @@ -104,12 +106,13 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) { for (i = 1; i <= trunc_cfg->needed_stones; i++) { truncate_item = dcalloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1); - truncate_item->key = dcalloc(cfg->key_sz, 1); + truncate_item->key = dcalloc(opts->key_sz, 1); generate_key( - cfg, truncate_item->key, trunc_cfg->stone_gap * i); + opts, truncate_item->key, trunc_cfg->stone_gap * i); truncate_item->diff = (trunc_cfg->stone_gap * i) - trunc_cfg->last_key; - TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q); + TAILQ_INSERT_TAIL( + &wtperf->stone_head, truncate_item, q); trunc_cfg->last_key = trunc_cfg->stone_gap * i; trunc_cfg->num_stones++; } @@ -117,27 +120,29 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) { trunc_cfg->stone_gap = final_stone_gap; err: if ((ret = cursor->close(cursor)) != 0) { - lprintf(cfg, ret, 0, "truncate setup: cursor close failed"); + lprintf(wtperf, ret, 0, "truncate setup: cursor close failed"); } return (ret); } int -run_truncate(CONFIG *cfg, CONFIG_THREAD *thread, - WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp) { - +run_truncate(WTPERF *wtperf, 
WTPERF_THREAD *thread, + WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp) +{ + CONFIG_OPTS *opts; TRUNCATE_CONFIG *trunc_cfg; TRUNCATE_QUEUE_ENTRY *truncate_item; char *next_key; int ret, t_ret; uint64_t used_stone_gap; - ret = 0; + opts = wtperf->opts; trunc_cfg = &thread->trunc_cfg; + ret = 0; *truncatedp = 0; /* Update the total inserts */ - trunc_cfg->total_inserts = sum_insert_ops(cfg); + trunc_cfg->total_inserts = sum_insert_ops(wtperf); trunc_cfg->expected_total += (trunc_cfg->total_inserts - trunc_cfg->last_total_inserts); trunc_cfg->last_total_inserts = trunc_cfg->total_inserts; @@ -170,10 +175,10 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread, while (trunc_cfg->num_stones < trunc_cfg->needed_stones) { trunc_cfg->last_key += used_stone_gap; truncate_item = dcalloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1); - truncate_item->key = dcalloc(cfg->key_sz, 1); - generate_key(cfg, truncate_item->key, trunc_cfg->last_key); + truncate_item->key = dcalloc(opts->key_sz, 1); + generate_key(opts, truncate_item->key, trunc_cfg->last_key); truncate_item->diff = used_stone_gap; - TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q); + TAILQ_INSERT_TAIL(&wtperf->stone_head, truncate_item, q); trunc_cfg->num_stones++; } @@ -182,34 +187,35 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread, trunc_cfg->expected_total <= thread->workload->truncate_count) return (0); - truncate_item = TAILQ_FIRST(&cfg->stone_head); + truncate_item = TAILQ_FIRST(&wtperf->stone_head); trunc_cfg->num_stones--; - TAILQ_REMOVE(&cfg->stone_head, truncate_item, q); + TAILQ_REMOVE(&wtperf->stone_head, truncate_item, q); /* * Truncate the content via a single truncate call or a cursor walk * depending on the configuration. 
*/ - if (cfg->truncate_single_ops) { + if (opts->truncate_single_ops) { while ((ret = cursor->next(cursor)) == 0) { testutil_check(cursor->get_key(cursor, &next_key)); if (strcmp(next_key, truncate_item->key) == 0) break; if ((ret = cursor->remove(cursor)) != 0) { - lprintf(cfg, ret, 0, "Truncate remove: failed"); + lprintf(wtperf, + ret, 0, "Truncate remove: failed"); goto err; } } } else { cursor->set_key(cursor,truncate_item->key); if ((ret = cursor->search(cursor)) != 0) { - lprintf(cfg, ret, 0, "Truncate search: failed"); + lprintf(wtperf, ret, 0, "Truncate search: failed"); goto err; } if ((ret = session->truncate( session, NULL, NULL, cursor, NULL)) != 0) { - lprintf(cfg, ret, 0, "Truncate: failed"); + lprintf(wtperf, ret, 0, "Truncate: failed"); goto err; } } @@ -221,19 +227,20 @@ err: free(truncate_item->key); free(truncate_item); t_ret = cursor->reset(cursor); if (t_ret != 0) - lprintf(cfg, t_ret, 0, "Cursor reset failed"); + lprintf(wtperf, t_ret, 0, "Cursor reset failed"); if (ret == 0 && t_ret != 0) ret = t_ret; return (ret); } void -cleanup_truncate_config(CONFIG *cfg) { +cleanup_truncate_config(WTPERF *wtperf) +{ TRUNCATE_QUEUE_ENTRY *truncate_item; - while (!TAILQ_EMPTY(&cfg->stone_head)) { - truncate_item = TAILQ_FIRST(&cfg->stone_head); - TAILQ_REMOVE(&cfg->stone_head, truncate_item, q); + while (!TAILQ_EMPTY(&wtperf->stone_head)) { + truncate_item = TAILQ_FIRST(&wtperf->stone_head); + TAILQ_REMOVE(&wtperf->stone_head, truncate_item, q); free(truncate_item->key); free(truncate_item); } diff --git a/build_posix/Make.base b/build_posix/Make.base index 4efbe3f76c3..5b945aca5e0 100644 --- a/build_posix/Make.base +++ b/build_posix/Make.base @@ -77,6 +77,9 @@ endif if HAVE_BUILTIN_EXTENSION_ZLIB libwiredtiger_la_LIBADD += ext/compressors/zlib/libwiredtiger_zlib.la endif +if HAVE_BUILTIN_EXTENSION_ZSTD +libwiredtiger_la_LIBADD += ext/compressors/zstd/libwiredtiger_zstd.la +endif libwiredtiger_static_la_LIBADD=$(libwiredtiger_la_LIBADD) 
libwiredtiger_static_la_SOURCES=$(libwiredtiger_la_SOURCES) diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index 0b5175e4196..55941837249 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -11,6 +11,7 @@ ext/compressors/lz4 LZ4 ext/compressors/nop ext/compressors/snappy SNAPPY ext/compressors/zlib ZLIB +ext/compressors/zstd ZSTD ext/datasources/helium HAVE_HELIUM ext/encryptors/nop ext/encryptors/rotn diff --git a/build_posix/aclocal/options.m4 b/build_posix/aclocal/options.m4 index 1f6a1690279..7043430a6d6 100644 --- a/build_posix/aclocal/options.m4 +++ b/build_posix/aclocal/options.m4 @@ -19,10 +19,12 @@ AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_SNAPPY, [Snappy support automatically loaded.]) AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_ZLIB, [Zlib support automatically loaded.]) +AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_ZSTD, + [ZSTD support automatically loaded.]) AC_MSG_CHECKING(if --with-builtins option specified) AC_ARG_WITH(builtins, [AS_HELP_STRING([--with-builtins], - [builtin extension names (lz4, snappy, zlib).])], + [builtin extension names (lz4, snappy, zlib, zstd).])], [with_builtins=$withval], [with_builtins=]) @@ -36,6 +38,8 @@ for builtin_i in $builtin_list; do wt_cv_with_builtin_extension_snappy=yes;; zlib) AC_DEFINE(HAVE_BUILTIN_EXTENSION_ZLIB) wt_cv_with_builtin_extension_zlib=yes;; + zstd) AC_DEFINE(HAVE_BUILTIN_EXTENSION_ZSTD) + wt_cv_with_builtin_extension_zstd=yes;; *) AC_MSG_ERROR([Unknown builtin extension "$builtin_i"]);; esac done @@ -45,6 +49,8 @@ AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_SNAPPY], [test "$wt_cv_with_builtin_extension_snappy" = "yes"]) AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_ZLIB], [test "$wt_cv_with_builtin_extension_zlib" = "yes"]) +AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_ZSTD], + [test "$wt_cv_with_builtin_extension_zstd" = "yes"]) AC_MSG_RESULT($with_builtins) AH_TEMPLATE( @@ -276,4 +282,30 @@ if test "$wt_cv_enable_zlib" = "yes"; then fi AM_CONDITIONAL([ZLIB], [test "$wt_cv_enable_zlib" = "yes"]) 
+AC_MSG_CHECKING(if --enable-zstd option specified) +AC_ARG_ENABLE(zstd, + [AS_HELP_STRING([--enable-zstd], + [Build the zstd compressor extension.])], r=$enableval, r=no) +case "$r" in +no) if test "$wt_cv_with_builtin_extension_zstd" = "yes"; then + wt_cv_enable_zstd=yes + else + wt_cv_enable_zstd=no + fi + ;; +*) if test "$wt_cv_with_builtin_extension_zstd" = "yes"; then + AC_MSG_ERROR( + [Only one of --enable-zstd --with-builtins=zstd allowed]) + fi + wt_cv_enable_zstd=yes;; +esac +AC_MSG_RESULT($wt_cv_enable_zstd) +if test "$wt_cv_enable_zstd" = "yes"; then + AC_CHECK_HEADER(zstd.h,, + [AC_MSG_ERROR([--enable-zstd requires zstd.h])]) + AC_CHECK_LIB(zstd, ZSTD_compress,, + [AC_MSG_ERROR([--enable-zstd requires Zstd library])]) +fi +AM_CONDITIONAL([ZSTD], [test "$wt_cv_enable_zstd" = "yes"]) + ]) diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index 608d602937d..ad00b19a3bb 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -74,7 +74,7 @@ AM_CONDITIONAL([ARM64_HOST], [test "$wt_cv_arm64" = "yes"]) # support compiling the ASM code we have to perform the CRC checks on PowerPC. # To compile with clang we need to override the ASM compiler with CCAS to use # gcc. Unfortunately, doing the compilation in this manner means libtool can't -# determine what tag to use for that one .S file. If we catch that we are using +# determine what tag to use for that one .sx file. If we catch that we are using # two different compilers for CC and CCAS and we are on a PowerPC system we # overload the libtool flags to provide CC by default. if test "$wt_cv_powerpc" = "yes" -a "$CC" != "$CCAS"; then diff --git a/build_win/wiredtiger_config.h b/build_win/wiredtiger_config.h index 83ddc6eb194..78d2784cb70 100644 --- a/build_win/wiredtiger_config.h +++ b/build_win/wiredtiger_config.h @@ -19,6 +19,9 @@ /* Zlib support automatically loaded. */ /* #undef HAVE_BUILTIN_EXTENSION_ZLIB */ +/* ZSTD support automatically loaded. 
*/ +/* #undef HAVE_BUILTIN_EXTENSION_ZSTD */ + /* Define to 1 if you have the `clock_gettime' function. */ /* #undef HAVE_CLOCK_GETTIME */ @@ -70,6 +73,9 @@ /* Define to 1 if you have the `z' library (-lz). */ /* #undef HAVE_LIBZ */ +/* Define to 1 if you have the `zstd' library (-lzstd). */ +/* #undef HAVE_LIBZSTD */ + /* Define to 1 if you have the <memory.h> header file. */ /* #undef HAVE_MEMORY_H */ diff --git a/dist/api_data.py b/dist/api_data.py index 9781e58a807..7affc58a217 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -136,8 +136,8 @@ file_config = format_meta + [ configure a compressor for file blocks. Permitted values are \c "none" or custom compression engine name created with WT_CONNECTION::add_compressor. If WiredTiger has builtin support for - \c "snappy", \c "lz4" or \c "zlib" compression, these names are also - available. See @ref compression for more information'''), + \c "lz4", \c "snappy", \c "zlib" or \c "zstd" compression, these names + are also available. See @ref compression for more information'''), Config('cache_resident', 'false', r''' do not ever evict the object's pages from cache. Not compatible with LSM tables; see @ref tuning_cache_resident for more information''', @@ -183,6 +183,12 @@ file_config = format_meta + [ configure Huffman encoding for values. Permitted values are \c "none", \c "english", \c "utf8<file>" or \c "utf16<file>". See @ref huffman for more information'''), + Config('ignore_in_memory_cache_size', 'false', r''' + allow update and insert operations to proceed even if the cache is + already at capacity. Only valid in conjunction with in-memory + databases. 
Should be used with caution - this configuration allows + WiredTiger to consume memory over the configured cache limit''', + type='boolean'), Config('internal_key_truncate', 'true', r''' configure internal key truncation, discarding unnecessary trailing bytes on internal keys (ignored for custom @@ -410,13 +416,13 @@ connection_runtime_config = [ Config('eviction_dirty_target', '5', r''' perform eviction in worker threads when the cache contains at least this much dirty content, expressed as a percentage of the total cache - size. Ignored if \c in_memory is \c true''', + size.''', min=1, max=99), Config('eviction_dirty_trigger', '20', r''' trigger application threads to perform eviction when the cache contains at least this much dirty content, expressed as a percentage of the total cache size. This setting only alters behavior if it is lower than - eviction_trigger. Ignored if \c in_memory is \c true''', + eviction_trigger''', min=1, max=99), Config('eviction_target', '80', r''' perform eviction in worker threads when the cache contains at least @@ -496,7 +502,8 @@ connection_runtime_config = [ is used to gather statistics, as well as each time statistics are logged using the \c statistics_log configuration. See @ref statistics for more information''', - type='list', choices=['all', 'fast', 'none', 'clear']), + type='list', + choices=['all', 'cache_walk', 'fast', 'none', 'clear', 'tree_walk']), Config('verbose', '', r''' enable messages for various events. Only available if WiredTiger is configured with --enable-verbose. Options are given as a @@ -563,8 +570,9 @@ wiredtiger_open_log_configuration = [ configure a compressor for log records. Permitted values are \c "none" or custom compression engine name created with WT_CONNECTION::add_compressor. If WiredTiger has builtin support - for \c "snappy", \c "lz4" or \c "zlib" compression, these names - are also available. 
See @ref compression for more information'''), + for \c "lz4", \c "snappy", \c "zlib" or \c "zstd" compression, + these names are also available. See @ref compression for more + information'''), Config('file_max', '100MB', r''' the maximum size of log files''', min='100KB', max='2GB'), @@ -970,7 +978,8 @@ methods = { gathering them, where appropriate (for example, a cache size statistic is not cleared, while the count of cursor insert operations will be cleared). See @ref statistics for more information''', - type='list', choices=['all', 'fast', 'clear', 'size']), + type='list', + choices=['all', 'cache_walk', 'fast', 'clear', 'size', 'tree_walk']), Config('target', '', r''' if non-empty, backup the list of objects; valid only for a backup data source''', diff --git a/dist/filelist b/dist/filelist index 19fa1122a27..fe9a17b7799 100644 --- a/dist/filelist +++ b/dist/filelist @@ -48,12 +48,12 @@ src/btree/row_modify.c src/btree/row_srch.c src/cache/cache_las.c src/checksum/arm64/crc32-arm64.c ARM64_HOST -src/checksum/power8/crc32.S POWERPC_HOST +src/checksum/power8/crc32.sx POWERPC_HOST src/checksum/power8/crc32_wrapper.c POWERPC_HOST src/checksum/software/checksum.c src/checksum/x86/crc32-x86.c X86_HOST src/checksum/zseries/crc32-s390x.c ZSERIES_HOST -src/checksum/zseries/crc32le-vx.S ZSERIES_HOST +src/checksum/zseries/crc32le-vx.sx ZSERIES_HOST src/config/config.c src/config/config_api.c src/config/config_check.c @@ -90,6 +90,7 @@ src/cursor/cur_table.c src/evict/evict_file.c src/evict/evict_lru.c src/evict/evict_page.c +src/evict/evict_stat.c src/log/log.c src/log/log_auto.c src/log/log_slot.c diff --git a/dist/flags.py b/dist/flags.py index 93b6e0cbbf4..e200f95fba6 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -133,6 +133,16 @@ flags = { 'SESSION_QUIET_CORRUPT_FILE', 'SESSION_SERVER_ASYNC', ], + 'stat' : [ + 'STAT_CLEAR', + 'STAT_JSON', + 'STAT_ON_CLOSE', + 'STAT_TYPE_ALL', + 'STAT_TYPE_CACHE_WALK', + 'STAT_TYPE_FAST', + 'STAT_TYPE_SIZE', + 
'STAT_TYPE_TREE_WALK', + ], } flag_cnt = {} # Dictionary [flag] : [reference count] diff --git a/dist/s_define.list b/dist/s_define.list index 6a1a32004ea..7b11d665de5 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -39,14 +39,18 @@ WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT WT_SESSION_LOCKED_TURTLE -WT_STATS_FIELD_TO_SLOT +WT_STATS_FIELD_TO_OFFSET WT_STATS_SLOT_ID WT_STAT_CONN_DECRV WT_STAT_DATA_DECRV WT_STAT_DECR WT_STAT_DECRV WT_STAT_DECRV_ATOMIC +WT_STAT_DECRV_ATOMIC_BASE +WT_STAT_DECRV_BASE WT_STAT_INCRV_ATOMIC +WT_STAT_INCRV_ATOMIC_BASE +WT_STAT_INCRV_BASE WT_STAT_WRITE WT_TIMEDIFF_US WT_TRET_ERROR_OK diff --git a/dist/s_docs b/dist/s_docs index e2b1d2aed11..b4f449fa093 100755 --- a/dist/s_docs +++ b/dist/s_docs @@ -1,7 +1,7 @@ #! /bin/sh t=__wt.$$ -trap 'rm -f $t /tmp/__doxy' 0 1 2 3 13 15 +trap 'rm -f $t' 0 1 2 3 13 15 # Skip this when building release packages: docs are built separately test -n "$WT_RELEASE_BUILD" && exit 0 @@ -30,18 +30,22 @@ wtperf_config() { # The Linux ed command writes line numbers to stderr, redirect both # stdout and stderr to keep things quiet. - cc -o /tmp/__doxy ../bench/wtperf/doxy.c && + # + # The OS X cpp program injects line number output in the middle of lines + # and doesn't stringify #XXX entries; use the -E option to the compiler + # instead. + cat ../bench/wtperf/wtperf_opt.i | + ${CC:-cc} -E -DOPT_DEFINE_DOXYGEN - | python wtperf_config.py > $t (echo '/START_AUTO_GENERATED_WTPERF_CONFIGURATION/+3,/STOP_AUTO_GENERATED_WTPERF_CONFIGURATION/-1d' echo 'i' echo '' echo '.' - echo ".r !/tmp/__doxy" + echo ".r $t" echo 'a' echo '' echo '.' 
echo 'w' - echo 'q') | ed ../src/docs/wtperf.dox 1>/dev/null 2>/dev/null && - rm -f /tmp/__doxy + echo 'q') | ed ../src/docs/wtperf.dox 1>/dev/null 2>/dev/null } structurechk() diff --git a/dist/s_export b/dist/s_export index dc69238b270..b8e42c970f9 100755 --- a/dist/s_export +++ b/dist/s_export @@ -26,7 +26,7 @@ check() sort | uniq -u | egrep -v \ - 'zlib_extension_init|lz4_extension_init|snappy_extension_init' > $t + 'lz4_extension_init|snappy_extension_init|zlib_extension_init|zstd_extension_init' > $t test -s $t && { echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" diff --git a/dist/s_stat b/dist/s_stat index 935c7e1fb43..5d5937e1833 100755 --- a/dist/s_stat +++ b/dist/s_stat @@ -20,6 +20,25 @@ search=`sed \ -e d ../src/include/stat.h | sort` +# There are some fields that are used, but we can't detect it. +cat << UNUSED_STAT_FIELDS +lock_checkpoint_count +lock_checkpoint_wait_application +lock_checkpoint_wait_internal +lock_handle_list_count +lock_handle_list_wait_application +lock_handle_list_wait_internal +lock_metadata_count +lock_metadata_wait_application +lock_metadata_wait_internal +lock_schema_count +lock_schema_wait_application +lock_schema_wait_internal +lock_table_count +lock_table_wait_application +lock_table_wait_internal +UNUSED_STAT_FIELDS + echo "$search" fgrep -who "$search" $l) | sort | uniq -u > $t diff --git a/dist/s_string.ok b/dist/s_string.ok index 1887cbd936f..7cf96aec399 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -18,6 +18,7 @@ ASYNC Addr Ailamaki Alakuijala +Alexandrescu's Alloc Async Athanassoulis @@ -59,6 +60,7 @@ COVERITY CPUs CRC CSV +CStream CURSORs CURSTD CallsCustDate @@ -68,6 +70,7 @@ Checksum Checksums CityHash CloseHandle +Collet Comparator Config Coverity @@ -124,6 +127,7 @@ FORALL FOREACH FS FULLFSYNC +Facebook FindClose FindFirstFile Fixup @@ -165,6 +169,7 @@ INSN INTL ISA ITEMs +ITER InitializeCriticalSectionAndSpinCount Inline Intra @@ -372,6 +377,7 @@ WILLNEED WIREDTIGER WRLSN WRNOLOCK +WTPERF 
WaitForSingleObject WakeAllConditionVariable Wconditional @@ -395,6 +401,12 @@ WriteFile Wuninitialized Wunused XP +Yann +ZSTD +Zlib +Zlib's +Zstd +Zstd's abcdef abcdefghijklmnopqrstuvwxyz addl @@ -416,6 +428,7 @@ argc args argv asm +assertfmt async asyncopp asyncops @@ -511,6 +524,7 @@ collatorp comparator comparep compat +compressStream concat cond conf @@ -530,6 +544,7 @@ cp cpuid crc create's +createCStream crypto cryptobad csv @@ -622,6 +637,7 @@ emp encodings encryptor encryptors +endStream endian english enqueue @@ -749,6 +765,7 @@ infeasible inflateInit infmt init +initCStream initializers initn initsize @@ -784,6 +801,7 @@ isupper isxdigit iter iteratively +iters jnr jrx json @@ -849,6 +867,7 @@ majorp malloc marshall marshalled +maxCLevel maxcpu maxdbs mbll @@ -989,12 +1008,14 @@ qdown qrrSS qsort quartile +queueable qup rN rS rb rbrace rbracket +rcursor rdonly rduppo readlock @@ -1201,6 +1222,7 @@ waitpid walk's warmup wb +wcursor wiredTiger wiredtiger workFactor @@ -1222,6 +1244,10 @@ xxxx xxxxx xxxxxx zalloc +zf zfree zlib +zlib's +zstd +zstd's zu diff --git a/dist/s_style b/dist/s_style index 3860a23b991..8e755224ee2 100755 --- a/dist/s_style +++ b/dist/s_style @@ -108,7 +108,7 @@ else ! expr "$f" : 'test/.*' > /dev/null && ! expr "$f" : '.*/utilities/.*' > /dev/null; then if ! 
expr "$f" : '.*/os_alloc.c' > /dev/null && - egrep '[[:space:]]free[(]|[[:space:]]strdup[(]|[[:space:]]strndup[(]|[[:space:]]malloc[(]|[[:space:]]calloc[(]|[[:space:]]realloc[(]' $f > $t; then + egrep '[[:space:]]free[(]|[[:space:]]strdup[(]|[[:space:]]strndup[(]|[[:space:]]malloc[(]|[[:space:]]calloc[(]|[[:space:]]realloc[(]|[[:space:]]sprintf[(]' $f > $t; then test -s $t && { echo "$f: call to illegal function" cat $t diff --git a/dist/s_void b/dist/s_void index f7bfbcc7e8e..e5e9f97c0b7 100644 --- a/dist/s_void +++ b/dist/s_void @@ -96,10 +96,13 @@ func_ok() -e '/int wiredtiger_extension_init$/d' \ -e '/int wiredtiger_extension_terminate$/d' \ -e '/int wiredtiger_pack_close$/d' \ - -e '/int wt_snappy_pre_size$/d' \ - -e '/int wt_snappy_terminate$/d' \ + -e '/int snappy_pre_size$/d' \ + -e '/int snappy_terminate$/d' \ -e '/int zlib_error$/d' \ - -e '/int zlib_terminate$/d' + -e '/int zlib_terminate$/d' \ + -e '/int zstd_error$/d' \ + -e '/int zstd_pre_size$/d' \ + -e '/int zstd_terminate$/d' } # Complain about functions which return an "int" but which don't return except diff --git a/dist/stat.py b/dist/stat.py index c3c85bbe9b4..e42585c1b8c 100644 --- a/dist/stat.py +++ b/dist/stat.py @@ -42,8 +42,11 @@ compare_srcfile(tmp_file, '../src/include/stat.h') def print_defines_one(capname, base, stats): for v, l in enumerate(stats, base): desc = l.desc - if 'all_only' in l.flags: - desc += ', only reported if statistics=all is set' + if 'cache_walk' in l.flags: + desc += \ + ', only reported if cache_walk or all statistics are enabled' + if 'tree_walk' in l.flags: + desc += ', only reported if tree_walk or all statistics are enabled' if len(textwrap.wrap(desc, 70)) > 1: f.write('/*!\n') f.write(' * %s\n' % '\n * '.join(textwrap.wrap(desc, 70))) diff --git a/dist/stat_data.py b/dist/stat_data.py index 5087afa44dc..bcf5201bd90 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -9,7 +9,8 @@ # # Data-source statistics are normally aggregated across the set of 
underlying # objects. Additional optional configuration flags are available: -# all_only Only gets reported when statistics=all set +# cache_walk Only reported when statistics=cache_walk is set +# tree_walk Only reported when statistics=tree_walk is set # max_aggregate Take the maximum value when aggregating statistics # no_clear Value not cleared when statistics cleared # no_scale Don't scale value per second in the logging tool script @@ -46,6 +47,11 @@ class CacheStat(Stat): prefix = 'cache' def __init__(self, name, desc, flags=''): Stat.__init__(self, name, CacheStat.prefix, desc, flags) +class CacheWalkStat(Stat): + prefix = 'cache_walk' + def __init__(self, name, desc, flags=''): + flags += ',cache_walk' + Stat.__init__(self, name, CacheWalkStat.prefix, desc, flags) class CompressStat(Stat): prefix = 'compression' def __init__(self, name, desc, flags=''): @@ -66,6 +72,10 @@ class JoinStat(Stat): prefix = '' # prefix is inserted dynamically def __init__(self, name, desc, flags=''): Stat.__init__(self, name, JoinStat.prefix, desc, flags) +class LockStat(Stat): + prefix = 'lock' + def __init__(self, name, desc, flags=''): + Stat.__init__(self, name, LockStat.prefix, desc, flags) class LogStat(Stat): prefix = 'log' def __init__(self, name, desc, flags=''): @@ -105,11 +115,16 @@ groups['cursor'] = [CursorStat.prefix, SessionStat.prefix] groups['evict'] = [ BlockStat.prefix, CacheStat.prefix, + CacheWalkStat.prefix, ConnStat.prefix, ThreadStat.prefix ] groups['lsm'] = [LSMStat.prefix, TxnStat.prefix] -groups['memory'] = [CacheStat.prefix, ConnStat.prefix, RecStat.prefix] +groups['memory'] = [ + CacheStat.prefix, + CacheWalkStat.prefix, + ConnStat.prefix, + RecStat.prefix] groups['system'] = [ ConnStat.prefix, DhandleStat.prefix, @@ -226,13 +241,32 @@ connection_stats = [ CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'), CacheStat('cache_pages_requested', 'pages requested from the cache'), CacheStat('cache_read', 'pages read 
into cache'), + CacheStat('cache_read_app_count', 'application threads page read from disk to cache count'), + CacheStat('cache_read_app_time', 'application threads page read from disk to cache time (usecs)'), CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), + CacheStat('cache_write_app_count', 'application threads page write from cache to disk count'), + CacheStat('cache_write_app_time', 'application threads page write from cache to disk time (usecs)'), CacheStat('cache_write_lookaside', 'page written requiring lookaside records'), CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## + # Cursor operations + ########################################## + CursorStat('cursor_create', 'cursor create calls'), + CursorStat('cursor_insert', 'cursor insert calls'), + CursorStat('cursor_next', 'cursor next calls'), + CursorStat('cursor_prev', 'cursor prev calls'), + CursorStat('cursor_remove', 'cursor remove calls'), + CursorStat('cursor_reset', 'cursor reset calls'), + CursorStat('cursor_restart', 'cursor restarted searches'), + CursorStat('cursor_search', 'cursor search calls'), + CursorStat('cursor_search_near', 'cursor search near calls'), + CursorStat('cursor_truncate', 'truncate calls'), + CursorStat('cursor_update', 'cursor update calls'), + + ########################################## # Dhandle statistics ########################################## DhandleStat('dh_conn_handle_count', 'connection data handles currently active', 'no_clear,no_scale'), @@ -245,6 +279,25 @@ connection_stats = [ DhandleStat('dh_sweeps', 'connection sweeps'), ########################################## + # Locking statistics + ########################################## + LockStat('lock_checkpoint_count', 'checkpoint lock acquisitions'), + 
LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'), + LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'), + LockStat('lock_handle_list_count', 'handle-list lock acquisitions'), + LockStat('lock_handle_list_wait_application', 'handle-list lock application thread wait time (usecs)'), + LockStat('lock_handle_list_wait_internal', 'handle-list lock internal thread wait time (usecs)'), + LockStat('lock_metadata_count', 'metadata lock acquisitions'), + LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'), + LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'), + LockStat('lock_schema_count', 'schema lock acquisitions'), + LockStat('lock_schema_wait_application', 'schema lock application thread wait time (usecs)'), + LockStat('lock_schema_wait_internal', 'schema lock internal thread wait time (usecs)'), + LockStat('lock_table_count', 'table lock acquisitions'), + LockStat('lock_table_wait_application', 'table lock application thread time waiting for the table lock (usecs)'), + LockStat('lock_table_wait_internal', 'table lock internal thread time waiting for the table lock (usecs)'), + + ########################################## # Logging statistics ########################################## LogStat('log_buffer_size', 'total log buffer size', 'no_clear,no_scale,size'), @@ -286,41 +339,6 @@ connection_stats = [ LogStat('log_zero_fills', 'log files manually zero-filled'), ########################################## - # Reconciliation statistics - ########################################## - RecStat('rec_page_delete', 'pages deleted'), - RecStat('rec_page_delete_fast', 'fast-path pages deleted'), - RecStat('rec_pages', 'page reconciliation calls'), - RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), - RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 
'no_clear,no_scale,size'), - RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'), - - ########################################## - # Transaction statistics - ########################################## - TxnStat('txn_begin', 'transaction begins'), - TxnStat('txn_checkpoint', 'transaction checkpoints'), - TxnStat('txn_checkpoint_fsync_post', 'transaction fsync calls for checkpoint after allocating the transaction ID'), - TxnStat('txn_checkpoint_fsync_post_duration', 'transaction fsync duration for checkpoint after allocating the transaction ID (usecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_scrub_target', 'transaction checkpoint scrub dirty target', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_scrub_time', 'transaction checkpoint scrub time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_min', 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_recent', 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_commit', 'transactions committed'), - TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), - TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'), - TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), - TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), - TxnStat('txn_rollback', 'transactions rolled back'), - 
TxnStat('txn_snapshots_created', 'number of named snapshots created'), - TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'), - TxnStat('txn_sync', 'transaction sync calls'), - - ########################################## # LSM statistics ########################################## LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), @@ -335,6 +353,16 @@ connection_stats = [ LSMStat('lsm_work_units_done', 'tree maintenance operations executed'), ########################################## + # Reconciliation statistics + ########################################## + RecStat('rec_page_delete', 'pages deleted'), + RecStat('rec_page_delete_fast', 'fast-path pages deleted'), + RecStat('rec_pages', 'page reconciliation calls'), + RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), + RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale,size'), + RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'), + + ########################################## # Session operations ########################################## SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), @@ -357,21 +385,6 @@ connection_stats = [ SessionStat('session_table_verify_success', 'table verify successful calls', 'no_clear,no_scale'), ########################################## - # Total cursor operations - ########################################## - CursorStat('cursor_create', 'cursor create calls'), - CursorStat('cursor_insert', 'cursor insert calls'), - CursorStat('cursor_next', 'cursor next calls'), - CursorStat('cursor_prev', 'cursor prev calls'), - CursorStat('cursor_remove', 'cursor remove calls'), - CursorStat('cursor_reset', 'cursor reset calls'), - CursorStat('cursor_restart', 'cursor restarted searches'), - CursorStat('cursor_search', 'cursor search calls'), - CursorStat('cursor_search_near', 'cursor search near calls'), - 
CursorStat('cursor_truncate', 'truncate calls'), - CursorStat('cursor_update', 'cursor update calls'), - - ########################################## # Thread Count statistics ########################################## ThreadStat('thread_fsync_active', 'active filesystem fsync calls','no_clear,no_scale'), @@ -379,8 +392,36 @@ connection_stats = [ ThreadStat('thread_write_active', 'active filesystem write calls','no_clear,no_scale'), ########################################## + # Transaction statistics + ########################################## + TxnStat('txn_begin', 'transaction begins'), + TxnStat('txn_checkpoint', 'transaction checkpoints'), + TxnStat('txn_checkpoint_fsync_post', 'transaction fsync calls for checkpoint after allocating the transaction ID'), + TxnStat('txn_checkpoint_fsync_post_duration', 'transaction fsync duration for checkpoint after allocating the transaction ID (usecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_scrub_target', 'transaction checkpoint scrub dirty target', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_scrub_time', 'transaction checkpoint scrub time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_skipped', 'transaction checkpoints skipped because database was clean'), + TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_min', 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_recent', 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_commit', 'transactions committed'), + TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), + 
TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'), + TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), + TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), + TxnStat('txn_rollback', 'transactions rolled back'), + TxnStat('txn_snapshots_created', 'number of named snapshots created'), + TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'), + TxnStat('txn_sync', 'transaction sync calls'), + + ########################################## # Yield statistics ########################################## + YieldStat('application_cache_time', 'application thread time waiting for cache (usecs)'), + YieldStat('application_evict_time', 'application thread time evicting (usecs)'), YieldStat('page_busy_blocked', 'page acquire busy blocked'), YieldStat('page_forcible_evict_blocked', 'page acquire eviction blocked'), YieldStat('page_locked_blocked', 'page acquire locked blocked'), @@ -395,41 +436,30 @@ connection_stats = sorted(connection_stats, key=attrgetter('desc')) ########################################## dsrc_stats = [ ########################################## - # Session operations - ########################################## - SessionStat('session_compact', 'object compaction'), - SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), - - ########################################## - # Cursor operations + # Block manager statistics ########################################## - CursorStat('cursor_create', 'create calls'), - CursorStat('cursor_insert', 'insert calls'), - CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), - CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted', 'size'), - CursorStat('cursor_next', 'next calls'), - CursorStat('cursor_prev', 'prev calls'), - CursorStat('cursor_remove', 'remove 
calls'), - CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed', 'size'), - CursorStat('cursor_reset', 'reset calls'), - CursorStat('cursor_restart', 'restarted searches'), - CursorStat('cursor_search', 'search calls'), - CursorStat('cursor_search_near', 'search near calls'), - CursorStat('cursor_truncate', 'truncate calls'), - CursorStat('cursor_update', 'update calls'), - CursorStat('cursor_update_bytes', 'cursor-update value bytes updated', 'size'), + BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale,size'), + BlockStat('block_alloc', 'blocks allocated'), + BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale,size'), + BlockStat('block_extension', 'allocations requiring file extension'), + BlockStat('block_free', 'blocks freed'), + BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'), + BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'), + BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'), + BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'no_scale,size'), + BlockStat('block_size', 'file size in bytes', 'no_scale,size'), ########################################## # Btree statistics ########################################## BtreeStat('btree_checkpoint_generation', 'btree checkpoint generation', 'no_clear,no_scale'), - BtreeStat('btree_column_deleted', 'column-store variable-size deleted values', 'no_scale,all_only'), - BtreeStat('btree_column_fix', 'column-store fixed-size leaf pages', 'no_scale,all_only'), - BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale,all_only'), - BtreeStat('btree_column_rle', 'column-store variable-size RLE encoded values', 'no_scale,all_only'), - BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale,all_only'), + BtreeStat('btree_column_deleted', 'column-store variable-size deleted values', 'no_scale,tree_walk'), + 
BtreeStat('btree_column_fix', 'column-store fixed-size leaf pages', 'no_scale,tree_walk'), + BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale,tree_walk'), + BtreeStat('btree_column_rle', 'column-store variable-size RLE encoded values', 'no_scale,tree_walk'), + BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale,tree_walk'), BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'), - BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale,all_only'), + BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale,tree_walk'), BtreeStat('btree_fixed_len', 'fixed-record size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maximum_depth', 'maximum tree depth', 'max_aggregate,no_scale'), BtreeStat('btree_maxintlkey', 'maximum internal page key size', 'max_aggregate,no_scale,size'), @@ -437,39 +467,9 @@ dsrc_stats = [ BtreeStat('btree_maxleafkey', 'maximum leaf page key size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maxleafvalue', 'maximum leaf page value size', 'max_aggregate,no_scale,size'), - BtreeStat('btree_overflow', 'overflow pages', 'no_scale,all_only'), - BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale,all_only'), - BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale,all_only'), - - ########################################## - # LSM statistics - ########################################## - LSMStat('bloom_count', 'bloom filters in the LSM tree', 'no_scale'), - LSMStat('bloom_false_positive', 'bloom filter false positives'), - LSMStat('bloom_hit', 'bloom filter hits'), - LSMStat('bloom_miss', 'bloom filter misses'), - LSMStat('bloom_page_evict', 'bloom filter pages evicted from cache'), - LSMStat('bloom_page_read', 'bloom filter pages read into cache'), - LSMStat('bloom_size', 'total size of bloom filters', 'no_scale,size'), - 
LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), - LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'), - LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), - LSMStat('lsm_lookup_no_bloom', 'queries that could have benefited from a Bloom filter that did not exist'), - LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'), - - ########################################## - # Block manager statistics - ########################################## - BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale,size'), - BlockStat('block_alloc', 'blocks allocated'), - BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale,size'), - BlockStat('block_extension', 'allocations requiring file extension'), - BlockStat('block_free', 'blocks freed'), - BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'), - BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'), - BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'), - BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'no_scale,size'), - BlockStat('block_size', 'file size in bytes', 'no_scale,size'), + BtreeStat('btree_overflow', 'overflow pages', 'no_scale,tree_walk'), + BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale,tree_walk'), + BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale,tree_walk'), ########################################## # Cache and eviction statistics @@ -498,6 +498,28 @@ dsrc_stats = [ CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## + # Cache content statistics + ########################################## + CacheWalkStat('cache_state_avg_written_size', 'Average on-disk page image size seen', 'no_clear,no_scale'), + CacheWalkStat('cache_state_gen_avg_gap', 'Average difference between current eviction 
generation when the page was last considered', 'no_clear,no_scale'), + CacheWalkStat('cache_state_gen_current', 'Current eviction generation', 'no_clear,no_scale'), + CacheWalkStat('cache_state_gen_max_gap', 'Maximum difference between current eviction generation when the page was last considered', 'no_clear,no_scale'), + CacheWalkStat('cache_state_max_pagesize', 'Maximum page size seen', 'no_clear,no_scale'), + CacheWalkStat('cache_state_memory', 'Pages created in memory and never written', 'no_clear,no_scale'), + CacheWalkStat('cache_state_min_written_size', 'Minimum on-disk page image size seen', 'no_clear,no_scale'), + CacheWalkStat('cache_state_not_queueable', 'Pages that could not be queued for eviction', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages', 'Total number of pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_clean', 'Clean pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_dirty', 'Dirty pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_internal', 'Internal pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_leaf', 'Leaf pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_queued', 'Pages currently queued for eviction', 'no_clear,no_scale'), + CacheWalkStat('cache_state_refs_skipped', 'Refs skipped during cache traversal', 'no_clear,no_scale'), + CacheWalkStat('cache_state_root_entries', 'Entries in the root page', 'no_clear,no_scale'), + CacheWalkStat('cache_state_root_size', 'Size of the root page', 'no_clear,no_scale'), + CacheWalkStat('cache_state_smaller_alloc_size', 'On-disk page image sizes smaller than a single allocation unit', 'no_clear,no_scale'), + + ########################################## # Compression statistics ########################################## CompressStat('compress_raw_fail', 'raw compression call failed, no additional data available'), @@ -509,6 +531,41 
@@ dsrc_stats = [ CompressStat('compress_write_too_small', 'page written was too small to compress'), ########################################## + # Cursor operations + ########################################## + CursorStat('cursor_create', 'create calls'), + CursorStat('cursor_insert', 'insert calls'), + CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), + CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted', 'size'), + CursorStat('cursor_next', 'next calls'), + CursorStat('cursor_prev', 'prev calls'), + CursorStat('cursor_remove', 'remove calls'), + CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed', 'size'), + CursorStat('cursor_reset', 'reset calls'), + CursorStat('cursor_restart', 'restarted searches'), + CursorStat('cursor_search', 'search calls'), + CursorStat('cursor_search_near', 'search near calls'), + CursorStat('cursor_truncate', 'truncate calls'), + CursorStat('cursor_update', 'update calls'), + CursorStat('cursor_update_bytes', 'cursor-update value bytes updated', 'size'), + + ########################################## + # LSM statistics + ########################################## + LSMStat('bloom_count', 'bloom filters in the LSM tree', 'no_scale'), + LSMStat('bloom_false_positive', 'bloom filter false positives'), + LSMStat('bloom_hit', 'bloom filter hits'), + LSMStat('bloom_miss', 'bloom filter misses'), + LSMStat('bloom_page_evict', 'bloom filter pages evicted from cache'), + LSMStat('bloom_page_read', 'bloom filter pages read into cache'), + LSMStat('bloom_size', 'total size of bloom filters', 'no_scale,size'), + LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), + LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'), + LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), + LSMStat('lsm_lookup_no_bloom', 'queries that could have benefited from a Bloom filter that did not exist'), + 
LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'), + + ########################################## # Reconciliation statistics ########################################## RecStat('rec_dictionary', 'dictionary matches'), @@ -527,6 +584,12 @@ dsrc_stats = [ RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression', 'size'), ########################################## + # Session operations + ########################################## + SessionStat('session_compact', 'object compaction'), + SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), + + ########################################## # Transaction statistics ########################################## TxnStat('txn_update_conflict', 'update conflicts'), diff --git a/dist/wtperf_config.py b/dist/wtperf_config.py new file mode 100644 index 00000000000..72256ed5527 --- /dev/null +++ b/dist/wtperf_config.py @@ -0,0 +1,25 @@ +# Output a doxgen version of the wtperf configuration options. +import string, sys + +for line in sys.stdin: + if not line.startswith('OPTION '): + continue + + line = line.replace('OPTION ', '') + v = line.split('",') + v[0] = v[0].replace('"', '').strip() + v[1] = v[1].replace('"', '').strip() + v[2] = v[2].replace('"', '').strip() + v[3] = v[3].replace('"', '').strip() + + if v[3] == 'boolean': + if v[2] == '0': + d = 'false' + else: + d = 'true' + elif v[3] == 'string': + d = '"' + v[2] + '"' + else: + d = v[2] + print '@par ' + v[0] + ' (' + v[3] + ', default=' + d + ')' + print v[1] diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index a2042c22bbb..ea646604a76 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -611,6 +611,13 @@ session_ops(WT_SESSION *session) "block_compressor=zlib,key_format=S,value_format=S"); /*! [Create a zlib compressed table] */ ret = session->drop(session, "table:mytable", NULL); + + /*! 
[Create a zstd compressed table] */ + ret = session->create(session, + "table:mytable", + "block_compressor=zstd,key_format=S,value_format=S"); + /*! [Create a zstd compressed table] */ + ret = session->drop(session, "table:mytable", NULL); #endif /*! [Configure checksums to uncompressed] */ @@ -1108,6 +1115,32 @@ main(void) if (ret == 0) (void)conn->close(conn, NULL); + /*! [Configure zlib extension with compression level] */ + ret = wiredtiger_open(home, NULL, + "create," + "extensions=[/usr/local/lib/" + "libwiredtiger_zlib.so=[config=[compression_level=3]]]", &conn); + /*! [Configure zlib extension with compression level] */ + if (ret == 0) + (void)conn->close(conn, NULL); + + /*! [Configure zstd extension] */ + ret = wiredtiger_open(home, NULL, + "create," + "extensions=[/usr/local/lib/libwiredtiger_zstd.so]", &conn); + /*! [Configure zstd extension] */ + if (ret == 0) + (void)conn->close(conn, NULL); + + /*! [Configure zstd extension with compression level] */ + ret = wiredtiger_open(home, NULL, + "create," + "extensions=[/usr/local/lib/" + "libwiredtiger_zstd.so=[config=[compression_level=9]]]", &conn); + /*! [Configure zstd extension with compression level] */ + if (ret == 0) + (void)conn->close(conn, NULL); + /* * This example code gets run, and direct I/O might not be available, * causing the open to fail. The documentation requires code snippets, diff --git a/examples/java/com/wiredtiger/examples/ex_all.java b/examples/java/com/wiredtiger/examples/ex_all.java index 83a37e9a6a5..cf8491aa4f8 100644 --- a/examples/java/com/wiredtiger/examples/ex_all.java +++ b/examples/java/com/wiredtiger/examples/ex_all.java @@ -549,6 +549,12 @@ session_ops(Session session) "block_compressor=zlib,key_format=S,value_format=S"); /*! [Create a zlib compressed table] */ ret = session.drop("table:mytable", null); + + /*! [Create a zstd compressed table] */ + ret = session.create("table:mytable", + "block_compressor=zstd,key_format=S,value_format=S"); + /*! 
[Create a zstd compressed table] */ + ret = session.drop("table:mytable", null); } // if (false) /*! [Configure checksums to uncompressed] */ @@ -942,6 +948,29 @@ allExample() /*! [Configure zlib extension] */ conn.close(null); + /*! [Configure zlib extension with compression level] */ + conn = wiredtiger.open(home, + "create," + + "extensions=[/usr/local/lib/" + + "libwiredtiger_zlib.so=[config=[compression_level=3]]]"); + /*! [Configure zlib extension with compression level] */ + conn.close(null); + + /*! [Configure zstd extension] */ + conn = wiredtiger.open(home, + "create," + + "extensions=[/usr/local/lib/libwiredtiger_zstd.so]"); + /*! [Configure zstd extension] */ + conn.close(null); + + /*! [Configure zstd extension with compression level] */ + conn = wiredtiger.open(home, + "create," + + "extensions=[/usr/local/lib/" + + "libwiredtiger_zstd.so=[config=[compression_level=9]]]"); + /*! [Configure zstd extension with compression level] */ + conn.close(null); + /* * This example code gets run, and direct I/O might not be available, * causing the open to fail. The documentation requires code snippets, diff --git a/ext/compressors/lz4/lz4_compress.c b/ext/compressors/lz4/lz4_compress.c index 35159d0fa76..885701e564b 100644 --- a/ext/compressors/lz4/lz4_compress.c +++ b/ext/compressors/lz4/lz4_compress.c @@ -31,10 +31,20 @@ #include <stdlib.h> #include <string.h> +/* + * We need to include the configuration file to detect whether this extension + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. + */ #include <wiredtiger_config.h> + #include <wiredtiger.h> #include <wiredtiger_ext.h> +#ifdef _MSC_VER +#define inline __inline +#endif + /* Local compressor structure. 
*/ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ @@ -171,8 +181,6 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, int decoded; uint8_t *dst_tmp; - (void)src_len; /* Unused parameters */ - wt_api = ((LZ4_COMPRESSOR *)compressor)->wt_api; /* @@ -183,6 +191,13 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, #ifdef WORDS_BIGENDIAN lz4_prefix_swap(&prefix); #endif + if (prefix.compressed_len + sizeof(LZ4_PREFIX) > src_len) { + (void)wt_api->err_printf(wt_api, + session, + "WT_COMPRESSOR.decompress: stored size exceeds source " + "size"); + return (WT_ERROR); + } /* * Decompress, starting after the prefix bytes. Use safe decompression: @@ -267,18 +282,24 @@ lz4_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, size_t *result_lenp, uint32_t *result_slotsp) { LZ4_PREFIX prefix; - int lz4_len; uint32_t slot; - int sourceSize, targetDestSize; + int lz4_len, sourceSize, targetDestSize; (void)compressor; /* Unused parameters */ (void)session; (void)split_pct; (void)final; - sourceSize = (int)offsets[slots]; /* Type conversion */ - targetDestSize = - (int)((dst_len < page_max ? dst_len : page_max) - extra); + /* + * Set the source and target sizes. The target size is complicated: we + * don't want to exceed the smaller of the maximum page size or the + * destination buffer length, and in both cases we have to take into + * account the space for our overhead and the extra bytes required by + * our caller. + */ + sourceSize = (int)offsets[slots]; + targetDestSize = (int)(page_max < dst_len ? page_max : dst_len); + targetDestSize -= (int)(sizeof(LZ4_PREFIX) + extra); /* Compress, starting after the prefix bytes. */ lz4_len = LZ4_compress_destSize((const char *)src, @@ -352,7 +373,7 @@ lz4_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) * Add a LZ4 compressor. 
*/ static int -lz_add_compressor(WT_CONNECTION *connection, int raw, const char *name) +lz_add_compressor(WT_CONNECTION *connection, bool raw, const char *name) { LZ4_COMPRESSOR *lz4_compressor; @@ -391,9 +412,9 @@ lz4_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) (void)config; /* Unused parameters */ - if ((ret = lz_add_compressor(connection, 1, "lz4")) != 0) + if ((ret = lz_add_compressor(connection, true, "lz4")) != 0) return (ret); - if ((ret = lz_add_compressor(connection, 0, "lz4-noraw")) != 0) + if ((ret = lz_add_compressor(connection, false, "lz4-noraw")) != 0) return (ret); return (0); } diff --git a/ext/compressors/snappy/snappy_compress.c b/ext/compressors/snappy/snappy_compress.c index 981e334a2de..32f1ddcb9a0 100644 --- a/ext/compressors/snappy/snappy_compress.c +++ b/ext/compressors/snappy/snappy_compress.c @@ -31,10 +31,20 @@ #include <stdlib.h> #include <string.h> +/* + * We need to include the configuration file to detect whether this extension + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. + */ #include <wiredtiger_config.h> + #include <wiredtiger.h> #include <wiredtiger_ext.h> +#ifdef _MSC_VER +#define inline __inline +#endif + /* Local compressor structure. */ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ @@ -42,6 +52,12 @@ typedef struct { WT_EXTENSION_API *wt_api; /* Extension API */ } SNAPPY_COMPRESSOR; +/* + * Snappy decompression requires an exact compressed byte count. WiredTiger + * doesn't track that value, store it in the destination buffer. + */ +#define SNAPPY_PREFIX sizeof(uint64_t) + #ifdef WORDS_BIGENDIAN /* * snappy_bswap64 -- @@ -64,11 +80,11 @@ snappy_bswap64(uint64_t v) #endif /* - * wt_snappy_error -- + * snappy_error -- * Output an error message, and return a standard error code. 
*/ static int -wt_snappy_error(WT_COMPRESSOR *compressor, +snappy_error(WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, snappy_status snret) { WT_EXTENSION_API *wt_api; @@ -94,68 +110,69 @@ wt_snappy_error(WT_COMPRESSOR *compressor, } /* - * wt_snappy_compress -- + * snappy_compression -- * WiredTiger snappy compression. */ static int -wt_snappy_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, +snappy_compression(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len, size_t *result_lenp, int *compression_failed) { snappy_status snret; size_t snaplen; + uint64_t snaplen_u64; char *snapbuf; /* - * dst_len was computed in wt_snappy_pre_size, so we know it's big - * enough. Skip past the space we'll use to store the final count - * of compressed bytes. + * dst_len was computed in snappy_pre_size, so we know it's big enough. + * Skip past the space we'll use to store the final count of compressed + * bytes. */ - snaplen = dst_len - sizeof(size_t); - snapbuf = (char *)dst + sizeof(size_t); + snaplen = dst_len - SNAPPY_PREFIX; + snapbuf = (char *)dst + SNAPPY_PREFIX; /* snaplen is an input and an output arg. */ snret = snappy_compress((char *)src, src_len, snapbuf, &snaplen); - if (snret == SNAPPY_OK) { - if (snaplen + sizeof(size_t) < src_len) { - *result_lenp = snaplen + sizeof(size_t); - *compression_failed = 0; - - /* - * On decompression, snappy requires an exact compressed - * byte count (the current value of snaplen). WiredTiger - * does not preserve that value, so save snaplen at the - * beginning of the destination buffer. - * - * Store the value in little-endian format. - */ + if (snret == SNAPPY_OK && snaplen + SNAPPY_PREFIX < src_len) { + *result_lenp = snaplen + SNAPPY_PREFIX; + *compression_failed = 0; + + /* + * On decompression, snappy requires an exact compressed byte + * count (the current value of snaplen). 
WiredTiger does not + * preserve that value, so save snaplen at the beginning of + * the destination buffer. + * + * Store the value in little-endian format. + */ + snaplen_u64 = snaplen; #ifdef WORDS_BIGENDIAN - snaplen = snappy_bswap64(snaplen); + snaplen_u64 = snappy_bswap64(snaplen_u64); #endif - *(size_t *)dst = snaplen; - } else - /* The compressor failed to produce a smaller result. */ - *compression_failed = 1; + *(uint64_t *)dst = snaplen_u64; return (0); } - return (wt_snappy_error(compressor, session, "snappy_compress", snret)); + + *compression_failed = 1; + return (snret == SNAPPY_OK ? + 0 : snappy_error(compressor, session, "snappy_compress", snret)); } /* - * wt_snappy_decompress -- + * snappy_decompression -- * WiredTiger snappy decompression. */ static int -wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, +snappy_decompression(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len, size_t *result_lenp) { WT_EXTENSION_API *wt_api; snappy_status snret; - size_t snaplen; + uint64_t snaplen; wt_api = ((SNAPPY_COMPRESSOR *)compressor)->wt_api; @@ -163,36 +180,36 @@ wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, * Retrieve the saved length, handling little- to big-endian conversion * as necessary. */ - snaplen = *(size_t *)src; + snaplen = *(uint64_t *)src; #ifdef WORDS_BIGENDIAN snaplen = snappy_bswap64(snaplen); #endif - if (snaplen + sizeof(size_t) > src_len) { + if (snaplen + SNAPPY_PREFIX > src_len) { (void)wt_api->err_printf(wt_api, session, - "wt_snappy_decompress: stored size exceeds buffer size"); + "WT_COMPRESSOR.decompress: stored size exceeds source " + "size"); return (WT_ERROR); } /* dst_len is an input and an output arg. 
*/ snret = snappy_uncompress( - (char *)src + sizeof(size_t), snaplen, (char *)dst, &dst_len); + (char *)src + SNAPPY_PREFIX, + (size_t)snaplen, (char *)dst, &dst_len); if (snret == SNAPPY_OK) { *result_lenp = dst_len; return (0); } - - return ( - wt_snappy_error(compressor, session, "snappy_decompress", snret)); + return (snappy_error(compressor, session, "snappy_decompress", snret)); } /* - * wt_snappy_pre_size -- + * snappy_pre_size -- * WiredTiger snappy destination buffer sizing. */ static int -wt_snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, +snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, size_t *result_lenp) { @@ -203,19 +220,19 @@ wt_snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, /* * Snappy requires the dest buffer be somewhat larger than the source. * Fortunately, this is fast to compute, and will give us a dest buffer - * in wt_snappy_compress that we can compress to directly. We add space + * in snappy_compress that we can compress to directly. We add space * in the dest buffer to store the accurate compressed size. */ - *result_lenp = snappy_max_compressed_length(src_len) + sizeof(size_t); + *result_lenp = snappy_max_compressed_length(src_len) + SNAPPY_PREFIX; return (0); } /* - * wt_snappy_terminate -- + * snappy_terminate -- * WiredTiger snappy compression termination. */ static int -wt_snappy_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) +snappy_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) { (void)session; /* Unused parameters */ @@ -227,9 +244,9 @@ int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); /* * snappy_extension_init -- - * WiredTiger snappy compression extension - called directly when - * Snappy support is built in, or via wiredtiger_extension_init when - * snappy support is included via extension loading. 
+ * WiredTiger snappy compression extension - called directly when snappy + * support is built in, or via wiredtiger_extension_init when snappy support + * is included via extension loading. */ int snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) @@ -241,11 +258,11 @@ snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) if ((snappy_compressor = calloc(1, sizeof(SNAPPY_COMPRESSOR))) == NULL) return (errno); - snappy_compressor->compressor.compress = wt_snappy_compress; + snappy_compressor->compressor.compress = snappy_compression; snappy_compressor->compressor.compress_raw = NULL; - snappy_compressor->compressor.decompress = wt_snappy_decompress; - snappy_compressor->compressor.pre_size = wt_snappy_pre_size; - snappy_compressor->compressor.terminate = wt_snappy_terminate; + snappy_compressor->compressor.decompress = snappy_decompression; + snappy_compressor->compressor.pre_size = snappy_pre_size; + snappy_compressor->compressor.terminate = snappy_terminate; snappy_compressor->wt_api = connection->get_extension_api(connection); diff --git a/ext/compressors/zlib/zlib_compress.c b/ext/compressors/zlib/zlib_compress.c index 484df0a6785..ef20503df0a 100644 --- a/ext/compressors/zlib/zlib_compress.c +++ b/ext/compressors/zlib/zlib_compress.c @@ -32,16 +32,18 @@ #include <stdlib.h> #include <string.h> -#include <wiredtiger.h> -#include <wiredtiger_ext.h> - /* * We need to include the configuration file to detect whether this extension - * is being built into the WiredTiger library. + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. */ -#include "wiredtiger_config.h" +#include <wiredtiger_config.h> + +#include <wiredtiger.h> +#include <wiredtiger_ext.h> + #ifdef _MSC_VER -#define inline __inline +#define inline __inline #endif /* Local compressor structure. 
*/ @@ -234,121 +236,163 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, { ZLIB_COMPRESSOR *zlib_compressor; ZLIB_OPAQUE opaque; - z_stream *best_zs, last_zs, zs; - uint32_t curr_slot, last_slot; - int ret; + z_stream *best_zs, *last_zs, _last_zs, *zs, _zs; + uint32_t curr_slot, last_slot, zlib_reserved; + bool increase_reserve; + int ret, tret; - curr_slot = last_slot = 0; - (void)split_pct; - (void)dst_len; + (void)split_pct; /* Unused parameters */ (void)final; zlib_compressor = (ZLIB_COMPRESSOR *)compressor; - memset(&zs, 0, sizeof(zs)); - zs.zalloc = zalloc; - zs.zfree = zfree; - opaque.compressor = compressor; - opaque.session = session; - zs.opaque = &opaque; - - if ((ret = deflateInit(&zs, zlib_compressor->zlib_level)) != Z_OK) - return (zlib_error(compressor, session, "deflateInit", ret)); - - zs.next_in = src; - zs.next_out = dst; /* * Experimentally derived, reserve this many bytes for zlib to finish * up a buffer. If this isn't sufficient, we don't fail but we will be * inefficient. */ #define WT_ZLIB_RESERVED 24 - zs.avail_out = (uint32_t)(page_max - (extra + WT_ZLIB_RESERVED)); +#define WT_ZLIB_RESERVED_MAX 48 + zlib_reserved = WT_ZLIB_RESERVED; + + if (0) { +retry: /* If we reached our maximum reserve, quit. */ + if (zlib_reserved == WT_ZLIB_RESERVED_MAX) + return (0); + zlib_reserved = WT_ZLIB_RESERVED_MAX; + } + + best_zs = last_zs = NULL; + last_slot = 0; + increase_reserve = false; + ret = 0; - /* Save the stream state in case the chosen data doesn't fit. 
*/ - if ((ret = deflateCopy(&last_zs, &zs)) != Z_OK) - return (zlib_error(compressor, session, "deflateCopy", ret)); + zs = &_zs; + memset(zs, 0, sizeof(*zs)); + zs->zalloc = zalloc; + zs->zfree = zfree; + opaque.compressor = compressor; + opaque.session = session; + zs->opaque = &opaque; + + if ((ret = deflateInit(zs, zlib_compressor->zlib_level)) != Z_OK) + return (zlib_error(compressor, session, "deflateInit", ret)); + + zs->next_in = src; + zs->next_out = dst; + + /* + * Set the target size. The target size is complicated: we don't want + * to exceed the smaller of the maximum page size or the destination + * buffer length, and in both cases we have to take into account the + * space required by zlib to finish up the buffer and the extra bytes + * required by our caller. + */ + zs->avail_out = (uint32_t)(page_max < dst_len ? page_max : dst_len); + zs->avail_out -= (uint32_t)(zlib_reserved + extra); /* * Strategy: take the available output size and compress that much * input. Continue until there is no input small enough or the * compression fails to fit. */ - for (best_zs = NULL;;) { + for (;;) { /* Find the next slot we will try to compress up to. */ - if ((curr_slot = zlib_find_slot( - zs.total_in + zs.avail_out, offsets, slots)) > last_slot) { - zs.avail_in = offsets[curr_slot] - offsets[last_slot]; - while (zs.avail_in > 0 && zs.avail_out > 0) - if ((ret = deflate(&zs, Z_SYNC_FLUSH)) != Z_OK) - return (zlib_error(compressor, - session, "deflate", ret)); + curr_slot = zlib_find_slot( + zs->total_in + zs->avail_out, offsets, slots); + if (curr_slot > last_slot) { + zs->avail_in = offsets[curr_slot] - offsets[last_slot]; + while (zs->avail_in > 0 && zs->avail_out > 0) + if ((ret = deflate(zs, Z_SYNC_FLUSH)) != Z_OK) { + ret = zlib_error(compressor, + session, "deflate", ret); + goto err; + } } /* * We didn't do a deflate, or it didn't work: use the last saved - * position. + * position (if any). 
*/ - if (curr_slot <= last_slot || zs.avail_in > 0) { - if ((ret = deflateEnd(&zs)) != Z_OK && - ret != Z_DATA_ERROR) - return (zlib_error( - compressor, session, "deflateEnd", ret)); - - best_zs = &last_zs; + if (curr_slot <= last_slot || zs->avail_in > 0) { + best_zs = last_zs; break; } - /* The last deflation succeeded, discard the saved one. */ - if ((ret = deflateEnd(&last_zs)) != Z_OK && ret != Z_DATA_ERROR) - return (zlib_error( - compressor, session, "deflateEnd", ret)); - /* * If there's more compression to do, save a snapshot and keep * going, otherwise, use the current compression. */ last_slot = curr_slot; - if (zs.avail_out > 0) { - if ((ret = deflateCopy(&last_zs, &zs)) != Z_OK) - return (zlib_error( - compressor, session, "deflateCopy", ret)); + if (zs->avail_out > 0) { + /* Discard any previously saved snapshot. */ + if (last_zs != NULL) { + ret = deflateEnd(last_zs); + last_zs = NULL; + if (ret != Z_OK && ret != Z_DATA_ERROR) { + ret = zlib_error(compressor, + session, "deflateEnd", ret); + goto err; + } + } + last_zs = &_last_zs; + if ((ret = deflateCopy(last_zs, zs)) != Z_OK) { + last_zs = NULL; + ret = zlib_error( + compressor, session, "deflateCopy", ret); + goto err; + } continue; } - best_zs = &zs; + best_zs = zs; break; } - best_zs->avail_out += WT_ZLIB_RESERVED; - ret = deflate(best_zs, Z_FINISH); + if (last_slot > 0 && best_zs != NULL) { + /* Add the reserved bytes and try to finish the compression. */ + best_zs->avail_out += zlib_reserved; + ret = deflate(best_zs, Z_FINISH); - /* - * If the end marker didn't fit, report that we got no work done, - * WiredTiger will compress the (possibly large) page image using - * ordinary compression instead. 
- */ - if (ret == Z_OK || ret == Z_BUF_ERROR) - last_slot = 0; - else if (ret != Z_STREAM_END) - return ( - zlib_error(compressor, session, "deflate end block", ret)); + /* + * If the end marker didn't fit with the default value, try + * again with a maximum value; if that doesn't work, report we + * got no work done, WiredTiger will compress the (possibly + * large) page image using ordinary compression instead. + */ + if (ret == Z_OK || ret == Z_BUF_ERROR) { + last_slot = 0; + increase_reserve = true; + } else if (ret != Z_STREAM_END) { + ret = zlib_error( + compressor, session, "deflate end block", ret); + goto err; + } + ret = 0; + } - if ((ret = deflateEnd(best_zs)) != Z_OK && ret != Z_DATA_ERROR) - return (zlib_error(compressor, session, "deflateEnd", ret)); +err: if (zs != NULL && + (tret = deflateEnd(zs)) != Z_OK && tret != Z_DATA_ERROR) + ret = zlib_error(compressor, session, "deflateEnd", tret); + if (last_zs != NULL && + (tret = deflateEnd(last_zs)) != Z_OK && tret != Z_DATA_ERROR) + ret = zlib_error(compressor, session, "deflateEnd", tret); - if (last_slot > 0) { + if (ret == 0 && last_slot > 0) { *result_slotsp = last_slot; *result_lenp = (size_t)best_zs->total_out; } else { - /* We didn't manage to compress anything: don't retry. */ + /* We didn't manage to compress anything. */ *result_slotsp = 0; *result_lenp = 1; + + if (increase_reserve) + goto retry; } #if 0 /* Decompress the result and confirm it matches the original source. */ - if (last_slot > 0) { + if (ret == 0 && last_slot > 0) { void *decomp; size_t result_len; @@ -363,19 +407,20 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, "deflate compare with original source", Z_DATA_ERROR); zfree(&opaque, decomp); - if (ret != 0) - return (ret); } #endif #if 0 - fprintf(stderr, - "zlib_compress_raw (%s): page_max %" PRIuMAX ", slots %" PRIu32 - ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n", - final ? 
"final" : "not final", (uintmax_t)page_max, - slots, last_slot, offsets[last_slot], (uintmax_t)*result_lenp); + if (ret == 0 && last_slot > 0) + fprintf(stderr, + "zlib_compress_raw (%s): page_max %" PRIuMAX ", slots %" + PRIu32 ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n", + final ? "final" : "not final", (uintmax_t)page_max, + slots, last_slot, offsets[last_slot], + (uintmax_t)*result_lenp); #endif - return (0); + + return (ret); } /* @@ -396,7 +441,8 @@ zlib_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) * Add a zlib compressor. */ static int -zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name) +zlib_add_compressor( + WT_CONNECTION *connection, bool raw, const char *name, int zlib_level) { ZLIB_COMPRESSOR *zlib_compressor; @@ -415,17 +461,80 @@ zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name) zlib_compressor->compressor.terminate = zlib_terminate; zlib_compressor->wt_api = connection->get_extension_api(connection); - - /* - * Between 0-10: level: see zlib manual. - */ - zlib_compressor->zlib_level = Z_DEFAULT_COMPRESSION; + zlib_compressor->zlib_level = zlib_level; /* Load the compressor. */ return (connection->add_compressor( connection, name, (WT_COMPRESSOR *)zlib_compressor, NULL)); } +/* + * zlib_init_config -- + * Handle zlib configuration. + */ +static int +zlib_init_config( + WT_CONNECTION *connection, WT_CONFIG_ARG *config, int *zlib_levelp) +{ + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + int ret, zlib_level; + + /* If configured as a built-in, there's no configuration argument. */ + if (config == NULL) + return (0); + + /* + * Zlib compression engine allows applications to specify a compression + * level; review the configuration. 
+ */ + wtext = connection->get_extension_api(connection); + if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_get: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = wtext->config_parser_open( + wtext, NULL, v.str, v.len, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) + if (strlen("compression_level") == k.len && + strncmp("compression_level", k.str, k.len) == 0) { + /* + * Between 0-9: level: see zlib manual. + */ + zlib_level = (int)v.val; + if (zlib_level < 0 || zlib_level > 9) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: zlib configure: " + "unsupported compression level %d", + zlib_level); + return (EINVAL); + } + *zlib_levelp = zlib_level; + continue; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + return (0); +} + int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); /* @@ -437,13 +546,17 @@ int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); int zlib_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) { - int ret; + int ret, zlib_level; - (void)config; /* Unused parameters */ + zlib_level = Z_DEFAULT_COMPRESSION; /* Default */ + if ((ret = zlib_init_config(connection, config, &zlib_level)) != 0) + return (ret); - if ((ret = zlib_add_compressor(connection, 1, "zlib")) != 0) + if ((ret = zlib_add_compressor( + connection, true, "zlib", zlib_level)) != 0) return 
(ret); - if ((ret = zlib_add_compressor(connection, 0, "zlib-noraw")) != 0) + if ((ret = zlib_add_compressor( + connection, false, "zlib-noraw", zlib_level)) != 0) return (ret); return (0); } diff --git a/ext/compressors/zstd/Makefile.am b/ext/compressors/zstd/Makefile.am new file mode 100644 index 00000000000..9f0997011e9 --- /dev/null +++ b/ext/compressors/zstd/Makefile.am @@ -0,0 +1,11 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +if HAVE_BUILTIN_EXTENSION_ZSTD +noinst_LTLIBRARIES = libwiredtiger_zstd.la +else +lib_LTLIBRARIES = libwiredtiger_zstd.la +libwiredtiger_zstd_la_LDFLAGS = -avoid-version -module +endif + +libwiredtiger_zstd_la_SOURCES = zstd_compress.c +libwiredtiger_zstd_la_LIBADD = -lzstd diff --git a/ext/compressors/zstd/zstd_compress.c b/ext/compressors/zstd/zstd_compress.c new file mode 100644 index 00000000000..3d0447248b6 --- /dev/null +++ b/ext/compressors/zstd/zstd_compress.c @@ -0,0 +1,358 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <zstd.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +/* + * We need to include the configuration file to detect whether this extension + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. + */ +#include <wiredtiger_config.h> + +#include <wiredtiger.h> +#include <wiredtiger_ext.h> + +#ifdef _MSC_VER +#define inline __inline +#endif + +/* Local compressor structure. */ +typedef struct { + WT_COMPRESSOR compressor; /* Must come first */ + + WT_EXTENSION_API *wt_api; /* Extension API */ + + int compression_level; /* compression level */ +} ZSTD_COMPRESSOR; + +/* + * Zstd decompression requires an exact compressed byte count. WiredTiger + * doesn't track that value, store it in the destination buffer. + */ +#define ZSTD_PREFIX sizeof(uint64_t) + +#ifdef WORDS_BIGENDIAN +/* + * zstd_bswap64 -- + * 64-bit unsigned little-endian to/from big-endian value. + */ +static inline uint64_t +zstd_bswap64(uint64_t v) +{ + return ( + ((v << 56) & 0xff00000000000000UL) | + ((v << 40) & 0x00ff000000000000UL) | + ((v << 24) & 0x0000ff0000000000UL) | + ((v << 8) & 0x000000ff00000000UL) | + ((v >> 8) & 0x00000000ff000000UL) | + ((v >> 24) & 0x0000000000ff0000UL) | + ((v >> 40) & 0x000000000000ff00UL) | + ((v >> 56) & 0x00000000000000ffUL) + ); +} +#endif + +/* + * zstd_error -- + * Output an error message, and return a standard error code. 
+ */ +static int +zstd_error(WT_COMPRESSOR *compressor, + WT_SESSION *session, const char *call, size_t error) +{ + WT_EXTENSION_API *wt_api; + + wt_api = ((ZSTD_COMPRESSOR *)compressor)->wt_api; + + (void)wt_api->err_printf(wt_api, session, + "zstd error: %s: %s", call, ZSTD_getErrorName(error)); + return (WT_ERROR); +} + +/* + * zstd_compress -- + * WiredTiger Zstd compression. + */ +static int +zstd_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, + uint8_t *dst, size_t dst_len, + size_t *result_lenp, int *compression_failed) +{ + ZSTD_COMPRESSOR *zcompressor; + size_t zstd_ret; + uint64_t zstd_len; + + zcompressor = (ZSTD_COMPRESSOR *)compressor; + + /* Compress, starting past the prefix bytes. */ + zstd_ret = ZSTD_compress( + dst + ZSTD_PREFIX, dst_len - ZSTD_PREFIX, + src, src_len, zcompressor->compression_level); + + /* + * If compression succeeded and the compressed length is smaller than + * the original size, return success. + */ + if (!ZSTD_isError(zstd_ret) && zstd_ret + ZSTD_PREFIX < src_len) { + *result_lenp = zstd_ret + ZSTD_PREFIX; + *compression_failed = 0; + + /* + * On decompression, Zstd requires an exact compressed byte + * count (the current value of zstd_ret). WiredTiger does not + * preserve that value, so save zstd_ret at the beginning of + * the destination buffer. + * + * Store the value in little-endian format. + */ + zstd_len = zstd_ret; +#ifdef WORDS_BIGENDIAN + zstd_len = zstd_bswap64(zstd_len); +#endif + *(uint64_t *)dst = zstd_len; + return (0); + } + + *compression_failed = 1; + return (ZSTD_isError(zstd_ret) ? + zstd_error(compressor, session, "ZSTD_compress", zstd_ret) : 0); +} + +/* + * zstd_decompress -- + * WiredTiger Zstd decompression. 
+ */ +static int +zstd_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, + uint8_t *dst, size_t dst_len, + size_t *result_lenp) +{ + WT_EXTENSION_API *wt_api; + size_t zstd_ret; + uint64_t zstd_len; + + wt_api = ((ZSTD_COMPRESSOR *)compressor)->wt_api; + + /* + * Retrieve the saved length, handling little- to big-endian conversion + * as necessary. + */ + zstd_len = *(uint64_t *)src; +#ifdef WORDS_BIGENDIAN + zstd_len = zstd_bswap64(zstd_len); +#endif + if (zstd_len + ZSTD_PREFIX > src_len) { + (void)wt_api->err_printf(wt_api, + session, + "WT_COMPRESSOR.decompress: stored size exceeds source " + "size"); + return (WT_ERROR); + } + + zstd_ret = + ZSTD_decompress(dst, dst_len, src + ZSTD_PREFIX, (size_t)zstd_len); + + if (!ZSTD_isError(zstd_ret)) { + *result_lenp = zstd_ret; + return (0); + } + return (zstd_error(compressor, session, "ZSTD_decompress", zstd_ret)); +} + +/* + * zstd_pre_size -- + * WiredTiger Zstd destination buffer sizing for compression. + */ +static int +zstd_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, size_t *result_lenp) +{ + (void)compressor; /* Unused parameters */ + (void)session; + (void)src; + + /* + * Zstd compression runs faster if the destination buffer is sized at + * the upper-bound of the buffer size needed by the compression. Use + * the library calculation of that overhead (plus our overhead). + */ + *result_lenp = ZSTD_compressBound(src_len) + ZSTD_PREFIX; + return (0); +} + +/* + * zstd_terminate -- + * WiredTiger Zstd compression termination. + */ +static int +zstd_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) +{ + (void)session; /* Unused parameters */ + + free(compressor); + return (0); +} + +/* + * zstd_init_config -- + * Handle zstd configuration. 
+ */ +static int +zstd_init_config( + WT_CONNECTION *connection, WT_CONFIG_ARG *config, int *compression_levelp) +{ + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + int ret; + + /* If configured as a built-in, there's no configuration argument. */ + if (config == NULL) + return (0); + + /* + * Zstd compression engine allows applications to specify a compression + * level; review the configuration. + */ + wtext = connection->get_extension_api(connection); + if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_get: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = wtext->config_parser_open( + wtext, NULL, v.str, v.len, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) + if (strlen("compression_level") == k.len && + strncmp("compression_level", k.str, k.len) == 0) { + *compression_levelp = (int)v.val; + continue; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + return (0); +} + +int zstd_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); + +/* + * zstd_extension_init -- + * WiredTiger Zstd compression extension - called directly when Zstd + * support is built in, or via wiredtiger_extension_init when Zstd support + * is included via extension loading. 
+ */ +int +zstd_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) +{ + ZSTD_COMPRESSOR *zstd_compressor; + int compression_level, ret; + + /* + * Zstd's sweet-spot is better compression than zlib at significantly + * faster compression/decompression speeds. LZ4 and snappy are faster + * than zstd, but have worse compression ratios. Applications wanting + * faster compression/decompression with worse compression will select + * LZ4 or snappy, so we configure zstd for better compression. + * + * From the zstd github site, default measurements of the compression + * engines we support, listing compression ratios with compression and + * decompression speeds: + * + * Name Ratio C.speed D.speed + * MB/s MB/s + * zstd 2.877 330 940 + * zlib 2.730 95 360 + * LZ4 2.101 620 3100 + * snappy 2.091 480 1600 + * + * Set the zstd compression level to 3: according to the zstd web site, + * that reduces zstd's compression speed to around 200 MB/s, increasing + * the compression ratio to 3.100 (close to zlib's best compression + * ratio). In other words, position zstd as a zlib replacement, having + * similar compression at much higher compression/decompression speeds. 
+ */ + compression_level = 3; + if ((ret = + zstd_init_config(connection, config, &compression_level)) != 0) + return (ret); + + if ((zstd_compressor = calloc(1, sizeof(ZSTD_COMPRESSOR))) == NULL) + return (errno); + + zstd_compressor->compressor.compress = zstd_compress; + zstd_compressor->compressor.compress_raw = NULL; + zstd_compressor->compressor.decompress = zstd_decompress; + zstd_compressor->compressor.pre_size = zstd_pre_size; + zstd_compressor->compressor.terminate = zstd_terminate; + + zstd_compressor->wt_api = connection->get_extension_api(connection); + + zstd_compressor->compression_level = compression_level; + + /* Load the compressor */ + return (connection->add_compressor( + connection, "zstd", (WT_COMPRESSOR *)zstd_compressor, NULL)); +} + +/* + * We have to remove this symbol when building as a builtin extension otherwise + * it will conflict with other builtin libraries. + */ +#ifndef HAVE_BUILTIN_EXTENSION_ZSTD +/* + * wiredtiger_extension_init -- + * WiredTiger Zstd compression extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) +{ + return (zstd_extension_init(connection, config)); +} +#endif diff --git a/src/async/async_worker.c b/src/async/async_worker.c index 401d0616eab..b1bc3902f7c 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -216,7 +216,7 @@ __async_worker_execop(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, break; case WT_AOP_NONE: WT_RET_MSG(session, EINVAL, - "Unknown async optype %d\n", op->optype); + "Unknown async optype %d", op->optype); } return (0); } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index b7ac953cdb1..48522768dc9 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -615,8 +615,6 @@ live_update: WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* - * Set the checkpoint size for the live system. - * * !!! * Our caller wants the final checkpoint size. 
Setting * the size here violates layering, but the alternative @@ -624,7 +622,31 @@ live_update: * cookie into its components, and that's a fair amount * of work. */ - ckpt->ckpt_size = ci->ckpt_size = ckpt_size; + ckpt->ckpt_size = ckpt_size; + + /* + * Set the rolling checkpoint size for the live system. + * The current size includes the current checkpoint's + * root page size (root pages are on the checkpoint's + * block allocation list as root pages are allocated + * with the usual block allocation functions). That's + * correct, but we don't want to include it in the size + * for the next checkpoint. + */ + ckpt_size -= ci->root_size; + + /* + * Additionally, we had a bug for awhile where the live + * checkpoint size grew without bound. We can't sanity + * check the value, that would require walking the tree + * as part of the checkpoint. Bound any bug at the size + * of the file. + * It isn't practical to assert that the value is within + * bounds since databases created with older versions + * of WiredTiger (2.8.0) would likely see an error. + */ + ci->ckpt_size = + WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 3690b41ead4..41ae457b0fe 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1217,7 +1217,7 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) /* * __wt_btcur_init -- - * Initialize an cursor used for internal purposes. + * Initialize a cursor used for internal purposes. 
*/ void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 9591023e163..337a3ea036f 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -271,6 +271,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) else F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + WT_RET(__wt_config_gets(session, + cfg, "ignore_in_memory_cache_size", &cval)); + if (cval.val) { + if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) + WT_RET_MSG(session, EINVAL, + "ignore_in_memory_cache_size setting is only valid " + "with databases configured to run in-memory"); + F_SET(btree, WT_BTREE_IGNORE_CACHE); + } else + F_CLR(btree, WT_BTREE_IGNORE_CACHE); + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); @@ -353,7 +364,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ - btree->modified = 0; /* Clean */ + btree->modified = false; /* Clean */ btree->write_gen = ckpt->write_gen; /* Write generation */ return (0); diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index 42c3a849a88..a8645f79dbe 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -171,6 +171,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool checkpoint_io, bool compressed) { + struct timespec start, stop; WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(ctmp); @@ -356,6 +357,8 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, data_checksum = !compressed; break; } + if (!F_ISSET(session, WT_SESSION_INTERNAL)) + __wt_epoch(session, &start); /* Call the block manager to write the block. */ WT_ERR(checkpoint ? 
@@ -363,6 +366,14 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, bm->write( bm, session, ip, addr, addr_sizep, data_checksum, checkpoint_io)); + /* Update some statistics now that the write is done */ + if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + __wt_epoch(session, &stop); + WT_STAT_CONN_INCR(session, cache_write_app_count); + WT_STAT_CONN_INCRV(session, cache_write_app_time, + WT_TIMEDIFF_US(stop, start)); + } + WT_STAT_CONN_INCR(session, cache_write); WT_STAT_DATA_INCR(session, cache_write); S2C(session)->cache->bytes_written += dsk->mem_size; diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index c54eaa69c43..90188498535 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -327,22 +327,28 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) if (__wt_hazard_count(session, page) > 1) return (false); + /* If we can do an in-memory split, do it. */ + if (__wt_leaf_page_can_split(session, page)) + return (true); + if (page->memory_footprint < btree->maxmempage) + return (false); + + /* Bump the oldest ID, we're about to do some visibility checks. */ + WT_IGNORE_RET(__wt_txn_update_oldest(session, 0)); + /* - * If we have already tried and the transaction state has not moved on, - * eviction is highly likely to fail. + * Allow some leeway if the transaction ID isn't moving forward since + * it is unlikely eviction will be able to evict the page. Don't keep + * skipping the page indefinitely or large records can lead to + * extremely large memory footprints. */ - if (page->modify->last_eviction_id == __wt_txn_oldest_id(session)) + if (page->modify->update_restored && + page->modify->last_eviction_id == __wt_txn_oldest_id(session)) return (false); - if (page->memory_footprint < btree->maxmempage) - return (__wt_leaf_page_can_split(session, page)); - /* Trigger eviction on the next page release. */ __wt_page_evict_soon(session, ref); - /* Bump the oldest ID, we're about to do some visibility checks. 
*/ - WT_IGNORE_RET(__wt_txn_update_oldest(session, 0)); - /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, ref, NULL)); } @@ -354,6 +360,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) static int __page_read(WT_SESSION_IMPL *session, WT_REF *ref) { + struct timespec start, stop; const WT_PAGE_HEADER *dsk; WT_BTREE *btree; WT_DECL_RET; @@ -401,7 +408,15 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) * There's an address, read or map the backing disk page and build an * in-memory version of the page. */ + if (!F_ISSET(session, WT_SESSION_INTERNAL)) + __wt_epoch(session, &start); WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); + if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + __wt_epoch(session, &stop); + WT_STAT_CONN_INCR(session, cache_read_app_count); + WT_STAT_CONN_INCRV(session, cache_read_app_time, + WT_TIMEDIFF_US(stop, start)); + } WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index ea667460966..017c820ea29 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1582,6 +1582,13 @@ __split_multi_inmem( */ page->modify->first_dirty_txn = WT_TXN_FIRST; + /* + * If the new page is modified, save the oldest ID from reconciliation + * to avoid repeatedly attempting eviction on the same page. + */ + page->modify->last_eviction_id = orig->modify->last_eviction_id; + page->modify->update_restored = 1; + err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt, true)); @@ -2245,14 +2252,6 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) WT_ERR(__split_multi_inmem(session, page, multi, new)); /* - * If the new page is modified, save the oldest ID from reconciliation - * to avoid repeatedly attempting eviction on the same page. 
- */ - if (new->page->modify != NULL) - new->page->modify->last_eviction_id = - page->modify->last_eviction_id; - - /* * The rewrite succeeded, we can no longer fail. * * Finalize the move, discarding moved update lists from the original diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index d3ddf33446e..06428b87f6e 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -8,6 +8,7 @@ #include "wt_internal.h" +static int __stat_tree_walk(WT_SESSION_IMPL *); static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); static void __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); @@ -23,9 +24,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) { WT_BM *bm; WT_BTREE *btree; - WT_DECL_RET; WT_DSRC_STATS **stats; - WT_REF *next_walk; btree = S2BT(session); bm = btree->bm; @@ -44,9 +43,29 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, cache_bytes_inuse, __wt_btree_bytes_inuse(session)); - /* Everything else is really, really expensive. */ - if (!F_ISSET(cst, WT_CONN_STAT_ALL)) - return (0); + if (F_ISSET(cst, WT_STAT_TYPE_CACHE_WALK)) + __wt_curstat_cache_walk(session); + + if (F_ISSET(cst, WT_STAT_TYPE_TREE_WALK)) + WT_RET(__stat_tree_walk(session)); + + return (0); +} + +/* + * __stat_tree_walk -- + * Gather btree statistics that require traversing the tree. + */ +static int +__stat_tree_walk(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_DSRC_STATS **stats; + WT_REF *next_walk; + + btree = S2BT(session); + stats = btree->dhandle->stats; /* * Clear the statistics we're about to count. 
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index b41179a565d..6d4ad9d0d0f 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -9,6 +9,59 @@ #include "wt_internal.h" /* + * __sync_checkpoint_can_skip -- + * There are limited conditions under which we can skip writing a dirty + * page during checkpoint. + */ +static inline bool +__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_PAGE_MODIFY *mod; + WT_MULTI *multi; + WT_TXN *txn; + u_int i; + + mod = page->modify; + txn = &session->txn; + + /* + * We can skip some dirty pages during a checkpoint. The requirements: + * + * 1. they must be leaf pages, + * 2. there is a snapshot transaction active (which is the case in + * ordinary application checkpoints but not all internal cases), + * 3. the first dirty update on the page is sufficiently recent the + * checkpoint transaction would skip them, + * 4. there's already an address for every disk block involved. + */ + if (WT_PAGE_IS_INTERNAL(page)) + return (false); + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (false); + if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) + return (false); + + /* + * The problematic case is when a page was evicted but when there were + * unresolved updates and not every block associated with the page has + * a disk address. We can't skip such pages because we need a checkpoint + * write with valid addresses. + * + * The page's modification information can change underfoot if the page + * is being reconciled, so we'd normally serialize with reconciliation + * before reviewing page-modification information. However, checkpoint + * is the only valid writer of dirty leaf pages at this point, we skip + * the lock. + */ + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) + if (multi->addr.addr == NULL) + return (false); + return (true); +} + +/* * __sync_file -- * Flush pages for a specific file. 
*/ @@ -20,24 +73,23 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; - uint64_t oldest_id, saved_snap_min; + uint64_t oldest_id, saved_pinned_id; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; - saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; + saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) - WT_RET(__wt_epoch(session, &start)); + __wt_epoch(session, &start); switch (syncop) { case WT_SYNC_WRITE_LEAVES: @@ -161,29 +213,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * reference and checking modified. */ page = walk->page; - mod = page->modify; /* - * Write dirty pages, unless we can be sure they only - * became dirty after the checkpoint started. - * - * We can skip dirty pages if: - * (1) they are leaf pages; - * (2) there is a snapshot transaction active (which - * is the case in ordinary application checkpoints - * but not all internal cases); and - * (3) the first dirty update on the page is - * sufficiently recent that the checkpoint - * transaction would skip them. - * - * Mark the tree dirty: the checkpoint marked it clean - * and we can't skip future checkpoints until this page - * is written. + * Write dirty pages, if we can't skip them. If we skip + * a page, mark the tree dirty. The checkpoint marked it + * clean and we can't skip future checkpoints until this + * page is written. 
*/ - if (!WT_PAGE_IS_INTERNAL(page) && - F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && - WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { - __wt_page_modify_set(session, page); + if (__sync_checkpoint_can_skip(session, page)) { + __wt_tree_modify_set(session); continue; } @@ -205,15 +243,14 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { - WT_ERR(__wt_epoch(session, &end)); + __wt_epoch(session, &end); __wt_verbose(session, WT_VERB_CHECKPOINT, - "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 - " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 - " bytes, %" PRIu64 " pages of internal\n\t" - "Took: %" PRIu64 "ms", + "__sync_file WT_SYNC_%s wrote: %" PRIu64 + " leaf pages (%" PRIu64 "B), %" PRIu64 + " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", - leaf_bytes, leaf_pages, internal_bytes, internal_pages, + leaf_pages, leaf_bytes, internal_pages, internal_bytes, WT_TIMEDIFF_MS(end, start)); } @@ -226,7 +263,7 @@ err: /* On error, clear any left-over tree walk. */ * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && - saved_snap_min == WT_TXN_NONE) + saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag and push the change. */ diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 4c338bc6ad9..41f50957809 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -49,7 +49,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) * don't have to worry about users seeing inconsistent data source * information. 
*/ - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) { + if (FLD_ISSET(conn->stat_flags, WT_STAT_CLEAR)) { WT_STAT_SET(session, dstats, cursor_insert, 0); WT_STAT_SET(session, dstats, cursor_remove, 0); } diff --git a/src/checksum/power8/README.md b/src/checksum/power8/README.md index 3e2976650cd..579d841a02c 100644 --- a/src/checksum/power8/README.md +++ b/src/checksum/power8/README.md @@ -39,7 +39,7 @@ Quick start - Type make to create the constants (crc32_constants.h) -- Import the code into your application (crc32.S crc32_wrapper.c +- Import the code into your application (crc32.sx crc32_wrapper.c crc32_constants.h ppc-opcode.h) and call the CRC: ``` diff --git a/src/checksum/power8/crc32.S b/src/checksum/power8/crc32.sx index 0b7870668b5..0b7870668b5 100644 --- a/src/checksum/power8/crc32.S +++ b/src/checksum/power8/crc32.sx diff --git a/src/checksum/zseries/crc32le-vx.S b/src/checksum/zseries/crc32le-vx.sx index 0f1392b0952..0f1392b0952 100644 --- a/src/checksum/zseries/crc32le-vx.S +++ b/src/checksum/zseries/crc32le-vx.sx diff --git a/src/config/config_collapse.c b/src/config/config_collapse.c index ea956ebfff9..7fe78d06ba7 100644 --- a/src/config/config_collapse.c +++ b/src/config/config_collapse.c @@ -47,7 +47,7 @@ __wt_config_collapse( if (k.type != WT_CONFIG_ITEM_STRING && k.type != WT_CONFIG_ITEM_ID) WT_ERR_MSG(session, EINVAL, - "Invalid configuration key found: '%s'\n", k.str); + "Invalid configuration key found: '%s'", k.str); WT_ERR(__wt_config_get(session, cfg, &k, &v)); /* Include the quotes around string keys/values. */ if (k.type == WT_CONFIG_ITEM_STRING) { @@ -132,7 +132,7 @@ __config_merge_scan(WT_SESSION_IMPL *session, if (k.type != WT_CONFIG_ITEM_STRING && k.type != WT_CONFIG_ITEM_ID) WT_ERR_MSG(session, EINVAL, - "Invalid configuration key found: '%s'\n", k.str); + "Invalid configuration key found: '%s'", k.str); /* Include the quotes around string keys/values. 
*/ if (k.type == WT_CONFIG_ITEM_STRING) { diff --git a/src/config/config_def.c b/src/config/config_def.c index 7bad5f12a9f..018cc7a8ac4 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -138,7 +138,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -246,6 +247,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "immutable", "boolean", NULL, NULL, NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, @@ -331,7 +335,8 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = { { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "skip_sort_check", "boolean", NULL, NULL, NULL, 0 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"clear\",\"size\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"clear\"," + "\"size\",\"tree_walk\"]", NULL, 0 }, { "target", "list", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } @@ -413,6 +418,9 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, 
@@ -471,6 +479,9 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, { "id", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, @@ -544,6 +555,9 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, @@ -697,7 +711,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -781,7 +796,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -862,7 +878,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + 
"\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -941,7 +958,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -1053,18 +1071,18 @@ static const WT_CONFIG_ENTRY config_entries[] = { "block_compressor=,cache_resident=false,checksum=uncompressed," "colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=," "name=),exclusive=false,extractor=,format=btree,huffman_key=," - "huffman_value=,immutable=false,internal_item_max=0," - "internal_key_max=0,internal_key_truncate=true," - "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," - "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," - "log=(enabled=true),lsm=(auto_throttle=true,bloom=true," - "bloom_bit_count=16,bloom_config=,bloom_hash_count=8," + "huffman_value=,ignore_in_memory_cache_size=false,immutable=false" + ",internal_item_max=0,internal_key_max=0," + "internal_key_truncate=true,internal_page_max=4KB,key_format=u," + "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," + "leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true," + "bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8," "bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB," "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", - confchk_WT_SESSION_create, 40 + confchk_WT_SESSION_create, 41 }, { "WT_SESSION.drop", "checkpoint_wait=true,force=false,lock_wait=true," @@ -1148,7 +1166,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { 
"allocation_size=4KB,app_metadata=,block_allocation=best," "block_compressor=,cache_resident=false,checksum=uncompressed," "collator=,columns=,dictionary=0,encryption=(keyid=,name=)," - "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + "format=btree,huffman_key=,huffman_value=," + "ignore_in_memory_cache_size=false,internal_item_max=0," "internal_key_max=0,internal_key_truncate=true," "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," @@ -1156,14 +1175,15 @@ static const WT_CONFIG_ENTRY config_entries[] = { "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," "value_format=u", - confchk_file_config, 33 + confchk_file_config, 34 }, { "file.meta", "allocation_size=4KB,app_metadata=,block_allocation=best," "block_compressor=,cache_resident=false,checkpoint=," "checkpoint_lsn=,checksum=uncompressed,collator=,columns=," "dictionary=0,encryption=(keyid=,name=),format=btree,huffman_key=" - ",huffman_value=,id=,internal_item_max=0,internal_key_max=0," + ",huffman_value=,id=,ignore_in_memory_cache_size=false," + "internal_item_max=0,internal_key_max=0," "internal_key_truncate=true,internal_page_max=4KB,key_format=u," "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," @@ -1171,7 +1191,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "prefix_compression_min=4,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=75,value_format=u," "version=(major=0,minor=0)", - confchk_file_meta, 37 + confchk_file_meta, 38 }, { "index.meta", "app_metadata=,collator=,columns=,extractor=,immutable=false," @@ -1183,18 +1203,19 @@ static const WT_CONFIG_ENTRY config_entries[] = { "block_compressor=,cache_resident=false,checksum=uncompressed," "chunks=,collator=,columns=,dictionary=0,encryption=(keyid=," 
"name=),format=btree,huffman_key=,huffman_value=," - "internal_item_max=0,internal_key_max=0," - "internal_key_truncate=true,internal_page_max=4KB,key_format=u," - "key_gap=10,last=,leaf_item_max=0,leaf_key_max=0," - "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=true)," - "lsm=(auto_throttle=true,bloom=true,bloom_bit_count=16," - "bloom_config=,bloom_hash_count=8,bloom_oldest=false," - "chunk_count_limit=0,chunk_max=5GB,chunk_size=10MB,merge_max=15," - "merge_min=0),memory_page_max=5MB,old_chunks=," - "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," - "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,value_format=u", - confchk_lsm_meta, 37 + "ignore_in_memory_cache_size=false,internal_item_max=0," + "internal_key_max=0,internal_key_truncate=true," + "internal_page_max=4KB,key_format=u,key_gap=10,last=," + "leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," + "leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true," + "bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8," + "bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB," + "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," + "old_chunks=,os_cache_dirty_max=0,os_cache_max=0," + "prefix_compression=false,prefix_compression_min=4," + "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + "value_format=u", + confchk_lsm_meta, 38 }, { "table.meta", "app_metadata=,colgroups=,collator=,columns=,key_format=u," diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 0951fd4e58c..04c29e957a3 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -789,14 +789,17 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn) return (&conn->extension_api); } +#ifdef HAVE_BUILTIN_EXTENSION_LZ4 + extern int lz4_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); +#endif #ifdef HAVE_BUILTIN_EXTENSION_SNAPPY extern int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); #endif #ifdef HAVE_BUILTIN_EXTENSION_ZLIB 
extern int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); #endif -#ifdef HAVE_BUILTIN_EXTENSION_LZ4 - extern int lz4_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD + extern int zstd_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); #endif /* @@ -808,14 +811,17 @@ __conn_load_default_extensions(WT_CONNECTION_IMPL *conn) { WT_UNUSED(conn); +#ifdef HAVE_BUILTIN_EXTENSION_LZ4 + WT_RET(lz4_extension_init(&conn->iface, NULL)); +#endif #ifdef HAVE_BUILTIN_EXTENSION_SNAPPY WT_RET(snappy_extension_init(&conn->iface, NULL)); #endif #ifdef HAVE_BUILTIN_EXTENSION_ZLIB WT_RET(zlib_extension_init(&conn->iface, NULL)); #endif -#ifdef HAVE_BUILTIN_EXTENSION_LZ4 - WT_RET(lz4_extension_init(&conn->iface, NULL)); +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD + WT_RET(zstd_extension_init(&conn->iface, NULL)); #endif return (0); } @@ -1668,32 +1674,60 @@ __conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) if ((ret = __wt_config_subgets( session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - LF_SET(WT_CONN_STAT_FAST); + LF_SET(WT_STAT_TYPE_FAST); ++set; } WT_RET_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "all", &sval)) == 0 && sval.val != 0) { - LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); + LF_SET( + WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); ++set; } WT_RET_NOTFOUND_OK(ret); + if (set > 1) + WT_RET_MSG(session, EINVAL, + "Only one of all, fast, none configuration values should " + "be specified"); + + /* + * Now that we've parsed general statistics categories, process + * sub-categories. + */ + if ((ret = __wt_config_subgets( + session, &cval, "cache_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring cache walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. 
+ */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_CACHE_WALK); + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "tree_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring tree walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + WT_RET_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( session, &cval, "clear", &sval)) == 0 && sval.val != 0) { - if (!LF_ISSET(WT_CONN_STAT_FAST | WT_CONN_STAT_ALL)) + if (!LF_ISSET(WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK)) WT_RET_MSG(session, EINVAL, - "the value \"clear\" can be specified only if " - "either \"all\" or \"fast\" is specified"); - LF_SET(WT_CONN_STAT_CLEAR); + "the value \"clear\" can only be specified if " + "statistics are enabled"); + LF_SET(WT_STAT_CLEAR); } WT_RET_NOTFOUND_OK(ret); - if (set > 1) - WT_RET_MSG(session, EINVAL, - "only one statistics configuration value may be specified"); - /* Configuring statistics clears any existing values. */ conn->stat_flags = flags; @@ -1943,6 +1977,42 @@ __conn_chk_file_system(WT_SESSION_IMPL *session, bool readonly) } /* + * wiredtiger_dummy_session_init -- + * Initialize the connection's dummy session. + */ +static void +wiredtiger_dummy_session_init( + WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler) +{ + WT_SESSION_IMPL *session; + + session = &conn->dummy_session; + + /* + * We use a fake session until we can allocate and initialize the real + * ones. Initialize the necessary fields (unfortunately, the fields we + * initialize have been selected by core dumps, we need to do better). + */ + session->iface.connection = &conn->iface; + session->name = "wiredtiger_open"; + + /* Standard I/O and error handling first. 
*/ + __wt_os_stdio(session); + __wt_event_handler_set(session, event_handler); + + /* Statistics */ + session->stat_bucket = 0; + + /* + * Set the default session's strerror method. If one of the extensions + * being loaded reports an error via the WT_EXTENSION_API strerror + * method, but doesn't supply that method a WT_SESSION handle, we'll + * use the WT_CONNECTION_IMPL's default session and its strerror method. + */ + session->iface.strerror = __wt_session_strerror; +} + +/* * wiredtiger_open -- * Main library entry point: open a new connection to a WiredTiger * database. @@ -2013,21 +2083,11 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q); __wt_spin_unlock(NULL, &__wt_process.spinlock); - session = conn->default_session = &conn->dummy_session; - session->iface.connection = &conn->iface; - session->name = "wiredtiger_open"; - - /* Do standard I/O and error handling first. */ - __wt_os_stdio(session); - __wt_event_handler_set(session, event_handler); - /* - * Set the default session's strerror method. If one of the extensions - * being loaded reports an error via the WT_EXTENSION_API strerror - * method, but doesn't supply that method a WT_SESSION handle, we'll - * use the WT_CONNECTION_IMPL's default session and its strerror method. + * Initialize the fake session used until we can create real sessions. */ - conn->default_session->iface.strerror = __wt_session_strerror; + wiredtiger_dummy_session_init(conn, event_handler); + session = conn->default_session = &conn->dummy_session; /* Basic initialization of the connection structure. 
*/ WT_ERR(__wt_connection_init(conn)); diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 6788b1f7f47..fe5f94ea03d 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -42,47 +42,38 @@ __cache_config_local(WT_SESSION_IMPL *session, bool shared, const char *cfg[]) WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval)); cache->eviction_trigger = (u_int)cval.val; - if (F_ISSET(conn, WT_CONN_IN_MEMORY)) - cache->eviction_checkpoint_target = - cache->eviction_dirty_target = - cache->eviction_dirty_trigger = 100U; - else { - WT_RET(__wt_config_gets( - session, cfg, "eviction_checkpoint_target", &cval)); - cache->eviction_checkpoint_target = (u_int)cval.val; + WT_RET(__wt_config_gets( + session, cfg, "eviction_checkpoint_target", &cval)); + cache->eviction_checkpoint_target = (u_int)cval.val; - WT_RET(__wt_config_gets( - session, cfg, "eviction_dirty_target", &cval)); - cache->eviction_dirty_target = (u_int)cval.val; + WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval)); + cache->eviction_dirty_target = (u_int)cval.val; - /* - * Don't allow the dirty target to be larger than the overall - * target. - */ - if (cache->eviction_dirty_target > cache->eviction_target) - cache->eviction_dirty_target = cache->eviction_target; + /* + * Don't allow the dirty target to be larger than the overall + * target. + */ + if (cache->eviction_dirty_target > cache->eviction_target) + cache->eviction_dirty_target = cache->eviction_target; - /* - * Sanity check the checkpoint target: don't allow a value - * lower than the dirty target. - */ - if (cache->eviction_checkpoint_target > 0 && - cache->eviction_checkpoint_target < - cache->eviction_dirty_target) - cache->eviction_checkpoint_target = - cache->eviction_dirty_target; + /* + * Sanity check the checkpoint target: don't allow a value + * lower than the dirty target. 
+ */ + if (cache->eviction_checkpoint_target > 0 && + cache->eviction_checkpoint_target < cache->eviction_dirty_target) + cache->eviction_checkpoint_target = + cache->eviction_dirty_target; - WT_RET(__wt_config_gets( - session, cfg, "eviction_dirty_trigger", &cval)); - cache->eviction_dirty_trigger = (u_int)cval.val; + WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_trigger", &cval)); + cache->eviction_dirty_trigger = (u_int)cval.val; - /* - * Don't allow the dirty trigger to be larger than the overall - * trigger or we can get stuck with a cache full of dirty data. - */ - if (cache->eviction_dirty_trigger > cache->eviction_trigger) - cache->eviction_dirty_trigger = cache->eviction_trigger; - } + /* + * Don't allow the dirty trigger to be larger than the overall + * trigger or we can get stuck with a cache full of dirty data. + */ + if (cache->eviction_dirty_trigger > cache->eviction_trigger) + cache->eviction_dirty_trigger = cache->eviction_trigger; WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval)); WT_ASSERT(session, cval.val > 0); @@ -192,26 +183,26 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) * get any work done. 
*/ if (cache->eviction_target >= cache->eviction_trigger) - WT_ERR_MSG(session, EINVAL, + WT_RET_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server", + WT_RET(__wt_cond_auto_alloc(session, "cache eviction server", false, 10000, WT_MILLION, &cache->evict_cond)); - WT_ERR(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); - WT_ERR(__wt_spin_init(session, + WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); + WT_RET(__wt_spin_init(session, &cache->evict_queue_lock, "cache eviction queue")); - WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk")); + WT_RET(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk")); if ((ret = __wt_open_internal_session(conn, "evict pass", false, WT_SESSION_NO_DATA_HANDLES, &cache->walk_session)) != 0) - WT_ERR_MSG(NULL, ret, + WT_RET_MSG(NULL, ret, "Failed to create session for eviction walks"); /* Allocate the LRU eviction queue. */ cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) { - WT_ERR(__wt_calloc_def(session, + WT_RET(__wt_calloc_def(session, cache->evict_slots, &cache->evict_queues[i].evict_queue)); - WT_ERR(__wt_spin_init(session, + WT_RET(__wt_spin_init(session, &cache->evict_queues[i].evict_lock, "cache eviction")); } @@ -227,9 +218,6 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_cache_stats_update(session); return (0); - -err: WT_RET(__wt_cache_destroy(session)); - return (ret); } /* diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 451b0cd86f6..1d18c128c5b 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -89,22 +89,36 @@ __ckpt_server(void *arg) */ __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs); - /* Checkpoint the database. */ - WT_ERR(wt_session->checkpoint(wt_session, NULL)); - - /* Reset. 
*/ - if (conn->ckpt_logsize) { - __wt_log_written_reset(session); - conn->ckpt_signalled = false; - - /* - * In case we crossed the log limit during the - * checkpoint and the condition variable was already - * signalled, do a tiny wait to clear it so we don't do - * another checkpoint immediately. - */ - __wt_cond_wait(session, conn->ckpt_cond, 1); - } + /* + * Checkpoint the database if the connection is marked dirty. + * A connection is marked dirty whenever a btree gets marked + * dirty, which reflects upon a change in the database that + * needs to be checkpointed. That said, there can be short + * instances when a btree gets marked dirty and the connection + * is yet to be. We might skip a checkpoint in that short + * instance, which is okay because by the next time we get to + * checkpoint, the connection would have been marked dirty and + * hence the checkpoint will not be skipped this time. + */ + if (conn->modified) { + WT_ERR(wt_session->checkpoint(wt_session, NULL)); + + /* Reset. */ + if (conn->ckpt_logsize) { + __wt_log_written_reset(session); + conn->ckpt_signalled = false; + + /* + * In case we crossed the log limit during the + * checkpoint and the condition variable was + * already signalled, do a tiny wait to clear + * it so we don't do another checkpoint + * immediately. + */ + __wt_cond_wait(session, conn->ckpt_cond, 1); + } + } else + WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); } if (0) { diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 5ff8b7f798b..5104624523b 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -50,21 +50,23 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) /* Statistics. */ __wt_stat_connection_init(conn); - /* Locks. */ + /* Spinlocks.
*/ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); - WT_RET(__wt_spin_init(session, &conn->checkpoint_lock, "checkpoint")); - WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle")); + WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); + WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); - WT_RET(__wt_rwlock_alloc(session, - &conn->hot_backup_lock, "hot backup")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); - WT_RET(__wt_spin_init(session, &conn->metadata_lock, "metadata")); + WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); - WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); - WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); + WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); + WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table); WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); + /* Read-write locks */ + WT_RET(__wt_rwlock_alloc( + session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 2786526c2fa..34743034877 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -839,10 +839,10 @@ __log_server(void *arg) /* Wait until the next event. 
*/ - WT_ERR(__wt_epoch(session, &start)); + __wt_epoch(session, &start); __wt_cond_auto_wait_signal(session, conn->log_cond, did_work, &signalled); - WT_ERR(__wt_epoch(session, &now)); + __wt_epoch(session, &now); timediff = WT_TIMEDIFF_MS(now, start); } diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 66979dfd023..0715a035807 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -130,12 +130,12 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) WT_RET(__wt_config_gets(session, cfg, "statistics_log.json", &cval)); if (cval.val != 0) - FLD_SET(conn->stat_flags, WT_CONN_STAT_JSON); + FLD_SET(conn->stat_flags, WT_STAT_JSON); WT_RET(__wt_config_gets( session, cfg, "statistics_log.on_close", &cval)); if (cval.val != 0) - FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE); + FLD_SET(conn->stat_flags, WT_STAT_ON_CLOSE); /* * We don't allow the log path to be reconfigured for security reasons. @@ -206,7 +206,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) #define WT_TIMESTAMP_JSON_DEFAULT "%Y-%m-%dT%H:%M:%S.000Z" WT_ERR(__wt_config_gets( session, cfg, "statistics_log.timestamp", &cval)); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON) && + if (FLD_ISSET(conn->stat_flags, WT_STAT_JSON) && WT_STRING_MATCH(WT_TIMESTAMP_DEFAULT, cval.str, cval.len)) WT_ERR(__wt_strdup( session, WT_TIMESTAMP_JSON_DEFAULT, &conn->stat_format)); @@ -264,7 +264,7 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) goto err; } - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) { + if (FLD_ISSET(conn->stat_flags, WT_STAT_JSON)) { WT_ERR(__wt_fprintf(session, conn->stat_fs, "{\"version\":\"%s\",\"localTime\":\"%s\"", WIREDTIGER_VERSION_STRING, conn->stat_stamp)); @@ -415,7 +415,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) conn = S2C(session); /* Get the current local time of day. 
*/ - WT_RET(__wt_epoch(session, &ts)); + __wt_epoch(session, &ts); tm = localtime_r(&ts.tv_sec, &_tm); /* Create the logging path name for this time of day. */ @@ -482,7 +482,7 @@ __wt_statlog_log_one(WT_SESSION_IMPL *session) conn = S2C(session); - if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE)) + if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); if (F_ISSET(conn, WT_CONN_SERVER_RUN) && diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 03593f8951a..dba37fa2eb0 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -271,7 +271,7 @@ __sweep_server(void *arg) /* Wait until the next event. */ __wt_cond_wait(session, conn->sweep_cond, conn->sweep_interval * WT_MILLION); - WT_ERR(__wt_seconds(session, &now)); + __wt_seconds(session, &now); WT_STAT_CONN_INCR(session, dh_sweeps); diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index e304cf7b775..9fc466f4c76 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -117,12 +117,12 @@ err: API_END_RET(session, ret); } /* - * __curfile_next_random -- + * __wt_curfile_next_random -- * WT_CURSOR->next method for the btree cursor type when configured with - * next_random. + * next_random. This is exported because it is called directly within LSM. 
*/ -static int -__curfile_next_random(WT_CURSOR *cursor) +int +__wt_curfile_next_random(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -473,7 +473,7 @@ __curfile_create(WT_SESSION_IMPL *session, "column-store objects"); __wt_cursor_set_notsup(cursor); - cursor->next = __curfile_next_random; + cursor->next = __wt_curfile_next_random; cursor->reset = __curfile_reset; WT_ERR(__wt_config_gets_def( diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 700cc366ff0..b36416debe1 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -354,7 +354,7 @@ __curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) */ __wt_conn_stat_init(session); __wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) __wt_stat_connection_clear_all(conn->stats); cst->stats = (int64_t *)&cst->u.conn_stats; @@ -380,7 +380,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, * If we are only getting the size of the file, we don't need to open * the tree. 
*/ - if (F_ISSET(cst, WT_CONN_STAT_SIZE)) { + if (F_ISSET(cst, WT_STAT_TYPE_SIZE)) { filename = uri; if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); @@ -401,7 +401,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, if ((ret = __wt_btree_stat_init(session, cst)) == 0) { __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); __wt_stat_dsrc_aggregate(dhandle->stats, &cst->u.dsrc_stats); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) __wt_stat_dsrc_clear_all(dhandle->stats); __wt_curstat_dsrc_final(cst); } @@ -604,50 +604,79 @@ __wt_curstat_open(WT_SESSION_IMPL *session, if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) == 0) { if ((ret = __wt_config_subgets( session, &cval, "all", &sval)) == 0 && sval.val != 0) { - if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL)) + if (!FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_ALL)) goto config_err; - F_SET(cst, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); + F_SET(cst, WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); } WT_ERR_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - if (F_ISSET(cst, WT_CONN_STAT_ALL)) + if (F_ISSET(cst, WT_STAT_TYPE_ALL)) WT_ERR_MSG(session, EINVAL, - "only one statistics configuration value " - "may be specified"); - F_SET(cst, WT_CONN_STAT_FAST); + "Only one of all, fast, none " + "configuration values should be specified"); + F_SET(cst, WT_STAT_TYPE_FAST); } WT_ERR_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets(session, + &cval, "cache_walk", &sval)) == 0 && sval.val != 0) { + /* + * Configuring cache walk statistics implies fast + * statistics. Keep that knowledge internal for now - + * it may change in the future. 
+ */ + F_SET(cst, WT_STAT_TYPE_CACHE_WALK | WT_STAT_TYPE_FAST); + } + WT_ERR_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets(session, + &cval, "tree_walk", &sval)) == 0 && sval.val != 0) { + /* + * Configuring tree walk statistics implies fast + * statistics. Keep that knowledge internal for now - + * it may change in the future. + */ + F_SET(cst, WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + } + WT_ERR_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( session, &cval, "size", &sval)) == 0 && sval.val != 0) { - if (F_ISSET(cst, WT_CONN_STAT_FAST | WT_CONN_STAT_ALL)) + if (F_ISSET(cst, WT_STAT_TYPE_FAST | WT_STAT_TYPE_ALL)) WT_ERR_MSG(session, EINVAL, - "only one statistics configuration value " - "may be specified"); - F_SET(cst, WT_CONN_STAT_SIZE); + "Only one of all, fast, none " + "configuration values should be specified"); + F_SET(cst, WT_STAT_TYPE_SIZE); } WT_ERR_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "clear", &sval)) == 0 && sval.val != 0) { - if (F_ISSET(cst, WT_CONN_STAT_SIZE)) + if (F_ISSET(cst, WT_STAT_TYPE_SIZE)) WT_ERR_MSG(session, EINVAL, "clear is incompatible with size " "statistics"); - F_SET(cst, WT_CONN_STAT_CLEAR); + F_SET(cst, WT_STAT_CLEAR); } WT_ERR_NOTFOUND_OK(ret); /* If no configuration, use the connection's configuration. */ if (cst->flags == 0) { - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL)) - F_SET(cst, WT_CONN_STAT_ALL); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_FAST)) - F_SET(cst, WT_CONN_STAT_FAST); + if (FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_ALL)) + F_SET(cst, WT_STAT_TYPE_ALL); + if (FLD_ISSET( + conn->stat_flags, WT_STAT_TYPE_CACHE_WALK)) + F_SET(cst, WT_STAT_TYPE_CACHE_WALK); + if (FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_FAST)) + F_SET(cst, WT_STAT_TYPE_FAST); + if (FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_TREE_WALK)) + F_SET(cst, WT_STAT_TYPE_TREE_WALK); } /* If the connection configures clear, so do we. 
*/ - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - F_SET(cst, WT_CONN_STAT_CLEAR); + if (FLD_ISSET(conn->stat_flags, WT_STAT_CLEAR)) + F_SET(cst, WT_STAT_CLEAR); } /* @@ -670,9 +699,9 @@ __wt_curstat_open(WT_SESSION_IMPL *session, /* * Do the initial statistics snapshot: there won't be cursor operations - * to trigger initialization when aggregating statistics for upper-level - * objects like tables, we need to a valid set of statistics when before - * the open returns. + * to trigger initialization with aggregating statistics for upper-level + * objects like tables so we need a valid set of statistics before the + * open returns. */ WT_ERR(__wt_curstat_init(session, uri, other, cst->cfg, cst)); cst->notinitialized = false; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 1b93b27f564..6543d54e90f 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -757,13 +757,36 @@ err: API_END_RET(session, ret); } /* + * __curtable_complete -- + * Return failure if the table is not yet fully created. + */ +static int +__curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table) +{ + WT_DECL_RET; + bool complete; + + if (table->cg_complete) + return (0); + + /* If the table is incomplete, wait on the table lock and recheck. */ + complete = false; + WT_WITH_TABLE_LOCK(session, ret, complete = table->cg_complete); + WT_RET(ret); + if (!complete) + WT_RET_MSG(session, EINVAL, + "'%s' not available until all column groups are created", + table->name); + return (0); +} + +/* * __curtable_open_colgroups -- * Open cursors on column groups for a table cursor. 
*/ static int __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) { - WT_DECL_RET; WT_SESSION_IMPL *session; WT_TABLE *table; WT_CURSOR **cp; @@ -775,21 +798,11 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) cfg_arg[0], cfg_arg[1], "dump=\"\",readonly=0", NULL, NULL }; u_int i; - bool complete; session = (WT_SESSION_IMPL *)ctable->iface.session; table = ctable->table; - /* If the table is incomplete, wait on the table lock and recheck. */ - complete = table->cg_complete; - if (!complete) { - WT_WITH_TABLE_LOCK(session, ret, complete = table->cg_complete); - WT_RET(ret); - } - if (!complete) - WT_RET_MSG(session, EINVAL, - "Can't use '%s' until all column groups are created", - table->name); + WT_RET(__curtable_complete(session, table)); /* completeness check */ WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &ctable->cg_cursors)); @@ -887,6 +900,8 @@ __wt_curtable_open(WT_SESSION_IMPL *session, size = WT_PTRDIFF(columns, tablename); WT_RET(__wt_schema_get_table(session, tablename, size, false, &table)); + WT_RET(__curtable_complete(session, table)); /* completeness check */ + if (table->is_simple) { /* Just return a cursor on the underlying data source. */ ret = __wt_open_cursor(session, diff --git a/src/docs/build-posix.dox b/src/docs/build-posix.dox index 4889bf931c9..3e7f8f37acd 100644 --- a/src/docs/build-posix.dox +++ b/src/docs/build-posix.dox @@ -150,10 +150,14 @@ Configure WiredTiger to support the \c verbose configuration string to Configure WiredTiger for <a href="http://www.zlib.net/">zlib</a> compression; see @ref compression for more information. +@par \c --enable-zstd +Configure WiredTiger for <a href="https://github.com/facebook/zstd">Zstd</a> +compression; see @ref compression for more information. + @par <code>--with-builtins</code> Configure WiredTiger to include support for extensions in the main library. This avoids requiring additional libraries for supported extensions. 
Currently -supported options are \c lz4, \c snappy and \c zlib. +supported options are \c lz4, \c snappy, \c zlib and \c zstd. @par <code>--with-python-prefix</code> Configure WiredTiger to install Python libraries to a non-standard Python diff --git a/src/docs/compression.dox b/src/docs/compression.dox index 0be96835760..74bed5c6f68 100644 --- a/src/docs/compression.dox +++ b/src/docs/compression.dox @@ -1,7 +1,7 @@ /*! @m_page{{c,java},compression,Compressors} This section explains how to configure WiredTiger's builtin support for -the lz4, snappy and zlib compression engines. +the lz4, snappy, zlib and zstd compression engines. @section compression_lz4 Using LZ4 compression @@ -85,11 +85,53 @@ an extension. For example, with the WiredTiger library installed in @snippet ex_all.c Configure zlib extension +The default compression level for the zlib compression is +\c Z_DEFAULT_COMPRESSION (see the zlib documentation for further +information); compression can be configured to other levels using the +additional configuration argument \c compression_level. + +@snippet ex_all.c Configure zlib extension with compression level + Finally, when creating the WiredTiger object, set \c block_compressor to \c zlib: @snippet ex_all.c Create a zlib compressed table +@section compression_zstd Using Zstd compression + +To use the builtin support for Facebook's +<a href="https://github.com/facebook/zstd">Zstd</a> +compression, first check that Zstd is installed in include and library +directories searched by the compiler. Once Zstd is installed, you can +enable Zstd using the \c --enable-zstd option to configure. + +If Zstd is installed in a location not normally searched by the +compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS +to indicate these locations. 
For example, with the Zstd includes and +libraries installed in \c /usr/local/include and \c /usr/local/lib, you +would run configure with the following additional arguments: + +@code +--enable-zstd CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib" +@endcode + +When opening the WiredTiger database, load the Zstd shared library as +an extension. For example, with the WiredTiger library installed in +\c /usr/local/lib, you would use the following extension: + +@snippet ex_all.c Configure zstd extension + +The default compression level for the zstd compression is 3; compression +can be configured to other levels using the additional configuration +argument \c compression_level. + +@snippet ex_all.c Configure zstd extension with compression level + +Finally, when creating the WiredTiger object, set \c block_compressor +to \c zstd: + +@snippet ex_all.c Create a zstd compressed table + @section compression_upgrading Upgrading compression engines WiredTiger does not store information with file blocks to identify the diff --git a/src/docs/spell.ok b/src/docs/spell.ok index a2ef7658ec6..4b1337f84b8 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -95,6 +95,7 @@ WiredTigerStat WiredTigerTestCase Yann Za +Zstd aR abstime ack'ed @@ -507,3 +508,4 @@ xa yieldcpu zlib zseries +zstd diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index a49d0d9f871..83aadf8a776 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -150,33 +150,27 @@ number of async worker threads @par checkpoint_interval (unsigned int, default=120) checkpoint every interval seconds during the workload phase.
@par checkpoint_stress_rate (unsigned int, default=0) -checkpoint every rate operations during the populate phase in the -populate thread(s), 0 to disable +checkpoint every rate operations during the populate phase in the populate thread(s), 0 to disable @par checkpoint_threads (unsigned int, default=0) number of checkpoint threads -@par conn_config (string, default=create) +@par conn_config (string, default="create") connection configuration string +@par close_conn (boolean, default=true) +properly close connection at end of test. Setting to false does not sync data to disk and can result in lost data after test exits. @par compact (boolean, default=false) post-populate compact for LSM merging activity -@par compression (string, default=none) -compression extension. Allowed configuration values are: 'none', -'lz4', 'snappy', 'zlib' +@par compression (string, default="none") +compression extension. Allowed configuration values are: 'none', 'lz4', 'snappy', 'zlib', 'zstd' @par create (boolean, default=true) do population phase; false to use existing database @par database_count (unsigned int, default=1) -number of WiredTiger databases to use. Each database will execute the -workload using a separate home directory and complete set of worker -threads -@par drop_tables (unsigned int, default=0) -Whether to drop all tables at the end of the run, and report time -taken to do the drop. +number of WiredTiger databases to use. Each database will execute the workload using a separate home directory and complete set of worker threads +@par drop_tables (boolean, default=false) +Whether to drop all tables at the end of the run, and report time taken to do the drop. @par icount (unsigned int, default=5000) -number of records to initially populate. If multiple tables are -configured the count is spread evenly across all tables. +number of records to initially populate. If multiple tables are configured the count is spread evenly across all tables. 
@par idle_table_cycle (unsigned int, default=0) -Enable regular create and drop of idle tables, value is the maximum -number of seconds a create or drop is allowed before flagging an -error. Default 0 which means disabled. +Enable regular create and drop of idle tables, value is the maximum number of seconds a create or drop is allowed before flagging an error. Default 0 which means disabled. @par index (boolean, default=false) Whether to create an index on the value field. @par insert_rmw (boolean, default=false) @@ -188,28 +182,21 @@ perform partial logging on first table only. @par log_like_table (boolean, default=false) Append all modification operations to another shared table. @par min_throughput (unsigned int, default=0) -notify if any throughput measured is less than this amount. Aborts or -prints warning based on min_throughput_fatal setting. Requires -sample_interval to be configured +notify if any throughput measured is less than this amount. Aborts or prints warning based on min_throughput_fatal setting. Requires sample_interval to be configured @par min_throughput_fatal (boolean, default=false) print warning (false) or abort (true) of min_throughput failure. @par max_latency (unsigned int, default=0) -notify if any latency measured exceeds this number of -milliseconds.Aborts or prints warning based on min_throughput_fatal -setting. Requires sample_interval to be configured +notify if any latency measured exceeds this number of milliseconds. Aborts or prints warning based on min_throughput_fatal setting. Requires sample_interval to be configured @par max_latency_fatal (boolean, default=false) print warning (false) or abort (true) of max_latency failure. @par pareto (unsigned int, default=0) -use pareto distribution for random numbers. Zero to disable, otherwise -a percentage indicating how aggressive the distribution should be. +use pareto distribution for random numbers. 
Zero to disable, otherwise a percentage indicating how aggressive the distribution should be. @par populate_ops_per_txn (unsigned int, default=0) -number of operations to group into each transaction in the populate -phase, zero for auto-commit +number of operations to group into each transaction in the populate phase, zero for auto-commit @par populate_threads (unsigned int, default=1) number of populate threads, 1 for bulk load @par random_range (unsigned int, default=0) -if non zero choose a value from within this range as the key for -insert operations +if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value @par range_partition (boolean, default=false) @@ -217,9 +204,7 @@ partition data by range (vs hash) @par read_range (unsigned int, default=0) scan a range of keys after each search @par readonly (boolean, default=false) -reopen the connection between populate and workload phases in readonly -mode. Requires reopen_connection turned on (default). Requires that -read be the only workload specified +reopen the connection between populate and workload phases in readonly mode. Requires reopen_connection turned on (default). Requires that read be the only workload specified @par reopen_connection (boolean, default=true) close and reopen the connection between populate and workload phases @par report_interval (unsigned int, default=2) @@ -231,40 +216,22 @@ total workload seconds @par sample_interval (unsigned int, default=0) performance logging every interval seconds, 0 to disable @par sample_rate (unsigned int, default=50) -how often the latency of operations is measured. One for every -operation,two for every second operation, three for every third -operation etc. -@par sess_config (string, default=) +how often the latency of operations is measured. One for every operation, two for every second operation, three for every third operation etc. 
+@par sess_config (string, default="") session configuration string @par session_count_idle (unsigned int, default=0) number of idle sessions to create. Default 0. -@par table_config (string, default=key_format=S,value_format=S,type=lsm,exclusive=true,allocation_size=4kb,internal_page_max=64kb,leaf_page_max=4kb,split_pct=100) +@par table_config (string, default="key_format=S,value_format=S,type=lsm,exclusive=true, allocation_size=4kb,internal_page_max=64kb,leaf_page_max=4kb, split_pct=100") table configuration string @par table_count (unsigned int, default=1) -number of tables to run operations over. Keys are divided evenly over -the tables. Cursors are held open on all tables. Default 1, maximum -99999. +number of tables to run operations over. Keys are divided evenly over the tables. Cursors are held open on all tables. Default 1, maximum 99999. @par table_count_idle (unsigned int, default=0) number of tables to create, that won't be populated. Default 0. -@par threads (string, default=) -workload configuration: each 'count' entry is the total number of -threads, and the 'insert', 'read' and 'update' entries are the ratios -of insert, read and update operations done by each worker thread; If a -throttle value is provided each thread will do a maximum of that -number of operations per second; multiple workload configurations may -be specified per threads configuration; for example, a more complex -threads configuration might be -'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' -which would create 2 threads doing nothing but reads and 8 threads -each doing 50% inserts and 25% reads and updates. Allowed -configuration values are 'count', 'throttle', 'update_delta', 'reads', -'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. 
-There are also behavior modifiers, supported modifiers are -'ops_per_txn' -@par transaction_config (string, default=) -transaction configuration string, relevant when populate_opts_per_txn -is nonzero -@par table_name (string, default=test) +@par threads (string, default="") +workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' +@par transaction_config (string, default="") +WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero +@par table_name (string, default="test") table name @par truncate_single_ops (boolean, default=false) Implement truncate via cursor remove instead of session API diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index acc81f566a5..6c99f3a13dc 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -233,10 +233,10 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) #ifdef HAVE_DIAGNOSTIC /* - * Ensure the cache stuck timer is initialized when starting eviction + * Ensure the cache stuck timer is initialized when starting eviction. 
*/ if (thread->id == 0) - WT_ERR(__wt_epoch(session, &cache->stuck_ts)); + __wt_epoch(session, &cache->stuck_ts); #endif while (F_ISSET(conn, WT_CONN_EVICTION_RUN) && @@ -350,10 +350,10 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; #ifdef HAVE_DIAGNOSTIC - WT_RET(__wt_epoch(session, &cache->stuck_ts)); + __wt_epoch(session, &cache->stuck_ts); } else { /* After being stuck for 5 minutes, give up. */ - WT_RET(__wt_epoch(session, &now)); + __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { ret = ETIMEDOUT; __wt_err(session, ret, @@ -465,16 +465,16 @@ __evict_update_work(WT_SESSION_IMPL *session) */ bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_CLEAN); if (__wt_eviction_clean_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); + else if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) + F_SET(cache, WT_CACHE_EVICT_CLEAN); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); - if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_DIRTY); if (__wt_eviction_dirty_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); + else if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) + F_SET(cache, WT_CACHE_EVICT_DIRTY); /* * If application threads are blocked by the total volume of data in @@ -506,12 +506,6 @@ __evict_update_work(WT_SESSION_IMPL *session) F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); } - /* If threads are blocked by eviction we should be looking for pages. 
*/ - WT_ASSERT(session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD) || - F_ISSET(cache, WT_CACHE_EVICT_CLEAN)); - WT_ASSERT(session, !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD) || - F_ISSET(cache, WT_CACHE_EVICT_DIRTY)); - WT_STAT_CONN_SET(session, cache_eviction_state, F_MASK(cache, WT_CACHE_EVICT_MASK)); @@ -543,7 +537,7 @@ __evict_pass(WT_SESSION_IMPL *session) /* Evict pages from the cache. */ for (loop = 0; cache->pass_intr == 0; loop++) { - WT_RET(__wt_epoch(session, &now)); + __wt_epoch(session, &now); if (loop == 0) prev = now; @@ -554,6 +548,7 @@ __evict_pass(WT_SESSION_IMPL *session) * does need to do some work. */ __wt_cache_read_gen_incr(session); + ++cache->evict_pass_gen; /* * Update the oldest ID: we use it to decide whether pages are @@ -895,12 +890,11 @@ __evict_lru_walk(WT_SESSION_IMPL *session) /* Fill the next queue (that isn't the urgent queue). */ queue = cache->evict_fill_queue; other_queue = cache->evict_queues + (1 - (queue - cache->evict_queues)); + cache->evict_fill_queue = other_queue; /* If this queue is full, try the other one. */ if (__evict_queue_full(queue) && !__evict_queue_full(other_queue)) queue = other_queue; - cache->evict_fill_queue = - &cache->evict_queues[1 - (queue - cache->evict_queues)]; /* * If both queues are full and haven't been empty on recent refills, @@ -1062,7 +1056,7 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int max_entries, retries, slot, start_slot, spins; + u_int max_entries, retries, slot, spins, start_slot, total_candidates; bool dhandle_locked, incr; conn = S2C(session); @@ -1079,6 +1073,14 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) start_slot = slot = queue->evict_entries; max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots); + /* + * Another pathological case: if there are only a tiny number of + * candidate pages in cache, don't put all of them on one queue. 
+ */ + total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ? + __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf); + max_entries = WT_MIN(max_entries, 1 + total_candidates / 2); + retry: while (slot < max_entries) { /* * If another thread is waiting on the eviction server to clear @@ -1282,8 +1284,8 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, - WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) +__evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, + u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; @@ -1410,6 +1412,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, page = ref->page; modified = __wt_page_is_modified(page); + page->evict_pass_gen = cache->evict_pass_gen; /* * Use the EVICT_LRU flag to avoid putting pages onto the list @@ -1508,19 +1511,22 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_walk_period = 0; /* - * If we happen to end up on the root page, clear it. We have to track - * hazard pointers, and the root page complicates that calculation. + * If we happen to end up on the root page or a page requiring urgent + * eviction, clear it. We have to track hazard pointers, and the root + * page complicates that calculation. * * Likewise if we found no new candidates during the walk: there is no - * point keeping a page pinned, since it may be the only candidate in an - * idle tree. + * point keeping a page pinned, since it may be the only candidate in + * an idle tree. * * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ if ((ref = btree->evict_ref) != NULL) { /* Give up the walk occasionally. 
*/ - if (__wt_ref_is_root(ref) || evict == start || give_up) + if (__wt_ref_is_root(ref) || evict == start || give_up || + ref->page->read_gen == WT_READGEN_OLDEST || + ref->page->memory_footprint >= btree->splitmempage) WT_RET(__evict_clear_walk(session, restarts == 0)); else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( @@ -1543,17 +1549,17 @@ __evict_get_ref( WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp) { WT_CACHE *cache; - WT_DECL_RET; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue; uint32_t candidates; - bool is_app, urgent_ok; + bool is_app, server_only, urgent_ok; cache = S2C(session)->cache; is_app = !F_ISSET(session, WT_SESSION_INTERNAL); + server_only = is_server && !WT_EVICT_HAS_WORKERS(session); urgent_ok = (!is_app && !is_server) || !WT_EVICT_HAS_WORKERS(session) || - __wt_cache_aggressive(session); + (is_app && __wt_cache_aggressive(session)); urgent_queue = cache->evict_urgent_queue; *btreep = NULL; *refp = NULL; @@ -1569,7 +1575,8 @@ __evict_get_ref( } /* - * The server repopulates whenever the other queue is not full. + * The server repopulates whenever the other queue is not full, as long + * as at least one page has been evicted out of the current queue. * * Note that there are pathological cases where there are only enough * eviction candidates in the cache to fill one queue. In that case, @@ -1577,18 +1584,14 @@ __evict_get_ref( * Such cases are extremely rare in real applications. 
*/ if (is_server && + (!urgent_ok || __evict_queue_empty(urgent_queue, false)) && + !__evict_queue_full(cache->evict_current_queue) && + !__evict_queue_full(cache->evict_fill_queue) && (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF || - __evict_queue_empty(cache->evict_fill_queue, false))) { - while ((ret = __wt_spin_trylock( - session, &cache->evict_queue_lock)) == EBUSY) - if ((!urgent_ok || - __evict_queue_empty(urgent_queue, false)) && - !__evict_queue_full(cache->evict_fill_queue)) - return (WT_NOTFOUND); + __evict_queue_empty(cache->evict_fill_queue, false))) + return (WT_NOTFOUND); - WT_RET(ret); - } else - __wt_spin_lock(session, &cache->evict_queue_lock); + __wt_spin_lock(session, &cache->evict_queue_lock); /* Check the urgent queue first. */ if (urgent_ok && !__evict_queue_empty(urgent_queue, false)) @@ -1596,17 +1599,15 @@ __evict_get_ref( else { /* * Check if the current queue needs to change. - * The current queue could have changed while we waited for - * the lock. * * The server will only evict half of the pages before looking - * for more. The remainder are left to eviction workers (if any - * configured), or application threads if necessary. + * for more, but should only switch queues if there are no + * other eviction workers. 
*/ queue = cache->evict_current_queue; other_queue = cache->evict_other_queue; - if (__evict_queue_empty(queue, is_server) && - !__evict_queue_empty(other_queue, is_server)) { + if (__evict_queue_empty(queue, server_only) && + !__evict_queue_empty(other_queue, server_only)) { cache->evict_current_queue = other_queue; cache->evict_other_queue = queue; } @@ -1715,15 +1716,19 @@ __evict_get_ref( static int __evict_page(WT_SESSION_IMPL *session, bool is_server) { + struct timespec enter, leave; WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; + bool app_timer; WT_RET(__evict_get_ref(session, is_server, &btree, &ref)); WT_ASSERT(session, ref->state == WT_REF_LOCKED); + app_timer = false; cache = S2C(session)->cache; + /* * An internal session flags either the server itself or an eviction * worker thread. @@ -1739,6 +1744,10 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) WT_STAT_CONN_INCR(session, cache_eviction_app_dirty); WT_STAT_CONN_INCR(session, cache_eviction_app); cache->app_evicts++; + if (WT_STAT_ENABLED(session)) { + app_timer = true; + __wt_epoch(session, &enter); + } } /* @@ -1756,6 +1765,11 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) (void)__wt_atomic_subv32(&btree->evict_busy, 1); + if (app_timer) { + __wt_epoch(session, &leave); + WT_STAT_CONN_INCRV(session, + application_evict_time, WT_TIMEDIFF_US(leave, enter)); + } return (ret); } @@ -1767,6 +1781,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) { + struct timespec enter, leave; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -1792,9 +1807,11 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) /* Wake the eviction server if we need to do work. */ __wt_evict_server_wake(session); - init_evict_count = cache->pages_evict; + /* Track how long application threads spend doing eviction. 
*/ + if (WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL)) + __wt_epoch(session, &enter); - for (;;) { + for (init_evict_count = cache->pages_evict;; ret = 0) { /* * A pathological case: if we're the oldest transaction in the * system and the eviction server is stuck trying to find space, @@ -1804,7 +1821,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session)) { --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_fail_cache); - return (WT_ROLLBACK); + WT_ERR(WT_ROLLBACK); } /* @@ -1816,7 +1833,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) * limit the work to 5 evictions and return. If that's not the * case, we can do more. */ - if (!busy && txn_state->snap_min != WT_TXN_NONE && + if (!busy && txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) busy = true; max_pages_evicted = busy ? 5 : 20; @@ -1825,7 +1842,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) if (!__wt_eviction_needed(session, busy, &pct_full) || (pct_full < 100 && cache->pages_evict > init_evict_count + max_pages_evicted)) - return (0); + break; /* * Don't make application threads participate in scrubbing for @@ -1842,7 +1859,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) switch (ret = __evict_page(session, false)) { case 0: if (busy) - return (0); + goto err; /* FALLTHROUGH */ case EBUSY: break; @@ -1853,9 +1870,18 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) cache->app_waits++; break; default: - return (ret); + goto err; } } + +err: if (WT_STAT_ENABLED(session) && + !F_ISSET(session, WT_SESSION_INTERNAL)) { + __wt_epoch(session, &leave); + WT_STAT_CONN_INCRV(session, + application_cache_time, WT_TIMEDIFF_US(leave, enter)); + } + + return (ret); /* NOTREACHED */ } diff --git a/src/evict/evict_page.c 
b/src/evict/evict_page.c index 092f80cc000..3d1557e027e 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -31,23 +31,14 @@ __evict_exclusive_clear(WT_SESSION_IMPL *session, WT_REF *ref) static inline int __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref) { - int loops; - WT_ASSERT(session, ref->state == WT_REF_LOCKED); /* * Check for a hazard pointer indicating another thread is using the * page, meaning the page cannot be evicted. */ - for (loops = 0; loops < 10; loops++) { - if (__wt_page_hazard_check(session, ref->page) == NULL) - return (0); - if (ref->page->read_gen != WT_READGEN_OLDEST && - ref->page->memory_footprint < - S2BT(session)->split_deepen_min_child) - break; - __wt_sleep(0, WT_THOUSAND); - } + if (__wt_page_hazard_check(session, ref->page) == NULL) + return (0); WT_STAT_DATA_INCR(session, cache_eviction_hazard); WT_STAT_CONN_INCR(session, cache_eviction_hazard); diff --git a/src/evict/evict_stat.c b/src/evict/evict_stat.c new file mode 100644 index 00000000000..2dd3b1e83a0 --- /dev/null +++ b/src/evict/evict_stat.c @@ -0,0 +1,138 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * __evict_stat_walk -- + * Walk all the pages in cache for a dhandle gathering stats information + */ +static void +__evict_stat_walk(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_REF *next_walk; + uint64_t dsk_size, gen_gap, size; + uint64_t written_size_cnt, written_size_sum; + uint64_t gen_gap_cnt, gen_gap_max, gen_gap_sum; + uint64_t max_pagesize, min_written_size; + uint64_t num_memory, num_queued, num_not_queueable, num_smaller_allocsz; + uint64_t pages_clean, pages_dirty, pages_internal, pages_leaf; + uint64_t seen_count, walk_count; + + btree = S2BT(session); + next_walk = NULL; + written_size_cnt = written_size_sum = 0; + gen_gap_cnt = gen_gap_max = gen_gap_sum = 0; + max_pagesize = 0; + num_memory = num_queued = num_not_queueable = num_smaller_allocsz = 0; + pages_clean = pages_dirty = pages_internal = pages_leaf = 0; + seen_count = walk_count = 0; + min_written_size = UINT64_MAX; + + while (__wt_tree_walk_count(session, &next_walk, &walk_count, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && + next_walk != NULL) { + ++seen_count; + page = next_walk->page; + size = page->memory_footprint; + + if (__wt_page_is_modified(page)) + ++pages_dirty; + else + ++pages_clean; + + if (!__wt_ref_is_root(next_walk) && + !__wt_page_can_evict(session, next_walk, NULL)) + ++num_not_queueable; + + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + ++num_queued; + + if (size > max_pagesize) + max_pagesize = size; + + dsk_size = page->dsk != NULL ? 
page->dsk->mem_size : 0; + if (dsk_size != 0) { + if (dsk_size < btree->allocsize) + ++num_smaller_allocsz; + if (dsk_size < min_written_size) + min_written_size = dsk_size; + ++written_size_cnt; + written_size_sum += dsk_size; + } else + ++num_memory; + + if (WT_PAGE_IS_INTERNAL(page)) + ++pages_internal; + else + ++pages_leaf; + + /* Skip root pages since they are never considered */ + if (__wt_ref_is_root(next_walk)) + continue; + + gen_gap = + S2C(session)->cache->evict_pass_gen - page->evict_pass_gen; + if (gen_gap > gen_gap_max) + gen_gap_max = gen_gap; + gen_gap_sum += gen_gap; + ++gen_gap_cnt; + } + + WT_STAT_DATA_SET(session, cache_state_avg_written_size, + written_size_cnt == 0 ? 0 : written_size_sum / written_size_cnt); + WT_STAT_DATA_SET(session, cache_state_gen_avg_gap, + gen_gap_cnt == 0 ? 0 : gen_gap_sum / gen_gap_cnt); + + WT_STAT_DATA_SET(session, cache_state_gen_max_gap, gen_gap_max); + WT_STAT_DATA_SET(session, cache_state_max_pagesize, max_pagesize); + WT_STAT_DATA_SET(session, + cache_state_min_written_size, min_written_size); + WT_STAT_DATA_SET(session, cache_state_memory, num_memory); + WT_STAT_DATA_SET(session, cache_state_queued, num_queued); + WT_STAT_DATA_SET(session, cache_state_not_queueable, num_not_queueable); + WT_STAT_DATA_SET(session, + cache_state_smaller_alloc_size, num_smaller_allocsz); + WT_STAT_DATA_SET(session, cache_state_pages, walk_count); + WT_STAT_DATA_SET(session, cache_state_pages_clean, pages_clean); + WT_STAT_DATA_SET(session, cache_state_pages_dirty, pages_dirty); + WT_STAT_DATA_SET(session, cache_state_pages_internal, pages_internal); + WT_STAT_DATA_SET(session, cache_state_pages_leaf, pages_leaf); + WT_STAT_DATA_SET(session, + cache_state_refs_skipped, walk_count - seen_count); +} + +/* + * __wt_curstat_cache_walk -- + * Initialize the statistics for a cache cache_walk pass. 
+ */ +void +__wt_curstat_cache_walk(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + WT_PAGE_INDEX *root_idx; + + btree = S2BT(session); + conn = S2C(session); + + /* Set statistics that don't require walking the cache. */ + WT_STAT_DATA_SET(session, + cache_state_gen_current, conn->cache->evict_pass_gen); + + /* Root page statistics */ + root_idx = WT_INTL_INDEX_GET_SAFE(btree->root.page); + WT_STAT_DATA_SET(session, + cache_state_root_entries, root_idx->entries); + WT_STAT_DATA_SET(session, + cache_state_root_size, btree->root.page->memory_footprint); + + WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session)); +} diff --git a/src/include/api.h b/src/include/api.h index e1b2f8edaf3..2783d17f825 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -139,7 +139,9 @@ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \ - if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && __wt_cache_full(s)) \ + if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && \ + !F_ISSET((WT_BTREE *)(bt), WT_BTREE_IGNORE_CACHE) && \ + __wt_cache_full(s)) \ WT_ERR(WT_CACHE_FULL); #define JOINABLE_CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ diff --git a/src/include/btmem.h b/src/include/btmem.h index b4ca937e7ed..84c91097a99 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -430,6 +430,8 @@ struct __wt_page_modify { #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ uint8_t rec_result; /* Reconciliation state */ + + uint8_t update_restored; /* Page created by restoring updates */ }; /* @@ -619,6 +621,8 @@ struct __wt_page { #define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; + /* The evict pass generation for the page */ + uint64_t evict_pass_gen; size_t memory_footprint; /* Memory attached to the page */ diff --git a/src/include/btree.h b/src/include/btree.h index 
cfaf59e70e1..713d46ae85f 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -119,7 +119,7 @@ struct __wt_btree { uint64_t last_recno; /* Column-store last record number */ WT_REF root; /* Root page reference */ - int modified; /* If the tree ever modified */ + bool modified; /* If the tree ever modified */ bool bulk_load_ok; /* Bulk-load is a possibility */ WT_BM *bm; /* Block manager reference */ @@ -154,18 +154,19 @@ struct __wt_btree { WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ -#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */ -#define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */ -#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */ -#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */ -#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */ -#define WT_BTREE_NO_RECONCILE 0x04000 /* Allow splits, even with no evict */ -#define WT_BTREE_REBALANCE 0x08000 /* Handle is for rebalance */ -#define WT_BTREE_SALVAGE 0x10000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x20000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x40000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x80000 /* Handle is for verify */ +#define WT_BTREE_BULK 0x000100 /* Bulk-load handle */ +#define WT_BTREE_IGNORE_CACHE 0x000200 /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x000400 /* Cache-resident object */ +#define WT_BTREE_LOOKASIDE 0x000800 /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x001000 /* Disable checkpoints */ +#define WT_BTREE_NO_EVICTION 0x002000 /* Disable eviction */ +#define WT_BTREE_NO_LOGGING 0x004000 /* Disable logging */ +#define WT_BTREE_NO_RECONCILE 0x008000 /* Allow splits, even with no evict */ +#define WT_BTREE_REBALANCE 0x010000 /* Handle is for rebalance */ +#define WT_BTREE_SALVAGE 0x020000 /* Handle is for salvage */ +#define 
WT_BTREE_SKIP_CKPT 0x040000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x080000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x100000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/btree.i b/src/include/btree.i index a9ce4f754a9..daf2eb158c1 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -485,6 +485,38 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_tree_modify_set -- + * Mark the tree dirty. + */ +static inline void +__wt_tree_modify_set(WT_SESSION_IMPL *session) +{ + /* + * Test before setting the dirty flag, it's a hot cache line. + * + * The tree's modified flag is cleared by the checkpoint thread: set it + * and insert a barrier before dirtying the page. (I don't think it's + * a problem if the tree is marked dirty with all the pages clean, it + * might result in an extra checkpoint that doesn't do any work but it + * shouldn't cause problems; regardless, let's play it safe.) + */ + if (!S2BT(session)->modified) { + /* Assert we never dirty a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + + S2BT(session)->modified = true; + WT_FULL_BARRIER(); + } + + /* + * The btree may already be marked dirty while the connection is still + * clean; mark the connection dirty outside the test of the btree state. + */ + if (!S2C(session)->modified) + S2C(session)->modified = true; +} + +/* * __wt_page_modify_clear -- * Clean a modified page. */ @@ -513,22 +545,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Mark the tree dirty (even if the page is already marked dirty), newly * created pages to support "empty" files are dirty, but the file isn't - * marked dirty until there's a real change needing to be written. Test - * before setting the dirty flag, it's a hot cache line. - * - * The tree's modified flag is cleared by the checkpoint thread: set it - * and insert a barrier before dirtying the page. 
(I don't think it's - * a problem if the tree is marked dirty with all the pages clean, it - * might result in an extra checkpoint that doesn't do any work but it - * shouldn't cause problems; regardless, let's play it safe.) + * marked dirty until there's a real change needing to be written. */ - if (S2BT(session)->modified == 0) { - /* Assert we never dirty a checkpoint handle. */ - WT_ASSERT(session, session->dhandle->checkpoint == NULL); - - S2BT(session)->modified = 1; - WT_FULL_BARRIER(); - } + __wt_tree_modify_set(session); __wt_page_only_modify_set(session, page); } @@ -1159,15 +1178,7 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * There is no point doing an in-memory split unless there is a lot of * data in the last skiplist on the page. Split if there are enough * items and the skiplist does not fit within a single disk page. - * - * Rather than scanning the whole list, walk a higher level, which - * gives a sample of the items -- at level 0 we have all the items, at - * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more - * than 30 items and more data than would fit in a disk page, split. */ -#define WT_MIN_SPLIT_DEPTH 2 -#define WT_MIN_SPLIT_COUNT 30 -#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ ins_head = page->type == WT_PAGE_ROW_LEAF ? (page->pg_row_entries == 0 ? @@ -1176,8 +1187,40 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) WT_COL_APPEND(page); if (ins_head == NULL) return (false); + + /* + * In the extreme case, where the page is much larger than the maximum + * size, split as soon as there are 5 items on the page. 
+ */ +#define WT_MAX_SPLIT_COUNT 5 + if (page->memory_footprint > btree->maxleafpage * 2) { + for (count = 0, ins = ins_head->head[0]; + ins != NULL; + ins = ins->next[0]) { + if (++count < WT_MAX_SPLIT_COUNT) + continue; + + WT_STAT_CONN_INCR(session, cache_inmem_splittable); + WT_STAT_DATA_INCR(session, cache_inmem_splittable); + return (true); + } + + return (false); + } + + /* + * Rather than scanning the whole list, walk a higher level, which + * gives a sample of the items -- at level 0 we have all the items, at + * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more + * than 30 items and more data than would fit in a disk page, split. + */ +#define WT_MIN_SPLIT_DEPTH 2 +#define WT_MIN_SPLIT_COUNT 30 +#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ + for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH]; - ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) { + ins != NULL; + ins = ins->next[WT_MIN_SPLIT_DEPTH]) { count += WT_MIN_SPLIT_MULTIPLIER; size += WT_MIN_SPLIT_MULTIPLIER * (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd)); diff --git a/src/include/cache.h b/src/include/cache.h index b24b625aec4..9a2b83b5b57 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -91,6 +91,7 @@ struct __wt_cache { uint64_t read_gen; /* Current page read generation */ uint64_t read_gen_oldest; /* Oldest read generation the eviction * server saw in its last queue load */ + uint64_t evict_pass_gen; /* Number of eviction passes */ /* * Eviction thread information. 
diff --git a/src/include/cache.i b/src/include/cache.i index 4255d04ec37..17ab39e97d2 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -355,7 +355,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) txn_state = WT_SESSION_TXN_STATE(session); busy = busy || txn_state->id != WT_TXN_NONE || session->nhazard > 0 || - (txn_state->snap_min != WT_TXN_NONE && + (txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); /* diff --git a/src/include/connection.h b/src/include/connection.h index e19ad684b24..d7c3bf69686 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -285,13 +285,7 @@ struct __wt_connection_impl { uint64_t ckpt_time_recent; /* Checkpoint time recent/total */ uint64_t ckpt_time_total; -#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ -#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ -#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ -#define WT_CONN_STAT_JSON 0x08 /* output JSON format */ -#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */ -#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */ - uint32_t stat_flags; + uint32_t stat_flags; /* Options declared in flags.py */ /* Connection statistics */ WT_CONNECTION_STATS *stats[WT_COUNTER_SLOTS]; @@ -352,6 +346,12 @@ struct __wt_connection_impl { WT_SESSION_IMPL *meta_ckpt_session;/* Metadata checkpoint session */ + /* + * Is there a data/schema change that needs to be the part of a + * checkpoint. 
+ */ + bool modified; + WT_SESSION_IMPL *sweep_session; /* Handle sweep session */ wt_thread_t sweep_tid; /* Handle sweep thread */ int sweep_tid_set; /* Handle sweep thread set */ diff --git a/src/include/cursor.h b/src/include/cursor.h index f1fa4d193ac..e322a53a65d 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -467,7 +467,7 @@ struct __wt_cursor_stat { uint64_t v; /* Current stats value */ WT_ITEM pv; /* Current stats value (string) */ - /* Uses the same values as WT_CONNECTION::stat_flags field */ + /* Options declared in flags.py, shared by WT_CONNECTION::stat_flags */ uint32_t flags; }; diff --git a/src/include/extern.h b/src/include/extern.h index 5444b2e9f14..79e6405e148 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -282,6 +282,7 @@ extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bo extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_curfile_next_random(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curfile_update_check(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -352,6 +353,7 @@ extern void 
__wt_evict_priority_clear(WT_SESSION_IMPL *session); extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session); extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); @@ -677,7 +679,7 @@ extern uint32_t __wt_log2_int(uint32_t n); extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); -extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -723,7 +725,7 @@ extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_F extern void __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_txn_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_stats_update(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index d2f74d2ffe4..fd94ef0ddf2 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -27,5 +27,5 @@ extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_thread_id(char *buf, size_t buflen); -extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); extern void __wt_yield(void); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 8c2b19056e0..f06ee881ece 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -25,7 +25,7 @@ 
extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_thread_id(char *buf, size_t buflen); -extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern DWORD __wt_getlasterror(void); diff --git a/src/include/flags.h b/src/include/flags.h index 5d718da473d..b0d167525b2 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -70,6 +70,14 @@ #define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 #define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 #define WT_SESSION_SERVER_ASYNC 0x00080000 +#define WT_STAT_CLEAR 0x00000001 +#define WT_STAT_JSON 0x00000002 +#define WT_STAT_ON_CLOSE 0x00000004 +#define WT_STAT_TYPE_ALL 0x00000008 +#define WT_STAT_TYPE_CACHE_WALK 0x00000010 +#define WT_STAT_TYPE_FAST 0x00000020 +#define WT_STAT_TYPE_SIZE 0x00000040 +#define WT_STAT_TYPE_TREE_WALK 0x00000080 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 diff --git a/src/include/lsm.h b/src/include/lsm.h index 2550ca444c1..b433e4c3c44 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -31,6 +31,17 @@ struct __wt_lsm_worker_args { }; /* + * WT_LSM_CURSOR_CHUNK -- + * Iterator struct containing all the LSM cursor access points for a chunk. 
+ */ +struct __wt_lsm_cursor_chunk { + WT_BLOOM *bloom; /* Bloom filter handle for each chunk.*/ + WT_CURSOR *cursor; /* Cursor handle for each chunk. */ + uint64_t count; /* Number of items in chunk */ + uint64_t switch_txn; /* Switch txn for each chunk */ +}; + +/* * WT_CURSOR_LSM -- * An LSM cursor. */ @@ -43,17 +54,12 @@ struct __wt_cursor_lsm { u_int nchunks; /* Number of chunks in the cursor */ u_int nupdates; /* Updates needed (including snapshot isolation checks). */ - WT_BLOOM **blooms; /* Bloom filter handles. */ - size_t bloom_alloc; - - WT_CURSOR **cursors; /* Cursor handles. */ - size_t cursor_alloc; - - WT_CURSOR *current; /* The current cursor for iteration */ + WT_CURSOR *current; /* The current cursor for iteration */ WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */ - uint64_t *switch_txn; /* Switch txn for each chunk */ - size_t txnid_alloc; + WT_LSM_CURSOR_CHUNK **chunks; /* Array of LSM cursor units */ + size_t chunks_alloc; /* Current size iterators array */ + size_t chunks_count; /* Current number of iterators */ u_int update_count; /* Updates performed. */ diff --git a/src/include/misc.i b/src/include/misc.i index f267c7afc91..befd480e085 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -33,16 +33,14 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp) * __wt_seconds -- * Return the seconds since the Epoch. */ -static inline int +static inline void __wt_seconds(WT_SESSION_IMPL *session, time_t *timep) { struct timespec t; - WT_RET(__wt_epoch(session, &t)); + __wt_epoch(session, &t); *timep = t.tv_sec; - - return (0); } /* diff --git a/src/include/mutex.h b/src/include/mutex.h index f0f8173bad4..b736d6ee9fb 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -74,6 +74,16 @@ struct __wt_rwlock { struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { volatile int lock; + + /* + * We track acquisitions and time spent waiting for some locks. 
For + * performance reasons and to make it possible to write generic code + * that tracks statistics for different locks, we store the offset + * of the statistics fields to be updated during lock acquisition. + */ + int16_t stat_count_off; /* acquisitions offset */ + int16_t stat_app_usecs_off; /* waiting application threads offset */ + int16_t stat_int_usecs_off; /* waiting server threads offset */ }; #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ @@ -83,7 +93,17 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { wt_mutex_t lock; - const char *name; /* Statistics: mutex name */ + const char *name; /* Mutex name */ + + /* + * We track acquisitions and time spent waiting for some locks. For + * performance reasons and to make it possible to write generic code + * that tracks statistics for different locks, we store the offset + * of the statistics fields to be updated during lock acquisition. 
+ */ + int16_t stat_count_off; /* acquisitions offset */ + int16_t stat_app_usecs_off; /* waiting application threads offset */ + int16_t stat_int_usecs_off; /* waiting server threads offset */ int8_t initialized; /* Lock initialized, for cleanup */ }; diff --git a/src/include/mutex.i b/src/include/mutex.i index cb1847d9991..a6309e0976b 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -32,6 +32,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) WT_UNUSED(name); t->lock = 0; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; return (0); } @@ -111,6 +112,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) #endif t->name = name; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; t->initialized = 1; WT_UNUSED(session); @@ -255,3 +257,46 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) #error Unknown spinlock type #endif + +/* + * WT_SPIN_INIT_TRACKED -- + * Spinlock initialization, with tracking. + * + * Implemented as a macro so we can pass in a statistics field and convert + * it into a statistics structure array offset. + */ +#define WT_SPIN_INIT_TRACKED(session, t, name) do { \ + WT_RET(__wt_spin_init(session, t, #name)); \ + (t)->stat_count_off = (int16_t)WT_STATS_FIELD_TO_OFFSET( \ + S2C(session)->stats, lock_##name##_count); \ + (t)->stat_app_usecs_off = (int16_t)WT_STATS_FIELD_TO_OFFSET( \ + S2C(session)->stats, lock_##name##_wait_application); \ + (t)->stat_int_usecs_off = (int16_t)WT_STATS_FIELD_TO_OFFSET( \ + S2C(session)->stats, lock_##name##_wait_internal); \ +} while (0) + +/* + * __wt_spin_lock_track -- + * Spinlock acquisition, with tracking. 
+ */ +static inline void +__wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + struct timespec enter, leave; + int64_t **stats; + + if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { + __wt_epoch(session, &enter); + __wt_spin_lock(session, t); + __wt_epoch(session, &leave); + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][t->stat_count_off]++; + if (F_ISSET(session, WT_SESSION_INTERNAL)) + stats[session->stat_bucket][t->stat_int_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + else + stats[session->stat_bucket][t->stat_app_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + } else + __wt_spin_lock(session, t); +} diff --git a/src/include/schema.h b/src/include/schema.h index f93c596e2ca..6a5ce67a867 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -86,11 +86,11 @@ struct __wt_table { if (F_ISSET(session, (flag))) { \ op; \ } else { \ - __wt_spin_lock(session, (lock)); \ + __wt_spin_lock_track(session, lock); \ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ - __wt_spin_unlock(session, (lock)); \ + __wt_spin_unlock(session, lock); \ } \ } while (0) @@ -102,11 +102,11 @@ struct __wt_table { ret = 0; \ if (!F_ISSET(session, (flag)) && \ F_ISSET(session, WT_SESSION_LOCK_NO_WAIT)) { \ - if ((ret = __wt_spin_trylock(session, (lock))) == 0) { \ + if ((ret = __wt_spin_trylock(session, lock)) == 0) { \ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ - __wt_spin_unlock(session, (lock)); \ + __wt_spin_unlock(session, lock); \ } \ } else \ WT_WITH_LOCK_WAIT(session, lock, flag, op); \ diff --git a/src/include/session.h b/src/include/session.h index aa51dae58c4..3f9f495c134 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -147,6 +147,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); + /* Sessions have an associated statistics bucket based on its ID. 
*/ + u_int stat_bucket; /* Statistics bucket offset */ + uint32_t flags; /* diff --git a/src/include/stat.h b/src/include/stat.h index cd0cae16826..d0b0b60585a 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -79,9 +79,9 @@ * those structures regardless of the specific statistic structure we're working * with, by translating statistics structure field names to structure offsets. * - * Translate a statistic's value name to an offset. + * Translate a statistic's value name to an offset in the array. */ -#define WT_STATS_FIELD_TO_SLOT(stats, fld) \ +#define WT_STATS_FIELD_TO_OFFSET(stats, fld) \ (int)(&(stats)[0]->fld - (int64_t *)(stats)[0]) /* @@ -140,38 +140,54 @@ __wt_stats_clear(void *stats_arg, int slot) #define WT_STAT_ENABLED(session) (S2C(session)->stat_flags != 0) #define WT_STAT_READ(stats, fld) \ - __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)) + __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_OFFSET(stats, fld)) #define WT_STAT_WRITE(session, stats, fld, v) do { \ if (WT_STAT_ENABLED(session)) \ (stats)->fld = (int64_t)(v); \ } while (0) -#define WT_STAT_DECRV(session, stats, fld, value) do { \ +#define WT_STAT_DECRV_BASE(session, stat, fld, value) do { \ if (WT_STAT_ENABLED(session)) \ - (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value); \ + (stat)->fld -= (int64_t)(value); \ } while (0) -#define WT_STAT_DECRV_ATOMIC(session, stats, fld, value) do { \ +#define WT_STAT_DECRV_ATOMIC_BASE(session, stat, fld, value) do { \ + if (WT_STAT_ENABLED(session)) \ + __wt_atomic_subi64(&(stat)->fld, (int64_t)(value)); \ +} while (0) +#define WT_STAT_INCRV_BASE(session, stat, fld, value) do { \ + if (WT_STAT_ENABLED(session)) \ + (stat)->fld += (int64_t)(value); \ +} while (0) +#define WT_STAT_INCRV_ATOMIC_BASE(session, stat, fld, value) do { \ if (WT_STAT_ENABLED(session)) \ - __wt_atomic_subi64(&(stats)[WT_STATS_SLOT_ID(session)]->fld, \ - (int64_t)(value)); \ + __wt_atomic_addi64(&(stat)->fld, (int64_t)(value)); \ +} while (0) + 
+#define WT_STAT_DECRV(session, stats, fld, value) do { \ + WT_STAT_DECRV_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ +} while (0) +#define WT_STAT_DECRV_ATOMIC(session, stats, fld, value) do { \ + WT_STAT_DECRV_ATOMIC_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ } while (0) #define WT_STAT_DECR(session, stats, fld) \ WT_STAT_DECRV(session, stats, fld, 1) + #define WT_STAT_INCRV(session, stats, fld, value) do { \ - if (WT_STAT_ENABLED(session)) \ - (stats)[WT_STATS_SLOT_ID(session)]->fld += (int64_t)(value); \ + WT_STAT_INCRV_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ } while (0) #define WT_STAT_INCRV_ATOMIC(session, stats, fld, value) do { \ - if (WT_STAT_ENABLED(session)) \ - __wt_atomic_addi64(&(stats)[WT_STATS_SLOT_ID(session)]->fld, \ - (int64_t)(value)); \ + WT_STAT_INCRV_ATOMIC_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ } while (0) #define WT_STAT_INCR(session, stats, fld) \ WT_STAT_INCRV(session, stats, fld, 1) #define WT_STAT_SET(session, stats, fld, value) do { \ if (WT_STAT_ENABLED(session)) { \ __wt_stats_clear(stats, \ - WT_STATS_FIELD_TO_SLOT(stats, fld)); \ + WT_STATS_FIELD_TO_OFFSET(stats, fld)); \ (stats)[0]->fld = (int64_t)(value); \ } \ } while (0) @@ -179,18 +195,24 @@ __wt_stats_clear(void *stats_arg, int slot) /* * Update connection handle statistics if statistics gathering is enabled. 
*/ -#define WT_STAT_CONN_DECR(session, fld) \ - WT_STAT_DECR(session, S2C(session)->stats, fld) -#define WT_STAT_CONN_DECR_ATOMIC(session, fld) \ - WT_STAT_DECRV_ATOMIC(session, S2C(session)->stats, fld, 1) #define WT_STAT_CONN_DECRV(session, fld, value) \ - WT_STAT_DECRV(session, S2C(session)->stats, fld, value) -#define WT_STAT_CONN_INCR(session, fld) \ - WT_STAT_INCR(session, S2C(session)->stats, fld) -#define WT_STAT_CONN_INCR_ATOMIC(session, fld) \ - WT_STAT_INCRV_ATOMIC(session, S2C(session)->stats, fld, 1) + WT_STAT_DECRV_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, value) +#define WT_STAT_CONN_DECR_ATOMIC(session, fld) \ + WT_STAT_DECRV_ATOMIC_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, 1) +#define WT_STAT_CONN_DECR(session, fld) \ + WT_STAT_CONN_DECRV(session, fld, 1) + #define WT_STAT_CONN_INCRV(session, fld, value) \ - WT_STAT_INCRV(session, S2C(session)->stats, fld, value) + WT_STAT_INCRV_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, value) +#define WT_STAT_CONN_INCR_ATOMIC(session, fld) \ + WT_STAT_INCRV_ATOMIC_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, 1) +#define WT_STAT_CONN_INCR(session, fld) \ + WT_STAT_CONN_INCRV(session, fld, 1) + #define WT_STAT_CONN_SET(session, fld, value) \ WT_STAT_SET(session, S2C(session)->stats, fld, value) @@ -263,6 +285,10 @@ struct __wt_connection_stats { int64_t block_byte_write_checkpoint; int64_t block_map_read; int64_t block_byte_map_read; + int64_t cache_read_app_count; + int64_t cache_read_app_time; + int64_t cache_write_app_count; + int64_t cache_write_app_time; int64_t cache_bytes_image; int64_t cache_bytes_inuse; int64_t cache_bytes_other; @@ -356,6 +382,21 @@ struct __wt_connection_stats { int64_t dh_sweeps; int64_t dh_session_handles; int64_t dh_session_sweeps; + int64_t lock_checkpoint_count; + int64_t lock_checkpoint_wait_application; + int64_t lock_checkpoint_wait_internal; + int64_t lock_handle_list_count; + 
int64_t lock_handle_list_wait_application; + int64_t lock_handle_list_wait_internal; + int64_t lock_metadata_count; + int64_t lock_metadata_wait_application; + int64_t lock_metadata_wait_internal; + int64_t lock_schema_count; + int64_t lock_schema_wait_application; + int64_t lock_schema_wait_internal; + int64_t lock_table_count; + int64_t lock_table_wait_application; + int64_t lock_table_wait_internal; int64_t log_slot_switch_busy; int64_t log_slot_closes; int64_t log_slot_races; @@ -420,6 +461,8 @@ struct __wt_connection_stats { int64_t thread_fsync_active; int64_t thread_read_active; int64_t thread_write_active; + int64_t application_evict_time; + int64_t application_cache_time; int64_t page_busy_blocked; int64_t page_forcible_evict_blocked; int64_t page_locked_blocked; @@ -437,6 +480,7 @@ struct __wt_connection_stats { int64_t txn_checkpoint_scrub_time; int64_t txn_checkpoint_time_total; int64_t txn_checkpoint; + int64_t txn_checkpoint_skipped; int64_t txn_fail_cache; int64_t txn_checkpoint_fsync_post; int64_t txn_checkpoint_fsync_post_duration; @@ -515,6 +559,24 @@ struct __wt_dsrc_stats { int64_t cache_write; int64_t cache_write_restore; int64_t cache_eviction_clean; + int64_t cache_state_gen_avg_gap; + int64_t cache_state_avg_written_size; + int64_t cache_state_pages_clean; + int64_t cache_state_gen_current; + int64_t cache_state_pages_dirty; + int64_t cache_state_root_entries; + int64_t cache_state_pages_internal; + int64_t cache_state_pages_leaf; + int64_t cache_state_gen_max_gap; + int64_t cache_state_max_pagesize; + int64_t cache_state_min_written_size; + int64_t cache_state_smaller_alloc_size; + int64_t cache_state_memory; + int64_t cache_state_queued; + int64_t cache_state_not_queueable; + int64_t cache_state_refs_skipped; + int64_t cache_state_root_size; + int64_t cache_state_pages; int64_t compress_read; int64_t compress_write; int64_t compress_write_fail; diff --git a/src/include/txn.h b/src/include/txn.h index 2e41ae8620d..8128e8e4cc2 100644 --- 
a/src/include/txn.h +++ b/src/include/txn.h @@ -49,9 +49,9 @@ WT_ASSERT((s), (s)->txn.forced_iso > 0); \ (s)->txn.forced_iso--; \ WT_ASSERT((s), txn_state->id == saved_state.id && \ - (txn_state->snap_min == saved_state.snap_min || \ - saved_state.snap_min == WT_TXN_NONE)); \ - txn_state->snap_min = saved_state.snap_min; \ + (txn_state->pinned_id == saved_state.pinned_id || \ + saved_state.pinned_id == WT_TXN_NONE)); \ + txn_state->pinned_id = saved_state.pinned_id; \ } while (0) struct __wt_named_snapshot { @@ -59,14 +59,14 @@ struct __wt_named_snapshot { TAILQ_ENTRY(__wt_named_snapshot) q; - uint64_t snap_min, snap_max; + uint64_t pinned_id, snap_min, snap_max; uint64_t *snapshot; uint32_t snapshot_count; }; struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; - volatile uint64_t snap_min; + volatile uint64_t pinned_id; }; struct __wt_txn_global { diff --git a/src/include/txn.i b/src/include/txn.i index 1a8851a9a2a..cf7e2eafc65 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -309,7 +309,7 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session) * WT_TXN_HAS_SNAPSHOT. */ if (F_ISSET(txn, WT_TXN_RUNNING) && - !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->snap_min == WT_TXN_NONE) + !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->pinned_id == WT_TXN_NONE) WT_RET(__wt_cache_eviction_check(session, false, NULL)); return (0); @@ -480,8 +480,8 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * positioned on a value, it can't be freed. 
*/ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { - if (txn_state->snap_min == WT_TXN_NONE) - txn_state->snap_min = txn_global->last_running; + if (txn_state->pinned_id == WT_TXN_NONE) + txn_state->pinned_id = txn_global->last_running; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) WT_RET(__wt_txn_get_snapshot(session)); diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index f4763a113f1..b6185b4ead6 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -962,8 +962,9 @@ struct __wt_session { * where appropriate (for example\, a cache size statistic is not * cleared\, while the count of cursor insert operations will be * cleared). See @ref statistics for more information., a list\, with - * values chosen from the following options: \c "all"\, \c "fast"\, \c - * "clear"\, \c "size"; default empty.} + * values chosen from the following options: \c "all"\, \c + * "cache_walk"\, \c "fast"\, \c "clear"\, \c "size"\, \c "tree_walk"; + * default empty.} * @config{target, if non-empty\, backup the list of objects; valid only * for a backup data source., a list of strings; default empty.} * @configend @@ -1004,9 +1005,9 @@ struct __wt_session { * @config{block_compressor, configure a compressor for file blocks. * Permitted values are \c "none" or custom compression engine name * created with WT_CONNECTION::add_compressor. If WiredTiger has - * builtin support for \c "snappy"\, \c "lz4" or \c "zlib" compression\, - * these names are also available. See @ref compression for more - * information., a string; default \c none.} + * builtin support for \c "lz4"\, \c "snappy"\, \c "zlib" or \c "zstd" + * compression\, these names are also available. See @ref compression + * for more information., a string; default \c none.} * @config{cache_resident, do not ever evict the object's pages from * cache. 
Not compatible with LSM tables; see @ref * tuning_cache_resident for more information., a boolean flag; default @@ -1069,6 +1070,11 @@ struct __wt_session { * Permitted values are \c "none"\, \c "english"\, \c "utf8<file>" or \c * "utf16<file>". See @ref huffman for more information., a string; * default \c none.} + * @config{ignore_in_memory_cache_size, allow update and insert + * operations to proceed even if the cache is already at capacity. Only + * valid in conjunction with in-memory databases. Should be used with + * caution - this configuration allows WiredTiger to consume memory over + * the configured cache limit., a boolean flag; default \c false.} * @config{immutable, configure the index to be immutable - that is an * index is not changed by any update to a record in the table., a * boolean flag; default \c false.} @@ -1815,14 +1821,13 @@ struct __wt_connection { * default \c 5.} * @config{eviction_dirty_target, perform eviction in worker threads * when the cache contains at least this much dirty content\, expressed - * as a percentage of the total cache size. Ignored if \c in_memory is - * \c true., an integer between 1 and 99; default \c 5.} + * as a percentage of the total cache size., an integer between 1 and + * 99; default \c 5.} * @config{eviction_dirty_trigger, trigger application threads to * perform eviction when the cache contains at least this much dirty * content\, expressed as a percentage of the total cache size. This - * setting only alters behavior if it is lower than eviction_trigger. - * Ignored if \c in_memory is \c true., an integer between 1 and 99; - * default \c 20.} + * setting only alters behavior if it is lower than eviction_trigger., + * an integer between 1 and 99; default \c 20.} * @config{eviction_target, perform eviction in worker threads when the * cache contains at least this much content\, expressed as a percentage * of the total cache size. 
Must be less than \c eviction_trigger., an @@ -1899,8 +1904,9 @@ struct __wt_connection { * reset each time a statistics cursor is used to gather statistics\, as * well as each time statistics are logged using the \c statistics_log * configuration. See @ref statistics for more information., a list\, - * with values chosen from the following options: \c "all"\, \c "fast"\, - * \c "none"\, \c "clear"; default \c none.} + * with values chosen from the following options: \c "all"\, \c + * "cache_walk"\, \c "fast"\, \c "none"\, \c "clear"\, \c "tree_walk"; + * default \c none.} * @config{statistics_log = (, log any statistics the database is * configured to maintain\, to a file. See @ref statistics for more * information. Enabling the statistics log server uses a session from @@ -2281,13 +2287,12 @@ struct __wt_connection { * is \c true., an integer between 0 and 99; default \c 5.} * @config{eviction_dirty_target, perform eviction in worker threads when the * cache contains at least this much dirty content\, expressed as a percentage - * of the total cache size. Ignored if \c in_memory is \c true., an integer - * between 1 and 99; default \c 5.} + * of the total cache size., an integer between 1 and 99; default \c 5.} * @config{eviction_dirty_trigger, trigger application threads to perform * eviction when the cache contains at least this much dirty content\, expressed * as a percentage of the total cache size. This setting only alters behavior - * if it is lower than eviction_trigger. Ignored if \c in_memory is \c true., - * an integer between 1 and 99; default \c 20.} + * if it is lower than eviction_trigger., an integer between 1 and 99; default + * \c 20.} * @config{eviction_target, perform eviction in worker threads when the cache * contains at least this much content\, expressed as a percentage of the total * cache size. 
Must be less than \c eviction_trigger., an integer between 10 @@ -2333,11 +2338,11 @@ struct __wt_connection { * @config{ compressor, configure a compressor for log * records. Permitted values are \c "none" or custom compression engine name * created with WT_CONNECTION::add_compressor. If WiredTiger has builtin - * support for \c "snappy"\, \c "lz4" or \c "zlib" compression\, these names are - * also available. See @ref compression for more information., a string; - * default \c none.} - * @config{ enabled, enable logging - * subsystem., a boolean flag; default \c false.} + * support for \c "lz4"\, \c "snappy"\, \c "zlib" or \c "zstd" compression\, + * these names are also available. See @ref compression for more information., + * a string; default \c none.} + * @config{ enabled, enable + * logging subsystem., a boolean flag; default \c false.} * @config{ file_max, the maximum size of log files., an * integer between 100KB and 2GB; default \c 100MB.} * @config{ path, the name of a directory into which log @@ -2403,8 +2408,9 @@ struct __wt_connection { * statistics are reset each time a statistics cursor is used to gather * statistics\, as well as each time statistics are logged using the \c * statistics_log configuration. See @ref statistics for more information., a - * list\, with values chosen from the following options: \c "all"\, \c "fast"\, - * \c "none"\, \c "clear"; default \c none.} + * list\, with values chosen from the following options: \c "all"\, \c + * "cache_walk"\, \c "fast"\, \c "none"\, \c "clear"\, \c "tree_walk"; default + * \c none.} * @config{statistics_log = (, log any statistics the database is configured to * maintain\, to a file. See @ref statistics for more information. Enabling * the statistics log server uses a session from the configured session_max., a @@ -4274,384 +4280,437 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_BLOCK_MAP_READ 1029 /*! 
block-manager: mapped bytes read */ #define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1030 +/*! cache: application threads page read from disk to cache count */ +#define WT_STAT_CONN_CACHE_READ_APP_COUNT 1031 +/*! cache: application threads page read from disk to cache time (usecs) */ +#define WT_STAT_CONN_CACHE_READ_APP_TIME 1032 +/*! cache: application threads page write from cache to disk count */ +#define WT_STAT_CONN_CACHE_WRITE_APP_COUNT 1033 +/*! cache: application threads page write from cache to disk time (usecs) */ +#define WT_STAT_CONN_CACHE_WRITE_APP_TIME 1034 /*! cache: bytes belonging to page images in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_IMAGE 1031 +#define WT_STAT_CONN_CACHE_BYTES_IMAGE 1035 /*! cache: bytes currently in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INUSE 1032 +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1036 /*! cache: bytes not belonging to page images in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_OTHER 1033 +#define WT_STAT_CONN_CACHE_BYTES_OTHER 1037 /*! cache: bytes read into cache */ -#define WT_STAT_CONN_CACHE_BYTES_READ 1034 +#define WT_STAT_CONN_CACHE_BYTES_READ 1038 /*! cache: bytes written from cache */ -#define WT_STAT_CONN_CACHE_BYTES_WRITE 1035 +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1039 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1036 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1040 /*! cache: eviction calls to get a page */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1037 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1041 /*! cache: eviction calls to get a page found queue empty */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1038 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1042 /*! cache: eviction calls to get a page found queue empty after locking */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1039 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1043 /*! 
cache: eviction currently operating in aggressive mode */ -#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1040 +#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1044 /*! cache: eviction empty score */ -#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1041 +#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1045 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1042 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1046 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1043 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1047 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1044 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1048 /*! * cache: eviction server slept, because we did not make progress with * eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1045 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1049 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1046 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1050 /*! cache: eviction state */ -#define WT_STAT_CONN_CACHE_EVICTION_STATE 1047 +#define WT_STAT_CONN_CACHE_EVICTION_STATE 1051 /*! cache: eviction walks abandoned */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1048 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1052 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1049 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1053 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1050 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1054 /*! cache: files with active eviction walks */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1051 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1055 /*! 
cache: files with new eviction walks started */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1052 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1056 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1053 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1057 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1054 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1058 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1055 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1059 /*! cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1056 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1060 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1057 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1061 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1058 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1062 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1059 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1063 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1060 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1064 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1061 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1065 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1062 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1066 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1063 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1067 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1064 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1068 /*! 
cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1065 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1069 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1066 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1070 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1067 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1071 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1068 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1072 /*! cache: overflow values cached in memory */ -#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1069 +#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1073 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1070 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1074 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1071 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1075 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1072 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1076 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1073 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1077 /*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1074 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1078 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1075 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1079 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1076 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1080 /*! 
cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1077 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1081 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1078 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1082 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1079 +#define WT_STAT_CONN_CACHE_READ 1083 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1080 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1084 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1081 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1085 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1082 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1086 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1083 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1087 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1084 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1088 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1085 +#define WT_STAT_CONN_CACHE_WRITE 1089 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1086 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1090 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1087 +#define WT_STAT_CONN_CACHE_OVERHEAD 1091 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1088 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1092 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1089 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1093 /*! 
cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1090 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1094 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1091 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1095 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1092 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1096 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1093 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1097 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1094 +#define WT_STAT_CONN_COND_AUTO_WAIT 1098 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1095 +#define WT_STAT_CONN_FILE_OPEN 1099 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1096 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1100 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1097 +#define WT_STAT_CONN_MEMORY_FREE 1101 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1098 +#define WT_STAT_CONN_MEMORY_GROW 1102 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1099 +#define WT_STAT_CONN_COND_WAIT 1103 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1100 +#define WT_STAT_CONN_RWLOCK_READ 1104 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1101 +#define WT_STAT_CONN_RWLOCK_WRITE 1105 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1102 +#define WT_STAT_CONN_FSYNC_IO 1106 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1103 +#define WT_STAT_CONN_READ_IO 1107 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1104 +#define WT_STAT_CONN_WRITE_IO 1108 /*! 
cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1105 +#define WT_STAT_CONN_CURSOR_CREATE 1109 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1106 +#define WT_STAT_CONN_CURSOR_INSERT 1110 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1107 +#define WT_STAT_CONN_CURSOR_NEXT 1111 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1108 +#define WT_STAT_CONN_CURSOR_PREV 1112 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1109 +#define WT_STAT_CONN_CURSOR_REMOVE 1113 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1110 +#define WT_STAT_CONN_CURSOR_RESET 1114 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1111 +#define WT_STAT_CONN_CURSOR_RESTART 1115 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1112 +#define WT_STAT_CONN_CURSOR_SEARCH 1116 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1113 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1117 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1114 +#define WT_STAT_CONN_CURSOR_UPDATE 1118 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1115 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1119 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1116 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1120 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1117 +#define WT_STAT_CONN_DH_SWEEP_REF 1121 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1118 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1122 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1119 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1123 /*! 
data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1120 +#define WT_STAT_CONN_DH_SWEEP_TOD 1124 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1121 +#define WT_STAT_CONN_DH_SWEEPS 1125 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1122 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1126 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1123 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1127 +/*! lock: checkpoint lock acquisitions */ +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1128 +/*! lock: checkpoint lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1129 +/*! lock: checkpoint lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1130 +/*! lock: handle-list lock acquisitions */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1131 +/*! lock: handle-list lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1132 +/*! lock: handle-list lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1133 +/*! lock: metadata lock acquisitions */ +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1134 +/*! lock: metadata lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1135 +/*! lock: metadata lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1136 +/*! lock: schema lock acquisitions */ +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1137 +/*! lock: schema lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1138 +/*! lock: schema lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1139 +/*! lock: table lock acquisitions */ +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1140 +/*! 
+ * lock: table lock application thread time waiting for the table lock + * (usecs) + */ +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1141 +/*! + * lock: table lock internal thread time waiting for the table lock + * (usecs) + */ +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1142 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1124 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1143 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1125 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1144 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1126 +#define WT_STAT_CONN_LOG_SLOT_RACES 1145 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1127 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1146 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1128 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1147 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1129 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1148 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1130 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1149 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1131 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1150 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1132 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1151 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1133 +#define WT_STAT_CONN_LOG_FLUSH 1152 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1134 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1153 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1135 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1154 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1136 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1155 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1137 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1156 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1138 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1157 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1139 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1158 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1140 +#define WT_STAT_CONN_LOG_SCANS 1159 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1141 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1160 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1142 +#define WT_STAT_CONN_LOG_WRITE_LSN 1161 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1143 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1162 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1144 +#define WT_STAT_CONN_LOG_SYNC 1163 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1145 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1164 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1146 +#define WT_STAT_CONN_LOG_SYNC_DIR 1165 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1147 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1166 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1148 +#define WT_STAT_CONN_LOG_WRITES 1167 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1149 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1168 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1150 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1169 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1151 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1170 /*! 
log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1152 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1171 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1153 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1172 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1154 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1173 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1155 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1174 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1156 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1175 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1157 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1176 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1158 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1177 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1159 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1178 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1160 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1179 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1161 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1180 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1162 +#define WT_STAT_CONN_REC_PAGES 1181 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1163 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1182 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1164 +#define WT_STAT_CONN_REC_PAGE_DELETE 1183 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1165 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1184 /*! 
reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1166 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1185 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1167 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1186 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1168 +#define WT_STAT_CONN_SESSION_OPEN 1187 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1169 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1188 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1170 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1189 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1171 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1190 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1172 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1191 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1173 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1192 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1174 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1193 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1175 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1194 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1176 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1195 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1177 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1196 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1178 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1197 /*! 
session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1179 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1198 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1180 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1199 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1181 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1200 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1182 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1201 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1183 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1202 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1184 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1203 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1185 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1204 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1186 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1205 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1187 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1206 +/*! thread-yield: application thread time evicting (usecs) */ +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1207 +/*! thread-yield: application thread time waiting for cache (usecs) */ +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1208 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1188 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1209 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1189 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1210 /*! 
thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1190 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1211 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1191 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1212 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1192 +#define WT_STAT_CONN_PAGE_SLEEP 1213 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1193 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1214 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1194 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1215 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1195 +#define WT_STAT_CONN_TXN_BEGIN 1216 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1196 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1217 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1197 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1218 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1198 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1219 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1199 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1220 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1200 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1221 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1201 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1222 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1202 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1223 /*! 
transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1203 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1224 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1204 +#define WT_STAT_CONN_TXN_CHECKPOINT 1225 +/*! + * transaction: transaction checkpoints skipped because database was + * clean + */ +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1226 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1205 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1227 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1206 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1228 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1207 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1229 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1208 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1230 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1209 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1231 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1210 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1232 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1211 +#define WT_STAT_CONN_TXN_SYNC 1233 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1212 +#define WT_STAT_CONN_TXN_COMMIT 1234 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1213 +#define WT_STAT_CONN_TXN_ROLLBACK 1235 /*! 
* @} @@ -4709,28 +4768,28 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); /*! btree: btree checkpoint generation */ #define WT_STAT_DSRC_BTREE_CHECKPOINT_GENERATION 2022 /*! - * btree: column-store fixed-size leaf pages, only reported if - * statistics=all is set + * btree: column-store fixed-size leaf pages, only reported if tree_walk + * or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_FIX 2023 /*! - * btree: column-store internal pages, only reported if statistics=all is - * set + * btree: column-store internal pages, only reported if tree_walk or all + * statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2024 /*! * btree: column-store variable-size RLE encoded values, only reported if - * statistics=all is set + * tree_walk or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_RLE 2025 /*! * btree: column-store variable-size deleted values, only reported if - * statistics=all is set + * tree_walk or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2026 /*! * btree: column-store variable-size leaf pages, only reported if - * statistics=all is set + * tree_walk or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2027 /*! btree: fixed-record size */ @@ -4748,20 +4807,26 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); /*! btree: maximum tree depth */ #define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2034 /*! - * btree: number of key/value pairs, only reported if statistics=all is - * set + * btree: number of key/value pairs, only reported if tree_walk or all + * statistics are enabled */ #define WT_STAT_DSRC_BTREE_ENTRIES 2035 -/*! btree: overflow pages, only reported if statistics=all is set */ +/*! + * btree: overflow pages, only reported if tree_walk or all statistics + * are enabled + */ #define WT_STAT_DSRC_BTREE_OVERFLOW 2036 /*! btree: pages rewritten by compaction */ #define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2037 /*! 
- * btree: row-store internal pages, only reported if statistics=all is - * set + * btree: row-store internal pages, only reported if tree_walk or all + * statistics are enabled */ #define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2038 -/*! btree: row-store leaf pages, only reported if statistics=all is set */ +/*! + * btree: row-store leaf pages, only reported if tree_walk or all + * statistics are enabled + */ #define WT_STAT_DSRC_BTREE_ROW_LEAF 2039 /*! cache: bytes currently in the cache */ #define WT_STAT_DSRC_CACHE_BYTES_INUSE 2040 @@ -4807,87 +4872,179 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2060 /*! cache: unmodified pages evicted */ #define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2061 +/*! + * cache_walk: Average difference between current eviction generation + * when the page was last considered, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2062 +/*! + * cache_walk: Average on-disk page image size seen, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2063 +/*! + * cache_walk: Clean pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2064 +/*! + * cache_walk: Current eviction generation, only reported if cache_walk + * or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2065 +/*! + * cache_walk: Dirty pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2066 +/*! + * cache_walk: Entries in the root page, only reported if cache_walk or + * all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2067 +/*! 
+ * cache_walk: Internal pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2068 +/*! + * cache_walk: Leaf pages currently in cache, only reported if cache_walk + * or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2069 +/*! + * cache_walk: Maximum difference between current eviction generation + * when the page was last considered, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2070 +/*! + * cache_walk: Maximum page size seen, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2071 +/*! + * cache_walk: Minimum on-disk page image size seen, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2072 +/*! + * cache_walk: On-disk page image sizes smaller than a single allocation + * unit, only reported if cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2073 +/*! + * cache_walk: Pages created in memory and never written, only reported + * if cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2074 +/*! + * cache_walk: Pages currently queued for eviction, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2075 +/*! + * cache_walk: Pages that could not be queued for eviction, only reported + * if cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2076 +/*! + * cache_walk: Refs skipped during cache traversal, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2077 +/*! 
+ * cache_walk: Size of the root page, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2078 +/*! + * cache_walk: Total number of pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES 2079 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2062 +#define WT_STAT_DSRC_COMPRESS_READ 2080 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2063 +#define WT_STAT_DSRC_COMPRESS_WRITE 2081 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2064 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2082 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2065 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2083 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2066 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2084 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2067 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2085 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2068 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2086 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2069 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2087 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2070 +#define WT_STAT_DSRC_CURSOR_CREATE 2088 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2071 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2089 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2072 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2090 /*! 
cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2073 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2091 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2074 +#define WT_STAT_DSRC_CURSOR_INSERT 2092 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2075 +#define WT_STAT_DSRC_CURSOR_NEXT 2093 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2076 +#define WT_STAT_DSRC_CURSOR_PREV 2094 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2077 +#define WT_STAT_DSRC_CURSOR_REMOVE 2095 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2078 +#define WT_STAT_DSRC_CURSOR_RESET 2096 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2079 +#define WT_STAT_DSRC_CURSOR_RESTART 2097 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2080 +#define WT_STAT_DSRC_CURSOR_SEARCH 2098 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2081 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2099 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2082 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2100 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2083 +#define WT_STAT_DSRC_CURSOR_UPDATE 2101 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2084 +#define WT_STAT_DSRC_REC_DICTIONARY 2102 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2085 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2103 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2086 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2104 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2087 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2105 /*! 
reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2088 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2106 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2089 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2107 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2090 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2108 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2091 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2109 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2092 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2110 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2093 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2111 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2094 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2112 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2095 +#define WT_STAT_DSRC_REC_PAGES 2113 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2096 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2114 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2097 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2115 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2098 +#define WT_STAT_DSRC_SESSION_COMPACT 2116 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2099 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2117 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2100 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2118 /*! 
* @} diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 4e6699ab9d1..d354757c592 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -216,6 +216,8 @@ struct __wt_logslot; typedef struct __wt_logslot WT_LOGSLOT; struct __wt_lsm_chunk; typedef struct __wt_lsm_chunk WT_LSM_CHUNK; +struct __wt_lsm_cursor_chunk; + typedef struct __wt_lsm_cursor_chunk WT_LSM_CURSOR_CHUNK; struct __wt_lsm_data_source; typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE; struct __wt_lsm_manager; diff --git a/src/log/log.c b/src/log/log.c index b0c789f0f9e..00e4ea5f441 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -128,9 +128,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) "log_force_sync: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); log->sync_dir_lsn = *min_lsn; WT_STAT_CONN_INCR(session, log_sync_dir); @@ -152,9 +152,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, log_fh->name, min_lsn->l.file, min_lsn->l.offset); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); log->sync_lsn = *min_lsn; WT_STAT_CONN_INCR(session, log_sync); @@ -1478,9 +1478,9 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) "/%" PRIu32, log->log_dir_fh->name, sync_lsn.l.file, sync_lsn.l.offset); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, 
&fsync_start); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); log->sync_dir_lsn = sync_lsn; @@ -1500,9 +1500,9 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) log->log_fh->name, sync_lsn.l.file, sync_lsn.l.offset); WT_STAT_CONN_INCR(session, log_sync); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log->log_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); WT_STAT_CONN_INCRV(session, diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index e98f59e7b05..067c527a21a 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -10,7 +10,7 @@ #define WT_FORALL_CURSORS(clsm, c, i) \ for ((i) = (clsm)->nchunks; (i) > 0;) \ - if (((c) = (clsm)->cursors[--i]) != NULL) + if (((c) = (clsm)->chunks[--i]->cursor) != NULL) #define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \ __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp) @@ -18,6 +18,7 @@ static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *); static int __clsm_open_cursors(WT_CURSOR_LSM *, bool, u_int, uint32_t); static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *); +static int __clsm_search_near(WT_CURSOR *cursor, int *exactp); /* * __wt_clsm_request_switch -- @@ -109,7 +110,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) primary = NULL; have_primary = false; } else { - primary = clsm->cursors[clsm->nchunks - 1]; + primary = clsm->chunks[clsm->nchunks - 1]->cursor; primary_chunk = clsm->primary_chunk; WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && @@ -165,8 +166,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN 
*txn; - uint64_t *switch_txnp; - uint64_t snap_min; + uint64_t i, pinned_id , switch_txn; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; @@ -226,8 +226,8 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) * that overlaps with our snapshot is a potential * conflict. * - * Note that the global snap_min is correct here: it - * tracks concurrent transactions excluding special + * Note that the pinned ID is correct here: it tracks + * concurrent transactions excluding special * transactions such as checkpoint (which we can't * conflict with because checkpoint only writes the * metadata, which is not an LSM tree). @@ -237,17 +237,18 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); - snap_min = - WT_SESSION_TXN_STATE(session)->snap_min; - for (switch_txnp = - &clsm->switch_txn[clsm->nchunks - 2]; + pinned_id = + WT_SESSION_TXN_STATE(session)->pinned_id; + for (i = clsm->nchunks - 2; clsm->nupdates < clsm->nchunks; - clsm->nupdates++, switch_txnp--) { - if (WT_TXNID_LT(*switch_txnp, snap_min)) + clsm->nupdates++, i--) { + switch_txn = + clsm->chunks[i]->switch_txn; + if (WT_TXNID_LT(switch_txn, pinned_id)) break; WT_ASSERT(session, !__wt_txn_visible_all( - session, *switch_txnp)); + session, switch_txn)); } } } @@ -378,7 +379,7 @@ __clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) WT_CURSOR *c; u_int i; - if (clsm->cursors == NULL || clsm->nchunks == 0) + if (clsm->chunks == NULL || clsm->nchunks == 0) return (0); /* @@ -387,12 +388,12 @@ __clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) * careful with unsigned integer wrapping. 
*/ for (i = start; i < end; i++) { - if ((c = (clsm)->cursors[i]) != NULL) { - clsm->cursors[i] = NULL; + if ((c = (clsm)->chunks[i]->cursor) != NULL) { + clsm->chunks[i]->cursor = NULL; WT_RET(c->close(c)); } - if ((bloom = clsm->blooms[i]) != NULL) { - clsm->blooms[i] = NULL; + if ((bloom = clsm->chunks[i]->bloom) != NULL) { + clsm->chunks[i]->bloom = NULL; WT_RET(__wt_bloom_close(bloom)); } } @@ -401,6 +402,45 @@ __clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) } /* + * __clsm_resize_chunks -- + * Allocates an array of unit objects for each chunk. + */ +static int +__clsm_resize_chunks( + WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int nchunks) +{ + WT_DECL_RET; + WT_LSM_CURSOR_CHUNK *chunk; + + /* Don't allocate more iterators if we don't need them. */ + if (clsm->chunks_count >= nchunks) { + return (ret); + } + + WT_RET(__wt_realloc_def(session, &clsm->chunks_alloc, nchunks, + &clsm->chunks)); + for (; clsm->chunks_count < nchunks; clsm->chunks_count++) { + WT_RET(__wt_calloc_one(session, &chunk)); + clsm->chunks[clsm->chunks_count] = chunk; + } + return (ret); +} + +/* + * __clsm_free_chunks -- + * Allocates an array of unit objects for each chunk. + */ +static void +__clsm_free_chunks(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm) +{ + size_t i; + for (i = 0; i < clsm->chunks_count; i++) { + __wt_free(session, clsm->chunks[i]); + } + __wt_free(session, clsm->chunks); +} + +/* * __clsm_open_cursors -- * Open cursors for the current set of files. 
*/ @@ -409,7 +449,7 @@ __clsm_open_cursors( WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id) { WT_BTREE *btree; - WT_CURSOR *c, **cp, *primary; + WT_CURSOR *c, *cursor, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; @@ -422,6 +462,7 @@ __clsm_open_cursors( bool locked; c = &clsm->iface; + cursor = NULL; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; chunk = NULL; @@ -465,7 +506,7 @@ __clsm_open_cursors( retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; - + WT_ERR(__clsm_resize_chunks(session, clsm, nchunks)); /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. @@ -486,16 +527,13 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; + WT_ERR(__clsm_resize_chunks(session, clsm, nchunks)); /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. */ - if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) - WT_ERR(__wt_realloc_def(session, - &clsm->txnid_alloc, nchunks, - &clsm->switch_txn)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { @@ -504,11 +542,11 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ - for (ngood = nchunks - 1, nupdates = 1; - ngood > 0; + for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; - clsm->switch_txn[ngood - 1] = chunk->switch_txn; + clsm->chunks[ngood - 1]->switch_txn = + chunk->switch_txn; if (__wt_txn_visible_all( session, chunk->switch_txn)) break; @@ -519,21 +557,20 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { } /* Check how many cursors are already open. 
*/ - for (cp = clsm->cursors + ngood; - ngood < clsm->nchunks && ngood < nchunks; - cp++, ngood++) { + for (; ngood < clsm->nchunks && ngood < nchunks; ngood++) { chunk = lsm_tree->chunk[ngood]; + cursor = clsm->chunks[ngood]->cursor; /* If the cursor isn't open yet, we're done. */ - if (*cp == NULL) + if (cursor == NULL) break; /* Easy case: the URIs don't match. */ - if (strcmp((*cp)->uri, chunk->uri) != 0) + if (strcmp(cursor->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ - checkpoint = ((WT_CURSOR_BTREE *)*cp)-> + checkpoint = ((WT_CURSOR_BTREE *)cursor)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && @@ -541,7 +578,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { break; /* Make sure the Bloom config matches. */ - if (clsm->blooms[ngood] == NULL && + if (clsm->chunks[ngood]->bloom == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } @@ -559,7 +596,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ - if (clsm->cursors != NULL && ngood < clsm->nchunks) { + if (clsm->chunks != NULL && ngood < clsm->nchunks) { close_range_start = ngood; close_range_end = clsm->nchunks; } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) { @@ -591,28 +628,23 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { clsm->current = NULL; } - WT_ERR(__wt_realloc_def(session, - &clsm->bloom_alloc, nchunks, &clsm->blooms)); - WT_ERR(__wt_realloc_def(session, - &clsm->cursor_alloc, nchunks, &clsm->cursors)); - clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ - for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { + for (i = ngood; i != nchunks; i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. 
*/ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) - clsm->switch_txn[i] = chunk->switch_txn; + clsm->chunks[i]->switch_txn = chunk->switch_txn; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ - WT_ASSERT(session, *cp == NULL); + WT_ASSERT(session, clsm->chunks[i]->cursor == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? - ckpt_cfg : NULL, cp); + ckpt_cfg : NULL, &clsm->chunks[i]->cursor); /* * XXX kludge: we may have an empty chunk where no checkpoint @@ -620,8 +652,8 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { - ret = __wt_open_cursor( - session, chunk->uri, c, NULL, cp); + ret = __wt_open_cursor(session, + chunk->uri, c, NULL, &clsm->chunks[i]->cursor); if (ret == 0) chunk->empty = 1; } @@ -634,25 +666,31 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * write conflicts with concurrent updates. */ if (i != nchunks - 1) - (*cp)->insert = __wt_curfile_update_check; + clsm->chunks[i]->cursor->insert = + __wt_curfile_update_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, - c, &clsm->blooms[i])); + c, &clsm->chunks[i]->bloom)); /* Child cursors always use overwrite and raw mode. */ - F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); + F_SET(clsm->chunks[i]->cursor, + WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } + /* Setup the count values for each chunk in the chunks*/ + for (i = 0; i != clsm->nchunks; i++) + clsm->chunks[i]->count = lsm_tree->chunk[i]->count; + /* The last chunk is our new primary. 
*/ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { clsm->primary_chunk = chunk; - primary = clsm->cursors[clsm->nchunks - 1]; + primary = clsm->chunks[clsm->nchunks - 1]->cursor; /* * Disable eviction for the in-memory chunk. Also clear the * bulk load flag here, otherwise eviction will be enabled by @@ -672,17 +710,19 @@ err: #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { - for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { + for (i = 0; i != clsm->nchunks; i++) { + cursor = clsm->chunks[i]->cursor; chunk = lsm_tree->chunk[i + start_chunk]; - /* Make sure the cursor is open. */ - WT_ASSERT(session, *cp != NULL); + /* Make sure the first cursor is open. */ + WT_ASSERT(session, cursor != NULL); /* Easy case: the URIs should match. */ - WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); + WT_ASSERT( + session, strcmp(cursor->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. */ - checkpoint = ((WT_CURSOR_BTREE *)*cp)-> + checkpoint = ((WT_CURSOR_BTREE *)cursor)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && @@ -693,7 +733,8 @@ err: WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? - clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); + clsm->chunks[i]->bloom != NULL : + clsm->chunks[i]->bloom == NULL); } } #endif @@ -902,6 +943,96 @@ err: __clsm_leave(clsm); } /* + * __clsm_random_chunk -- + * Pick a chunk at random, weighted by the size of all chunks. Weighting + * proportional to documents avoids biasing towards small chunks. Then return + * the cursor on the chunk we have picked. 
+ */ +static int +__clsm_random_chunk(WT_SESSION_IMPL *session, + WT_CURSOR_LSM *clsm, WT_CURSOR **cursor) +{ + uint64_t checked_docs, i, rand_doc, total_docs; + + /* + * If the tree is empty we cannot do a random lookup, so return a + * WT_NOTFOUND. + */ + if (clsm->nchunks == 0) + return (WT_NOTFOUND); + for (total_docs = i = 0; i < clsm->nchunks; i++) { + total_docs += clsm->chunks[i]->count; + } + if (total_docs == 0) + return (WT_NOTFOUND); + + rand_doc = __wt_random(&session->rnd) % total_docs; + + for (checked_docs = i = 0; i < clsm->nchunks; i++) { + checked_docs += clsm->chunks[i]->count; + if (rand_doc <= checked_docs) { + *cursor = clsm->chunks[i]->cursor; + break; + } + } + return (0); +} + +/* + * __clsm_next_random -- + * WT_CURSOR->next method for the LSM cursor type when configured with + * next_random. + */ +static int +__clsm_next_random(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int exact; + + c = NULL; + clsm = (WT_CURSOR_LSM *)cursor; + + CURSOR_API_CALL(cursor, session, next, NULL); + WT_CURSOR_NOVALUE(cursor); + WT_ERR(__clsm_enter(clsm, false, false)); + + for (;;) { + WT_ERR(__clsm_random_chunk(session, clsm, &c)); + /* + * This call to next_random on the chunk can potentially end in + * WT_NOTFOUND if the chunk we picked is empty. We want to retry + * in that case. + */ + ret = __wt_curfile_next_random(c); + if (ret == WT_NOTFOUND) + continue; + + WT_ERR(ret); + F_SET(cursor, WT_CURSTD_KEY_INT); + WT_ERR(c->get_key(c, &cursor->key)); + /* + * Search near the current key to resolve any tombstones + * and position to a valid document. If we see a + * WT_NOTFOUND here that is valid, as the tree has no + * documents visible to us. + */ + WT_ERR(__clsm_search_near(cursor, &exact)); + break; + } + + /* We have found a valid doc. 
Set that we are now positioned */ + if (0) { +err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + } + __clsm_leave(clsm); + API_END(session, ret); + return (ret); +} + +/* * __clsm_prev -- * WT_CURSOR->prev method for the LSM cursor type. */ @@ -1072,7 +1203,7 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) WT_FORALL_CURSORS(clsm, c, i) { /* If there is a Bloom filter, see if we can skip the read. */ bloom = NULL; - if ((bloom = clsm->blooms[i]) != NULL) { + if ((bloom = clsm->chunks[i]->bloom) != NULL) { if (!have_hash) { __wt_bloom_hash(bloom, &cursor->key, &bhash); have_hash = true; @@ -1259,7 +1390,12 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) */ F_CLR(cursor, WT_CURSTD_KEY_SET); F_SET(cursor, WT_CURSTD_KEY_INT); - if ((ret = cursor->next(cursor)) == 0) { + /* + * We call __clsm_next here as we want to advance + * forward. If we are a random LSM cursor calling next + * on the cursor will not advance as we intend. + */ + if ((ret = __clsm_next(cursor)) == 0) { cmp = 1; deleted = false; } @@ -1268,7 +1404,11 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) } if (deleted) { clsm->current = NULL; - WT_ERR(cursor->prev(cursor)); + /* + * We call prev directly here as cursor->prev may be "invalid" + * if this is a random cursor. + */ + WT_ERR(__clsm_prev(cursor)); cmp = -1; } *exactp = cmp; @@ -1312,7 +1452,7 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, * Clear the existing cursor position. Don't clear the primary cursor: * we're about to use it anyway. */ - primary = clsm->cursors[clsm->nchunks - 1]; + primary = clsm->chunks[clsm->nchunks - 1]->cursor; WT_RET(__clsm_reset_cursors(clsm, primary)); /* If necessary, set the position for future scans. */ @@ -1322,12 +1462,12 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) { /* Check if we need to keep updating old chunks. 
*/ if (i > 0 && - __wt_txn_visible(session, clsm->switch_txn[slot])) { + __wt_txn_visible(session, clsm->chunks[slot]->switch_txn)) { clsm->nupdates = i; break; } - c = clsm->cursors[slot]; + c = clsm->chunks[slot]->cursor; c->set_key(c, key); c->set_value(c, value); WT_RET((position && i == 0) ? c->update(c) : c->insert(c)); @@ -1485,9 +1625,7 @@ __wt_clsm_close(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; CURSOR_API_CALL(cursor, session, close, NULL); WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks)); - __wt_free(session, clsm->blooms); - __wt_free(session, clsm->cursors); - __wt_free(session, clsm->switch_txn); + __clsm_free_chunks(session, clsm); /* In case we were somehow left positioned, clear that. */ __clsm_leave(clsm); @@ -1588,6 +1726,13 @@ __wt_clsm_open(WT_SESSION_IMPL *session, */ clsm->dsk_gen = 0; + /* If the next_random option is set, configure a random cursor */ + WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval)); + if (cval.val != 0) { + __wt_cursor_set_notsup(cursor); + cursor->next = __clsm_next_random; + } + WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp)); if (bulk) diff --git a/src/lsm/lsm_cursor_bulk.c b/src/lsm/lsm_cursor_bulk.c index 607ca0c9705..319426de3f0 100644 --- a/src/lsm/lsm_cursor_bulk.c +++ b/src/lsm/lsm_cursor_bulk.c @@ -28,9 +28,8 @@ __clsm_close_bulk(WT_CURSOR *cursor) session = (WT_SESSION_IMPL *)clsm->iface.session; /* Close the bulk cursor to ensure the chunk is written to disk. 
*/ - bulk_cursor = clsm->cursors[0]; + bulk_cursor = clsm->chunks[0]->cursor; WT_RET(bulk_cursor->close(bulk_cursor)); - clsm->cursors[0] = NULL; clsm->nchunks = 0; /* Set ondisk, and flush the metadata */ @@ -75,7 +74,7 @@ __clsm_insert_bulk(WT_CURSOR *cursor) WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1); ++chunk->count; chunk->size += cursor->key.size + cursor->value.size; - bulk_cursor = *clsm->cursors; + bulk_cursor = clsm->chunks[0]->cursor; bulk_cursor->set_key(bulk_cursor, &cursor->key); bulk_cursor->set_value(bulk_cursor, &cursor->value); WT_RET(bulk_cursor->insert(bulk_cursor)); @@ -124,11 +123,10 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) * for a bloom filter - it makes cleanup simpler. Cleaned up by * cursor close on error. */ - WT_RET(__wt_calloc_one(session, &clsm->blooms)); - clsm->bloom_alloc = 1; - WT_RET(__wt_calloc_one(session, &clsm->cursors)); - clsm->cursor_alloc = 1; - clsm->nchunks = 1; + WT_RET( + __wt_realloc_def(session, &clsm->chunks_alloc, 1, &clsm->chunks)); + WT_RET(__wt_calloc_one(session, &clsm->chunks[0])); + clsm->chunks_count = clsm->nchunks = 1; /* * Open a bulk cursor on the first chunk in the tree - take a read @@ -139,7 +137,7 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) */ WT_RET(__wt_open_cursor(session, lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor)); - clsm->cursors[0] = bulk_cursor; + clsm->chunks[0]->cursor = bulk_cursor; /* LSM cursors are always raw */ F_SET(bulk_cursor, WT_CURSTD_RAW); diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index 5a5140b9c3a..0a5f4fdd8b5 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -392,7 +392,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!lsm_tree->active) continue; - WT_ERR(__wt_epoch(session, &now)); + __wt_epoch(session, &now); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 
0 : WT_TIMEDIFF_MS(now, lsm_tree->work_push_ts); fillms = 3 * lsm_tree->chunk_fill_ms; @@ -651,7 +651,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, return (0); } - WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts)); + __wt_epoch(session, &lsm_tree->work_push_ts); WT_RET(__wt_calloc_one(session, &entry)); entry->type = type; entry->flags = flags; diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 4bbfcfd4411..493855d489a 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -54,7 +54,7 @@ __lsm_merge_aggressive_clear(WT_LSM_TREE *lsm_tree) * __lsm_merge_aggressive_update -- * Update the merge aggressiveness for an LSM tree. */ -static int +static void __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { struct timespec now; @@ -72,7 +72,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (!lsm_tree->modified || F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) { lsm_tree->merge_aggressiveness = 10; - return (0); + return; } /* @@ -81,7 +81,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) */ if (lsm_tree->chunks_flushed <= lsm_tree->merge_min) { __lsm_merge_aggressive_clear(lsm_tree); - return (0); + return; } /* @@ -91,10 +91,10 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) */ if (!F_ISSET(lsm_tree, WT_LSM_TREE_AGGRESSIVE_TIMER)) { F_SET(lsm_tree, WT_LSM_TREE_AGGRESSIVE_TIMER); - return (__wt_epoch(session, &lsm_tree->merge_aggressive_ts)); + __wt_epoch(session, &lsm_tree->merge_aggressive_ts); } - WT_RET(__wt_epoch(session, &now)); + __wt_epoch(session, &now); msec_since_last_merge = WT_TIMEDIFF_MS(now, lsm_tree->merge_aggressive_ts); @@ -113,7 +113,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * generates a variable load. 
*/ if (msec_since_last_merge < msec_to_create_merge) - return (0); + return; /* * Bump how aggressively we look for merges based on how long since @@ -134,7 +134,6 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) msec_since_last_merge, lsm_tree->chunk_fill_ms); lsm_tree->merge_aggressiveness = new_aggressive; } - return (0); } /* @@ -326,7 +325,7 @@ retry_find: goto retry_find; } /* Consider getting aggressive if no merge was found */ - WT_RET(__lsm_merge_aggressive_update(session, lsm_tree)); + __lsm_merge_aggressive_update(session, lsm_tree); return (WT_NOTFOUND); } diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index f4f5a0acce8..3fe3ca1ba81 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -42,11 +42,11 @@ __curstat_lsm_init( if (cst->flags != 0) { (void)snprintf(config, sizeof(config), "statistics=(%s%s%s%s)", - F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "", - F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "", - !F_ISSET(cst, WT_CONN_STAT_ALL) && - F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "", - F_ISSET(cst, WT_CONN_STAT_SIZE) ? "size," : ""); + F_ISSET(cst, WT_STAT_TYPE_ALL) ? "all," : "", + F_ISSET(cst, WT_STAT_CLEAR) ? "clear," : "", + !F_ISSET(cst, WT_STAT_TYPE_ALL) && + F_ISSET(cst, WT_STAT_TYPE_FAST) ? "fast," : "", + F_ISSET(cst, WT_STAT_TYPE_SIZE) ? "size," : ""); cfg[1] = disk_cfg[1] = config; } @@ -132,26 +132,26 @@ __curstat_lsm_init( /* Include, and optionally clear, LSM-level specific information. 
*/ WT_STAT_WRITE(session, stats, bloom_miss, lsm_tree->bloom_miss); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->bloom_miss = 0; WT_STAT_WRITE(session, stats, bloom_hit, lsm_tree->bloom_hit); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->bloom_hit = 0; WT_STAT_WRITE(session, stats, bloom_false_positive, lsm_tree->bloom_false_positive); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->bloom_false_positive = 0; WT_STAT_WRITE(session, stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->lsm_lookup_no_bloom = 0; WT_STAT_WRITE(session, stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->lsm_checkpoint_throttle = 0; WT_STAT_WRITE(session, stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->lsm_merge_throttle = 0; __wt_curstat_dsrc_final(cst); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index db9fd581110..0054dcd1583 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -265,7 +265,7 @@ __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - WT_RET(__wt_epoch(session, &chunk->create_ts)); + __wt_epoch(session, &chunk->create_ts); WT_RET(__wt_lsm_tree_chunk_name( session, lsm_tree, chunk->id, &chunk->uri)); @@ -496,7 +496,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, lsm_tree->queue_ref = 0; /* Set a flush timestamp as a baseline. */ - WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); + __wt_epoch(session, &lsm_tree->last_flush_ts); /* Now the tree is setup, make it visible to others. 
*/ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); @@ -1139,7 +1139,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) return (0); } - WT_ERR(__wt_seconds(session, &begin)); + __wt_seconds(session, &begin); /* * Compacting has two distinct phases. @@ -1267,7 +1267,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) break; } __wt_sleep(1, 0); - WT_ERR(__wt_seconds(session, &end)); + __wt_seconds(session, &end); if (session->compact->max_time > 0 && session->compact->max_time < (uint64_t)(end - begin)) { WT_ERR(ETIMEDOUT); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 72bcf56b3c4..917104031fc 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -358,7 +358,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ - WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); + __wt_epoch(session, &lsm_tree->last_flush_ts); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index 2b7719c3241..b985104c2eb 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -424,7 +424,7 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, * guaranteed, a time_t has to be an arithmetic type, * but not an integral type. 
*/ - WT_ERR(__wt_seconds(session, &secs)); + __wt_seconds(session, &secs); ckpt->sec = (uintmax_t)secs; } if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index b25bb8c25d1..842bb6eeec9 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -63,7 +63,7 @@ __wt_cond_wait_signal( locked = true; if (usecs > 0) { - WT_ERR(__wt_epoch(session, &ts)); + __wt_epoch(session, &ts); ts.tv_sec += (time_t) (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION); ts.tv_nsec = (long) diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c index b1b22a8e684..719e214696b 100644 --- a/src/os_posix/os_time.c +++ b/src/os_posix/os_time.c @@ -12,26 +12,35 @@ * __wt_epoch -- * Return the time since the Epoch. */ -int +void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) { WT_DECL_RET; + /* + * This function doesn't return an error, but panics on failure (which + * should never happen, it's done this way to simplify error handling + * in the caller). However, some compilers complain about using garbage + * values. Initializing the values avoids the complaint. 
+ */ + tsp->tv_sec = 0; + tsp->tv_nsec = 0; + #if defined(HAVE_CLOCK_GETTIME) - WT_SYSCALL(clock_gettime(CLOCK_REALTIME, tsp), ret); + WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret); if (ret == 0) - return (0); - WT_RET_MSG(session, ret, "clock_gettime"); + return; + WT_PANIC_MSG(session, ret, "clock_gettime"); #elif defined(HAVE_GETTIMEOFDAY) struct timeval v; - WT_SYSCALL(gettimeofday(&v, NULL), ret); + WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret); if (ret == 0) { tsp->tv_sec = v.tv_sec; tsp->tv_nsec = v.tv_usec * WT_THOUSAND; - return (0); + return; } - WT_RET_MSG(session, ret, "gettimeofday"); + WT_PANIC_MSG(session, ret, "gettimeofday"); #else NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c #endif diff --git a/src/os_win/os_time.c b/src/os_win/os_time.c index e784b5d8a36..6aa5b3719f6 100644 --- a/src/os_win/os_time.c +++ b/src/os_win/os_time.c @@ -12,11 +12,11 @@ * __wt_epoch -- * Return the time since the Epoch. */ -int +void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) { - uint64_t ns100; FILETIME time; + uint64_t ns100; WT_UNUSED(session); @@ -26,8 +26,6 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) - 116444736000000000LL; tsp->tv_sec = ns100 / 10000000; tsp->tv_nsec = (long)((ns100 % 10000000) * 100); - - return (0); } /* diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 9c38c535301..810f3fd976b 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -451,19 +451,18 @@ __wt_reconcile(WT_SESSION_IMPL *session, } /* - * When application threads perform eviction, don't cache block manager - * or reconciliation structures (even across calls), we can have a - * significant number of application threads doing eviction at the same - * time with large items. We ignore checkpoints, once the checkpoint - * completes, all unnecessary session resources will be discarded. 
+ * When threads perform eviction, don't cache block manager or + * reconciliation structures (even across calls), we can have a + * significant number of threads doing eviction at the same time with + * large items. We ignore checkpoints, once the checkpoint completes, + * all unnecessary session resources will be discarded. * - * Even in application threads doing checkpoints or in internal threads - * doing any reconciliation, clean up reconciliation resources. Some - * workloads have millions of boundary structures in a reconciliation - * and we don't want to tie that memory down, even across calls. + * Even in application threads doing checkpoints, clean up + * reconciliation resources. Some workloads have millions of boundary + * structures in a reconciliation and we don't want to tie that memory + * down, even across calls. */ - if (WT_SESSION_IS_CHECKPOINT(session) || - F_ISSET(session, WT_SESSION_INTERNAL)) + if (WT_SESSION_IS_CHECKPOINT(session)) __rec_bnd_cleanup(session, r, false); else { /* @@ -564,10 +563,12 @@ __rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * barrier after the change for clarity (the requirement is the * flag be set before a subsequent checkpoint reads it, and * as the current checkpoint is waiting on this reconciliation - * to complete, there's no risk of that happening) + * to complete, there's no risk of that happening). */ - btree->modified = 1; + btree->modified = true; WT_FULL_BARRIER(); + if (!S2C(session)->modified) + S2C(session)->modified = true; /* * Eviction should only be here if following the save/restore @@ -3335,7 +3336,7 @@ supd_check_complete: __wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s, split state: %d\n", + ", page count %" PRIu32 ", %s, split state: %d", r->entries, r->page->memory_footprint, r->bnd_next, F_ISSET(r, WT_EVICTING) ? 
"evict" : "checkpoint", r->bnd_state); diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c index 1cd39d97364..345f9164e9b 100644 --- a/src/schema/schema_stat.c +++ b/src/schema/schema_stat.c @@ -137,7 +137,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, * If only gathering table size statistics, try a fast path that * avoids the schema and table list locks. */ - if (F_ISSET(cst, WT_CONN_STAT_SIZE)) { + if (F_ISSET(cst, WT_STAT_TYPE_SIZE)) { WT_RET(__curstat_size_only(session, uri, &was_fast, cst)); if (was_fast) return (0); diff --git a/src/session/session_api.c b/src/session/session_api.c index 0d3fcad3184..f594450db74 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -68,9 +68,10 @@ __wt_session_copy_values(WT_SESSION_IMPL *session) * unless the cursor is reading from a checkpoint. */ WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->snap_min != WT_TXN_NONE || - (WT_PREFIX_MATCH(cursor->uri, "file:") && - F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); + WT_ASSERT(session, + txn_state->pinned_id != WT_TXN_NONE || + (WT_PREFIX_MATCH(cursor->uri, "file:") && + F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); #endif F_CLR(cursor, WT_CURSTD_VALUE_INT); @@ -1417,10 +1418,10 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) /* Assign pinned to the lesser of id or snap_min */ if (txn_state->id != WT_TXN_NONE && - WT_TXNID_LT(txn_state->id, txn_state->snap_min)) + WT_TXNID_LT(txn_state->id, txn_state->pinned_id)) pinned = txn_state->id; else - pinned = txn_state->snap_min; + pinned = txn_state->pinned_id; if (pinned == WT_TXN_NONE) *prange = 0; @@ -1494,14 +1495,14 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) if (timeout_ms == 0) WT_ERR(ETIMEDOUT); - WT_ERR(__wt_epoch(session, &start)); + __wt_epoch(session, &start); /* * Keep checking the LSNs until we find it is stable or we reach * our timeout. 
*/ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { __wt_cond_signal(session, conn->log_file_cond); - WT_ERR(__wt_epoch(session, &now)); + __wt_epoch(session, &now); waited_ms = WT_TIMEDIFF_MS(now, start); if (forever || waited_ms < timeout_ms) /* @@ -1756,11 +1757,13 @@ __open_session(WT_CONNECTION_IMPL *conn, if (i >= conn->session_cnt) /* Defend against off-by-one errors. */ conn->session_cnt = i + 1; - session_ret->id = i; session_ret->iface = F_ISSET(conn, WT_CONN_READONLY) ? stds_readonly : stds; session_ret->iface.connection = &conn->iface; + session_ret->name = NULL; + session_ret->id = i; + WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); if (WT_SESSION_FIRST_USE(session_ret)) @@ -1776,10 +1779,10 @@ __open_session(WT_CONNECTION_IMPL *conn, * Allocate the table hash array as well. */ if (session_ret->dhhash == NULL) - WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE, + WT_ERR(__wt_calloc(session, WT_HASH_ARRAY_SIZE, sizeof(struct __dhandles_hash), &session_ret->dhhash)); if (session_ret->tablehash == NULL) - WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE, + WT_ERR(__wt_calloc(session, WT_HASH_ARRAY_SIZE, sizeof(struct __tables_hash), &session_ret->tablehash)); for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) { TAILQ_INIT(&session_ret->dhhash[i]); @@ -1788,7 +1791,7 @@ __open_session(WT_CONNECTION_IMPL *conn, /* Initialize transaction support: default to read-committed. */ session_ret->isolation = WT_ISO_READ_COMMITTED; - WT_ERR(__wt_txn_init(session_ret)); + WT_ERR(__wt_txn_init(session, session_ret)); /* * The session's hazard pointer memory isn't discarded during normal @@ -1807,6 +1810,9 @@ __open_session(WT_CONNECTION_IMPL *conn, */ session_ret->hazard_size = 0; + /* Cache the offset of this session's statistics bucket. */ + session_ret->stat_bucket = WT_STATS_SLOT_ID(session); + /* * Configuration: currently, the configuration for open_session is the * same as session.reconfigure, so use that function. 
@@ -1815,8 +1821,6 @@ __open_session(WT_CONNECTION_IMPL *conn, WT_ERR( __session_reconfigure((WT_SESSION *)session_ret, config)); - session_ret->name = NULL; - /* * Publish: make the entry visible to server threads. There must be a * barrier for two reasons, to ensure structure fields are set before diff --git a/src/session/session_compact.c b/src/session/session_compact.c index f03d5d34bac..66635007723 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -179,17 +179,16 @@ __compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[]) * Check if the timeout has been exceeded. */ static int -__session_compact_check_timeout( - WT_SESSION_IMPL *session, struct timespec begin) +__session_compact_check_timeout(WT_SESSION_IMPL *session, struct timespec begin) { struct timespec end; if (session->compact->max_time == 0) return (0); - WT_RET(__wt_epoch(session, &end)); + __wt_epoch(session, &end); if (session->compact->max_time < WT_TIMEDIFF_SEC(end, begin)) - WT_RET(ETIMEDOUT); + return (ETIMEDOUT); return (0); } @@ -219,7 +218,7 @@ __compact_file(WT_SESSION_IMPL *session, const char *cfg[]) session, t, "target=(\"%s\"),force=1", dhandle->name)); checkpoint_cfg[1] = t->data; - WT_ERR(__wt_epoch(session, &start_time)); + __wt_epoch(session, &start_time); /* * We compact 10% of the file on each pass (but the overall size of the diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index e76407567bc..725854c6001 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -8,8 +8,6 @@ #include "wt_internal.h" -static int __session_dhandle_sweep(WT_SESSION_IMPL *); - /* * __session_add_dhandle -- * Add a handle to the session's cache. @@ -371,7 +369,7 @@ __wt_session_close_cache(WT_SESSION_IMPL *session) * __session_dhandle_sweep -- * Discard any session dhandles that are not open. 
*/ -static int +static void __session_dhandle_sweep(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; @@ -385,9 +383,9 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) * Periodically sweep for dead handles; if we've swept recently, don't * do it again. */ - WT_RET(__wt_seconds(session, &now)); + __wt_seconds(session, &now); if (difftime(now, session->last_sweep) < conn->sweep_interval) - return (0); + return; session->last_sweep = now; WT_STAT_CONN_INCR(session, dh_session_sweeps); @@ -408,7 +406,6 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) } dhandle_cache = dhandle_cache_next; } - return (0); } /* @@ -446,7 +443,7 @@ __session_get_dhandle( } /* Sweep the handle list to remove any dead handles. */ - WT_RET(__session_dhandle_sweep(session)); + __session_dhandle_sweep(session); /* * We didn't find a match in the session cache, search the shared diff --git a/src/support/err.c b/src/support/err.c index 8bfac250b3a..3ecbab1cbe9 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -162,7 +162,6 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, WT_SESSION *wt_session; struct timespec ts; size_t len, remain, wlen; - int prefix_cnt; const char *err, *prefix; char *end, *p, tid[128]; @@ -211,44 +210,32 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * name, and the session's name. Write them as a comma-separate list, * followed by a colon. */ - prefix_cnt = 0; - if (__wt_epoch(session, &ts) == 0) { - __wt_thread_id(tid, sizeof(tid)); - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "[%" PRIuMAX ":%" PRIuMAX "][%s]", - (uintmax_t)ts.tv_sec, - (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); - p = wlen >= remain ? 
end : p + wlen; - prefix_cnt = 1; - } + __wt_epoch(session, &ts); + __wt_thread_id(tid, sizeof(tid)); + remain = WT_PTRDIFF(end, p); + wlen = (size_t)snprintf(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", + (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); + p = wlen >= remain ? end : p + wlen; + if ((prefix = S2C(session)->error_prefix) != NULL) { remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "%s%s", prefix_cnt == 0 ? "" : ", ", prefix); + wlen = (size_t)snprintf(p, remain, ", %s", prefix); p = wlen >= remain ? end : p + wlen; - prefix_cnt = 1; } prefix = session->dhandle == NULL ? NULL : session->dhandle->name; if (prefix != NULL) { remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "%s%s", prefix_cnt == 0 ? "" : ", ", prefix); + wlen = (size_t)snprintf(p, remain, ", %s", prefix); p = wlen >= remain ? end : p + wlen; - prefix_cnt = 1; } if ((prefix = session->name) != NULL) { remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "%s%s", prefix_cnt == 0 ? "" : ", ", prefix); - p = wlen >= remain ? end : p + wlen; - prefix_cnt = 1; - } - if (prefix_cnt != 0) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ": "); + wlen = (size_t)snprintf(p, remain, ", %s", prefix); p = wlen >= remain ? end : p + wlen; } + remain = WT_PTRDIFF(end, p); + wlen = (size_t)snprintf(p, remain, ": "); + p = wlen >= remain ? end : p + wlen; if (file_name != NULL) { remain = WT_PTRDIFF(end, p); diff --git a/src/support/rand.c b/src/support/rand.c index d2e4cd27aab..025b18e4ed3 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -66,20 +66,18 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state) * threads and we want each thread to initialize its own random state based * on a different random seed. 
*/ -int +void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state) { struct timespec ts; WT_RAND_STATE rnd; - WT_RET(__wt_epoch(session, &ts)); + __wt_epoch(session, &ts); M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629); M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069); *rnd_state = rnd; - - return (0); } /* diff --git a/src/support/stat.c b/src/support/stat.c index 7150223e6cb..6e8e218a0db 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -65,6 +65,24 @@ static const char * const __stats_dsrc_desc[] = { "cache: pages written from cache", "cache: pages written requiring in-memory restoration", "cache: unmodified pages evicted", + "cache_walk: Average difference between current eviction generation when the page was last considered", + "cache_walk: Average on-disk page image size seen", + "cache_walk: Clean pages currently in cache", + "cache_walk: Current eviction generation", + "cache_walk: Dirty pages currently in cache", + "cache_walk: Entries in the root page", + "cache_walk: Internal pages currently in cache", + "cache_walk: Leaf pages currently in cache", + "cache_walk: Maximum difference between current eviction generation when the page was last considered", + "cache_walk: Maximum page size seen", + "cache_walk: Minimum on-disk page image size seen", + "cache_walk: On-disk page image sizes smaller than a single allocation unit", + "cache_walk: Pages created in memory and never written", + "cache_walk: Pages currently queued for eviction", + "cache_walk: Pages that could not be queued for eviction", + "cache_walk: Refs skipped during cache traversal", + "cache_walk: Size of the root page", + "cache_walk: Total number of pages currently in cache", "compression: compressed pages read", "compression: compressed pages written", "compression: page written failed to compress", @@ -196,6 +214,24 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_write = 0; stats->cache_write_restore = 0; stats->cache_eviction_clean 
= 0; + /* not clearing cache_state_gen_avg_gap */ + /* not clearing cache_state_avg_written_size */ + /* not clearing cache_state_pages_clean */ + /* not clearing cache_state_gen_current */ + /* not clearing cache_state_pages_dirty */ + /* not clearing cache_state_root_entries */ + /* not clearing cache_state_pages_internal */ + /* not clearing cache_state_pages_leaf */ + /* not clearing cache_state_gen_max_gap */ + /* not clearing cache_state_max_pagesize */ + /* not clearing cache_state_min_written_size */ + /* not clearing cache_state_smaller_alloc_size */ + /* not clearing cache_state_memory */ + /* not clearing cache_state_queued */ + /* not clearing cache_state_not_queueable */ + /* not clearing cache_state_refs_skipped */ + /* not clearing cache_state_root_size */ + /* not clearing cache_state_pages */ stats->compress_read = 0; stats->compress_write = 0; stats->compress_write_fail = 0; @@ -325,6 +361,27 @@ __wt_stat_dsrc_aggregate_single( to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; to->cache_eviction_clean += from->cache_eviction_clean; + to->cache_state_gen_avg_gap += from->cache_state_gen_avg_gap; + to->cache_state_avg_written_size += + from->cache_state_avg_written_size; + to->cache_state_pages_clean += from->cache_state_pages_clean; + to->cache_state_gen_current += from->cache_state_gen_current; + to->cache_state_pages_dirty += from->cache_state_pages_dirty; + to->cache_state_root_entries += from->cache_state_root_entries; + to->cache_state_pages_internal += from->cache_state_pages_internal; + to->cache_state_pages_leaf += from->cache_state_pages_leaf; + to->cache_state_gen_max_gap += from->cache_state_gen_max_gap; + to->cache_state_max_pagesize += from->cache_state_max_pagesize; + to->cache_state_min_written_size += + from->cache_state_min_written_size; + to->cache_state_smaller_alloc_size += + from->cache_state_smaller_alloc_size; + to->cache_state_memory += from->cache_state_memory; + 
to->cache_state_queued += from->cache_state_queued; + to->cache_state_not_queueable += from->cache_state_not_queueable; + to->cache_state_refs_skipped += from->cache_state_refs_skipped; + to->cache_state_root_size += from->cache_state_root_size; + to->cache_state_pages += from->cache_state_pages; to->compress_read += from->compress_read; to->compress_write += from->compress_write; to->compress_write_fail += from->compress_write_fail; @@ -467,6 +524,39 @@ __wt_stat_dsrc_aggregate( to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); + to->cache_state_gen_avg_gap += + WT_STAT_READ(from, cache_state_gen_avg_gap); + to->cache_state_avg_written_size += + WT_STAT_READ(from, cache_state_avg_written_size); + to->cache_state_pages_clean += + WT_STAT_READ(from, cache_state_pages_clean); + to->cache_state_gen_current += + WT_STAT_READ(from, cache_state_gen_current); + to->cache_state_pages_dirty += + WT_STAT_READ(from, cache_state_pages_dirty); + to->cache_state_root_entries += + WT_STAT_READ(from, cache_state_root_entries); + to->cache_state_pages_internal += + WT_STAT_READ(from, cache_state_pages_internal); + to->cache_state_pages_leaf += + WT_STAT_READ(from, cache_state_pages_leaf); + to->cache_state_gen_max_gap += + WT_STAT_READ(from, cache_state_gen_max_gap); + to->cache_state_max_pagesize += + WT_STAT_READ(from, cache_state_max_pagesize); + to->cache_state_min_written_size += + WT_STAT_READ(from, cache_state_min_written_size); + to->cache_state_smaller_alloc_size += + WT_STAT_READ(from, cache_state_smaller_alloc_size); + to->cache_state_memory += WT_STAT_READ(from, cache_state_memory); + to->cache_state_queued += WT_STAT_READ(from, cache_state_queued); + to->cache_state_not_queueable += + WT_STAT_READ(from, cache_state_not_queueable); + to->cache_state_refs_skipped += + WT_STAT_READ(from, cache_state_refs_skipped); + 
to->cache_state_root_size += + WT_STAT_READ(from, cache_state_root_size); + to->cache_state_pages += WT_STAT_READ(from, cache_state_pages); to->compress_read += WT_STAT_READ(from, compress_read); to->compress_write += WT_STAT_READ(from, compress_write); to->compress_write_fail += WT_STAT_READ(from, compress_write_fail); @@ -549,6 +639,10 @@ static const char * const __stats_connection_desc[] = { "block-manager: bytes written for checkpoint", "block-manager: mapped blocks read", "block-manager: mapped bytes read", + "cache: application threads page read from disk to cache count", + "cache: application threads page read from disk to cache time (usecs)", + "cache: application threads page write from cache to disk count", + "cache: application threads page write from cache to disk time (usecs)", "cache: bytes belonging to page images in the cache", "cache: bytes currently in the cache", "cache: bytes not belonging to page images in the cache", @@ -642,6 +736,21 @@ static const char * const __stats_connection_desc[] = { "data-handle: connection sweeps", "data-handle: session dhandles swept", "data-handle: session sweep attempts", + "lock: checkpoint lock acquisitions", + "lock: checkpoint lock application thread wait time (usecs)", + "lock: checkpoint lock internal thread wait time (usecs)", + "lock: handle-list lock acquisitions", + "lock: handle-list lock application thread wait time (usecs)", + "lock: handle-list lock internal thread wait time (usecs)", + "lock: metadata lock acquisitions", + "lock: metadata lock application thread wait time (usecs)", + "lock: metadata lock internal thread wait time (usecs)", + "lock: schema lock acquisitions", + "lock: schema lock application thread wait time (usecs)", + "lock: schema lock internal thread wait time (usecs)", + "lock: table lock acquisitions", + "lock: table lock application thread time waiting for the table lock (usecs)", + "lock: table lock internal thread time waiting for the table lock (usecs)", "log: busy 
returns attempting to switch slots", "log: consolidated slot closures", "log: consolidated slot join races", @@ -706,6 +815,8 @@ static const char * const __stats_connection_desc[] = { "thread-state: active filesystem fsync calls", "thread-state: active filesystem read calls", "thread-state: active filesystem write calls", + "thread-yield: application thread time evicting (usecs)", + "thread-yield: application thread time waiting for cache (usecs)", "thread-yield: page acquire busy blocked", "thread-yield: page acquire eviction blocked", "thread-yield: page acquire locked blocked", @@ -723,6 +834,7 @@ static const char * const __stats_connection_desc[] = { "transaction: transaction checkpoint scrub time (msecs)", "transaction: transaction checkpoint total time (msecs)", "transaction: transaction checkpoints", + "transaction: transaction checkpoints skipped because database was clean", "transaction: transaction failures due to cache overflow", "transaction: transaction fsync calls for checkpoint after allocating the transaction ID", "transaction: transaction fsync duration for checkpoint after allocating the transaction ID (usecs)", @@ -793,6 +905,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->block_byte_write_checkpoint = 0; stats->block_map_read = 0; stats->block_byte_map_read = 0; + stats->cache_read_app_count = 0; + stats->cache_read_app_time = 0; + stats->cache_write_app_count = 0; + stats->cache_write_app_time = 0; /* not clearing cache_bytes_image */ /* not clearing cache_bytes_inuse */ /* not clearing cache_bytes_other */ @@ -886,6 +1002,21 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->dh_sweeps = 0; stats->dh_session_handles = 0; stats->dh_session_sweeps = 0; + stats->lock_checkpoint_count = 0; + stats->lock_checkpoint_wait_application = 0; + stats->lock_checkpoint_wait_internal = 0; + stats->lock_handle_list_count = 0; + stats->lock_handle_list_wait_application = 0; + 
stats->lock_handle_list_wait_internal = 0; + stats->lock_metadata_count = 0; + stats->lock_metadata_wait_application = 0; + stats->lock_metadata_wait_internal = 0; + stats->lock_schema_count = 0; + stats->lock_schema_wait_application = 0; + stats->lock_schema_wait_internal = 0; + stats->lock_table_count = 0; + stats->lock_table_wait_application = 0; + stats->lock_table_wait_internal = 0; stats->log_slot_switch_busy = 0; stats->log_slot_closes = 0; stats->log_slot_races = 0; @@ -950,6 +1081,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing thread_fsync_active */ /* not clearing thread_read_active */ /* not clearing thread_write_active */ + stats->application_evict_time = 0; + stats->application_cache_time = 0; stats->page_busy_blocked = 0; stats->page_forcible_evict_blocked = 0; stats->page_locked_blocked = 0; @@ -967,6 +1100,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing txn_checkpoint_scrub_time */ /* not clearing txn_checkpoint_time_total */ stats->txn_checkpoint = 0; + stats->txn_checkpoint_skipped = 0; stats->txn_fail_cache = 0; stats->txn_checkpoint_fsync_post = 0; /* not clearing txn_checkpoint_fsync_post_duration */ @@ -1030,6 +1164,11 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, block_byte_write_checkpoint); to->block_map_read += WT_STAT_READ(from, block_map_read); to->block_byte_map_read += WT_STAT_READ(from, block_byte_map_read); + to->cache_read_app_count += WT_STAT_READ(from, cache_read_app_count); + to->cache_read_app_time += WT_STAT_READ(from, cache_read_app_time); + to->cache_write_app_count += + WT_STAT_READ(from, cache_write_app_count); + to->cache_write_app_time += WT_STAT_READ(from, cache_write_app_time); to->cache_bytes_image += WT_STAT_READ(from, cache_bytes_image); to->cache_bytes_inuse += WT_STAT_READ(from, cache_bytes_inuse); to->cache_bytes_other += WT_STAT_READ(from, cache_bytes_other); @@ -1156,6 +1295,33 @@ __wt_stat_connection_aggregate( to->dh_sweeps += 
WT_STAT_READ(from, dh_sweeps); to->dh_session_handles += WT_STAT_READ(from, dh_session_handles); to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps); + to->lock_checkpoint_count += + WT_STAT_READ(from, lock_checkpoint_count); + to->lock_checkpoint_wait_application += + WT_STAT_READ(from, lock_checkpoint_wait_application); + to->lock_checkpoint_wait_internal += + WT_STAT_READ(from, lock_checkpoint_wait_internal); + to->lock_handle_list_count += + WT_STAT_READ(from, lock_handle_list_count); + to->lock_handle_list_wait_application += + WT_STAT_READ(from, lock_handle_list_wait_application); + to->lock_handle_list_wait_internal += + WT_STAT_READ(from, lock_handle_list_wait_internal); + to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count); + to->lock_metadata_wait_application += + WT_STAT_READ(from, lock_metadata_wait_application); + to->lock_metadata_wait_internal += + WT_STAT_READ(from, lock_metadata_wait_internal); + to->lock_schema_count += WT_STAT_READ(from, lock_schema_count); + to->lock_schema_wait_application += + WT_STAT_READ(from, lock_schema_wait_application); + to->lock_schema_wait_internal += + WT_STAT_READ(from, lock_schema_wait_internal); + to->lock_table_count += WT_STAT_READ(from, lock_table_count); + to->lock_table_wait_application += + WT_STAT_READ(from, lock_table_wait_application); + to->lock_table_wait_internal += + WT_STAT_READ(from, lock_table_wait_internal); to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy); to->log_slot_closes += WT_STAT_READ(from, log_slot_closes); to->log_slot_races += WT_STAT_READ(from, log_slot_races); @@ -1242,6 +1408,10 @@ __wt_stat_connection_aggregate( to->thread_fsync_active += WT_STAT_READ(from, thread_fsync_active); to->thread_read_active += WT_STAT_READ(from, thread_read_active); to->thread_write_active += WT_STAT_READ(from, thread_write_active); + to->application_evict_time += + WT_STAT_READ(from, application_evict_time); + to->application_cache_time += + 
WT_STAT_READ(from, application_cache_time); to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked); to->page_forcible_evict_blocked += WT_STAT_READ(from, page_forcible_evict_blocked); @@ -1270,6 +1440,8 @@ __wt_stat_connection_aggregate( to->txn_checkpoint_time_total += WT_STAT_READ(from, txn_checkpoint_time_total); to->txn_checkpoint += WT_STAT_READ(from, txn_checkpoint); + to->txn_checkpoint_skipped += + WT_STAT_READ(from, txn_checkpoint_skipped); to->txn_fail_cache += WT_STAT_READ(from, txn_fail_cache); to->txn_checkpoint_fsync_post += WT_STAT_READ(from, txn_checkpoint_fsync_post); diff --git a/src/support/thread_group.c b/src/support/thread_group.c index f5ddabad7d4..a866d2d01c5 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -60,7 +60,7 @@ __thread_group_grow( while (group->current_threads < new_count) { thread = group->threads[group->current_threads++]; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Starting utility thread: %p:%"PRIu32"\n", + "Starting utility thread: %p:%" PRIu32, (void *)group, thread->id); F_SET(thread, WT_THREAD_RUN); WT_ASSERT(session, thread->session != NULL); @@ -100,7 +100,7 @@ __thread_group_shrink(WT_SESSION_IMPL *session, /* Wake threads to ensure they notice the state change */ if (thread->tid != 0) { __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Stopping utility thread: %p:%"PRIu32"\n", + "Stopping utility thread: %p:%" PRIu32, (void *)group, thread->id); F_CLR(thread, WT_THREAD_RUN); __wt_cond_signal(session, group->wait_cond); @@ -224,7 +224,7 @@ __wt_thread_group_resize( __wt_verbose(session, WT_VERB_THREAD_GROUP, "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32 - " from max: %" PRIu32 " -> %" PRIu32 "\n", + " from max: %" PRIu32 " -> %" PRIu32, (void *)group, group->min, new_min, group->max, new_max); __wt_writelock(session, group->lock); @@ -253,7 +253,7 @@ __wt_thread_group_create( cond_alloced = false; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Creating thread group: 
%p\n", (void *)group); + "Creating thread group: %p", (void *)group); WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group")); WT_ERR(__wt_cond_alloc( @@ -286,7 +286,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_DECL_RET; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Destroying thread group: %p\n", (void *)group); + "Destroying thread group: %p", (void *)group); WT_ASSERT(session, __wt_rwlock_islocked(session, group->lock)); diff --git a/src/txn/txn.c b/src/txn/txn.c index 01e0fbbb634..d60ea73c660 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -96,11 +96,11 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, - txn_state->snap_min == WT_TXN_NONE || + txn_state->pinned_id == WT_TXN_NONE || session->txn.isolation == WT_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->snap_min)); + !__wt_txn_visible_all(session, txn_state->pinned_id)); - txn_state->snap_min = WT_TXN_NONE; + txn_state->pinned_id = WT_TXN_NONE; F_CLR(txn, WT_TXN_HAS_SNAPSHOT); } @@ -117,7 +117,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; - uint64_t prev_oldest_id, snap_min; + uint64_t prev_oldest_id, pinned_id; uint32_t i, n, session_cnt; conn = S2C(session); @@ -135,21 +135,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_PAUSE(); WT_RET(ret); - current_id = snap_min = txn_global->current; + current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* * Include the checkpoint transaction, if one is running: we should * ignore any uncommitted changes the checkpoint has written to the * metadata. We don't have to keep the checkpoint's changes pinned so - * don't including it in the published snap_min. + * don't including it in the published pinned ID. 
*/ if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) txn->snapshot[n++] = id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { - txn_state->snap_min = current_id; + txn_state->pinned_id = current_id; /* Check that the oldest ID has not moved in the meantime. */ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); goto done; @@ -172,18 +172,18 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id)) { txn->snapshot[n++] = id; - if (WT_TXNID_LT(id, snap_min)) - snap_min = id; + if (WT_TXNID_LT(id, pinned_id)) + pinned_id = id; } } /* - * If we got a new snapshot, update the published snap_min for this + * If we got a new snapshot, update the published pinned ID for this * session. */ - WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min)); + WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->snap_min = snap_min; + txn_state->pinned_id = pinned_id; done: __wt_readunlock(session, txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); @@ -232,13 +232,13 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, /* * !!! - * Note: Don't ignore snap_min values older than the previous - * oldest ID. Read-uncommitted operations publish snap_min + * Note: Don't ignore pinned ID values older than the previous + * oldest ID. Read-uncommitted operations publish pinned ID * values without acquiring the scan lock to protect the global - * table. See the comment in __wt_txn_cursor_op for - * more details. + * table. See the comment in __wt_txn_cursor_op for more + * details. 
*/ - if ((id = s->snap_min) != WT_TXN_NONE && + if ((id = s->pinned_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; @@ -360,7 +360,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) __wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %" PRIu32 " [%s]" - " with snap_min %" PRIu64 "\n", + " with snap_min %" PRIu64, oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); @@ -659,21 +659,21 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) * Initialize a session's transaction data. */ int -__wt_txn_init(WT_SESSION_IMPL *session) +__wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) { WT_TXN *txn; - txn = &session->txn; + txn = &session_ret->txn; txn->id = WT_TXN_NONE; WT_RET(__wt_calloc_def(session, - S2C(session)->session_size, &txn->snapshot)); + S2C(session_ret)->session_size, &txn->snapshot)); #ifdef HAVE_DIAGNOSTIC - if (S2C(session)->txn_global.states != NULL) { + if (S2C(session_ret)->txn_global.states != NULL) { WT_TXN_STATE *txn_state; - txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE); + txn_state = WT_SESSION_TXN_STATE(session_ret); + WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); } #endif @@ -683,7 +683,7 @@ __wt_txn_init(WT_SESSION_IMPL *session) */ txn->mod = NULL; - txn->isolation = session->isolation; + txn->isolation = session_ret->isolation; return (0); } @@ -773,7 +773,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states); for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) - s->id = s->snap_min = WT_TXN_NONE; + s->id = s->pinned_id = WT_TXN_NONE; return (0); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3aad95f5a9f..0557e6ce60c 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -314,7 +314,7 @@ 
__checkpoint_update_generation(WT_SESSION_IMPL *session) * __checkpoint_reduce_dirty_cache -- * Release clean trees from the list cached for checkpoints. */ -static int +static void __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) { WT_CACHE *cache; @@ -332,9 +332,9 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) /* Give up if scrubbing is disabled. */ if (cache->eviction_checkpoint_target == 0 || cache->eviction_checkpoint_target >= cache->eviction_dirty_trigger) - return (0); + return; - WT_RET(__wt_epoch(session, &start)); + __wt_epoch(session, &start); last = start; bytes_written_last = 0; bytes_written_start = cache->bytes_written; @@ -345,7 +345,7 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) * cache via reconfigure. This avoids potential divide by zero. */ if (cache_size < 10 * WT_MEGABYTE) - return (0); + return; stepdown_us = 10000; work_us = 0; progress = false; @@ -371,7 +371,7 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) break; __wt_sleep(0, stepdown_us / 10); - WT_RET(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); current_us = WT_TIMEDIFF_US(stop, last); total_ms = WT_TIMEDIFF_MS(stop, start); bytes_written_total = @@ -427,14 +427,12 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) WT_MAX(cache->eviction_dirty_target, current_dirty - delta); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, cache->eviction_scrub_limit); - WT_RET(__wt_epoch(session, &last)); + __wt_epoch(session, &last); } - WT_RET(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); total_ms = WT_TIMEDIFF_MS(stop, start); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); - - return (0); } /* @@ -497,7 +495,7 @@ __checkpoint_stats( * __checkpoint_verbose_track -- * Output a verbose message with timing information */ -static int +static void __checkpoint_verbose_track(WT_SESSION_IMPL *session, const char *msg, struct timespec *start) { @@ -506,9 +504,9 @@ 
__checkpoint_verbose_track(WT_SESSION_IMPL *session, uint64_t msec; if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) - return (0); + return; - WT_RET(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); /* * Get time diff in microseconds. @@ -526,7 +524,6 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, WT_UNUSED(msg); WT_UNUSED(start); #endif - return (0); } /* @@ -576,7 +573,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn->cache->evict_max_page_size = 0; /* Initialize the verbose tracking timer */ - WT_ERR(__wt_epoch(session, &verb_timer)); + __wt_epoch(session, &verb_timer); /* * Update the global oldest ID so we do all possible cleanup. @@ -594,18 +591,18 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Try to reduce the amount of dirty data in cache so there is less * work do during the critical section of the checkpoint. */ - WT_ERR(__checkpoint_reduce_dirty_cache(session)); + __checkpoint_reduce_dirty_cache(session); /* Tell logging that we are about to start a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); - WT_ERR(__checkpoint_verbose_track(session, - "starting transaction", &verb_timer)); + __checkpoint_verbose_track(session, + "starting transaction", &verb_timer); if (full) - WT_ERR(__wt_epoch(session, &start)); + __wt_epoch(session, &start); /* * Start the checkpoint for real. @@ -666,6 +663,14 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_id_check(session)); /* + * Mark the connection as clean. If some data gets modified after + * generating checkpoint transaction id, connection will be reset to + * dirty when reconciliation marks the btree dirty on encountering the + * dirty page. + */ + conn->modified = false; + + /* * Save the checkpoint session ID. * * We never do checkpoints in the default session (with id zero). 
@@ -689,7 +694,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); + WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); /* * Clear our entry from the global transaction session table. Any @@ -698,7 +703,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * can safely ignore the checkpoint ID (see the visible all check for * details). */ - txn_state->id = txn_state->snap_min = WT_TXN_NONE; + txn_state->id = txn_state->pinned_id = WT_TXN_NONE; __wt_writeunlock(session, txn_global->scan_rwlock); /* @@ -739,23 +744,22 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync)); __wt_evict_server_wake(session); - WT_ERR(__checkpoint_verbose_track(session, - "committing transaction", &verb_timer)); + __checkpoint_verbose_track(session, + "committing transaction", &verb_timer); /* * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). 
*/ - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post); WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs); - WT_ERR(__checkpoint_verbose_track(session, - "sync completed", &verb_timer)); + __checkpoint_verbose_track(session, "sync completed", &verb_timer); /* * Commit the transaction now that we are sure that all files in the @@ -793,8 +797,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) ret = __wt_checkpoint_sync(session, NULL)); WT_ERR(ret); - WT_ERR(__checkpoint_verbose_track(session, - "metadata sync completed", &verb_timer)); + __checkpoint_verbose_track(session, + "metadata sync completed", &verb_timer); } else WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), @@ -808,7 +812,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) txn_global->checkpoint_pinned = WT_TXN_NONE; if (full) { - WT_ERR(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); __checkpoint_stats(session, &start, &stop); } @@ -825,6 +829,9 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. */ + if (ret != 0 && !conn->modified) + conn->modified = true; + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) WT_TRET(__wt_meta_track_off(session, false, ret != 0)); @@ -1352,9 +1359,13 @@ __checkpoint_tree( * out of sync with the set of dirty pages (modify is set, but there * are no dirty pages), we perform a checkpoint without any writes, no * checkpoint is created, and then things get bad. + * While marking the root page as dirty, we do not want to dirty the + * btree because we are marking the btree as clean just after this call. 
+ * Also, marking the btree dirty at this stage will unnecessarily mark + * the connection as dirty causing checkpoint-skip code to fail. */ WT_ERR(__wt_page_modify_init(session, btree->root.page)); - __wt_page_modify_set(session, btree->root.page); + __wt_page_only_modify_set(session, btree->root.page); /* * Clear the tree's modified flag; any changes before we clear the flag @@ -1366,7 +1377,7 @@ __checkpoint_tree( * it sets the modified flag itself. Use a full barrier so we get the * store done quickly, this isn't a performance path. */ - btree->modified = 0; + btree->modified = false; WT_FULL_BARRIER(); /* Tell logging that a file checkpoint is starting. */ @@ -1440,8 +1451,11 @@ err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ - if (ret != 0 && !btree->modified && was_modified) - btree->modified = 1; + if (ret != 0 && !btree->modified && was_modified) { + btree->modified = true; + if (!S2C(session)->modified) + S2C(session)->modified = true; + } __wt_meta_ckptlist_free(session, ckptbase); btree->ckpt = NULL; diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index 8f7e93238de..7ba0cc8700e 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -42,9 +42,16 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) return (WT_NOTFOUND); /* Bump the global ID if we are removing the first entry */ - if (found == TAILQ_FIRST(&txn_global->nsnaph)) + if (found == TAILQ_FIRST(&txn_global->nsnaph)) { + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id)); txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ? 
- TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; + TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE || + !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id)); + } TAILQ_REMOVE(&txn_global->nsnaph, found, q); __nsnap_destroy(session, found); WT_STAT_CONN_INCR(session, txn_snapshots_dropped); @@ -104,7 +111,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) } if (TAILQ_NEXT(last, q) != NULL) - new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min; + new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id; } do { @@ -117,7 +124,15 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); /* Now that the queue of named snapshots is updated, update the ID */ + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id) && + (new_nsnap_oldest == WT_TXN_NONE || + WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest))); txn_global->nsnap_oldest_id = new_nsnap_oldest; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, + new_nsnap_oldest == WT_TXN_NONE || + !__wt_txn_visible_all(session, new_nsnap_oldest)); return (ret); } @@ -157,6 +172,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_calloc_one(session, &nsnap_new)); nsnap = nsnap_new; WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name)); + nsnap->pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; nsnap->snap_min = txn->snap_min; nsnap->snap_max = txn->snap_max; if (txn->snapshot_count > 0) { @@ -175,15 +191,25 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); - if (TAILQ_EMPTY(&txn_global->nsnaph)) - txn_global->nsnap_oldest_id = nsnap_new->snap_min; + if (TAILQ_EMPTY(&txn_global->nsnaph)) { + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && + 
!__wt_txn_visible_all(session, nsnap_new->pinned_id)); + __wt_readlock(session, txn_global->scan_rwlock); + txn_global->nsnap_oldest_id = nsnap_new->pinned_id; + __wt_readunlock(session, txn_global->scan_rwlock); + } TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); WT_STAT_CONN_INCR(session, txn_snapshots_created); nsnap_new = NULL; -err: if (started_txn) +err: if (started_txn) { +#ifdef HAVE_DIAGNOSTIC + uint64_t pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; +#endif WT_TRET(__wt_txn_rollback(session, NULL)); - else if (ret == 0) + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, !__wt_txn_visible_all(session, pinned_id)); + } else if (ret == 0) F_SET(txn, WT_TXN_NAMED_SNAPSHOT); if (nsnap_new != NULL) @@ -258,7 +284,20 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) __wt_readlock(session, txn_global->nsnap_rwlock); TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { - txn->snap_min = txn_state->snap_min = nsnap->snap_min; + /* + * Acquire the scan lock so the oldest ID can't move + * forward without seeing our pinned ID. 
+ */ + __wt_readlock(session, txn_global->scan_rwlock); + txn_state->pinned_id = nsnap->pinned_id; + __wt_readunlock(session, txn_global->scan_rwlock); + + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_state->pinned_id) && + txn_global->nsnap_oldest_id != WT_TXN_NONE && + WT_TXNID_LE(txn_global->nsnap_oldest_id, + txn_state->pinned_id)); + txn->snap_min = nsnap->snap_min; txn->snap_max = nsnap->snap_max; if ((txn->snapshot_count = nsnap->snapshot_count) != 0) memcpy(txn->snapshot, nsnap->snapshot, diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index 7a298f000aa..67249ff887e 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -56,8 +56,6 @@ void usage(void) extern char *__wt_optarg; extern int __wt_optind; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/checkpoint/smoke.sh b/test/checkpoint/smoke.sh index 2f1d4345ad7..123d4e00df5 100755 --- a/test/checkpoint/smoke.sh +++ b/test/checkpoint/smoke.sh @@ -6,6 +6,9 @@ set -e echo "checkpoint: 3 mixed tables" $TEST_WRAPPER ./t -T 3 -t m +# We are done if short tests are requested +test -z "$TESTUTIL_DISABLE_LONG_TESTS" || exit 0 + echo "checkpoint: 6 column-store tables" $TEST_WRAPPER ./t -T 6 -t c diff --git a/test/checkpoint/test_checkpoint.c b/test/checkpoint/test_checkpoint.c index 6b2f0d4466c..4998019ad8e 100644 --- a/test/checkpoint/test_checkpoint.c +++ b/test/checkpoint/test_checkpoint.c @@ -42,8 +42,6 @@ static int wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index 15db2fbcf46..8f1714237b9 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -7,6 +7,9 @@ AM_LDFLAGS = -static test_wt1965_col_efficiency_SOURCES = wt1965_col_efficiency/main.c noinst_PROGRAMS = test_wt1965_col_efficiency +test_wt2403_lsm_workload_SOURCES = wt2403_lsm_workload/main.c 
+noinst_PROGRAMS += test_wt2403_lsm_workload + test_wt2246_col_append_SOURCES = wt2246_col_append/main.c noinst_PROGRAMS += test_wt2246_col_append diff --git a/test/csuite/wt1965_col_efficiency/main.c b/test/csuite/wt1965_col_efficiency/main.c index 0dc367c0611..a7235d81b31 100644 --- a/test/csuite/wt1965_col_efficiency/main.c +++ b/test/csuite/wt1965_col_efficiency/main.c @@ -35,8 +35,6 @@ * it is demonstrating an inefficiency rather than a correctness bug. */ -void (*custom_die)(void) = NULL; - /* If changing field count also need to change set_value and get_value calls */ #define NR_FIELDS 8 #define NR_OBJECTS 100 diff --git a/test/csuite/wt2246_col_append/main.c b/test/csuite/wt2246_col_append/main.c index b795816c76f..4b352b26051 100644 --- a/test/csuite/wt2246_col_append/main.c +++ b/test/csuite/wt2246_col_append/main.c @@ -42,8 +42,6 @@ #define MILLION 1000000 -void (*custom_die)(void) = NULL; - /* Needs to be global for signal handling. */ static TEST_OPTS *opts, _opts; @@ -104,6 +102,8 @@ main(int argc, char *argv[]) char buf[100]; opts = &_opts; + if (testutil_disable_long_tests()) + return (0); memset(opts, 0, sizeof(*opts)); opts->table_type = TABLE_ROW; opts->n_append_threads = N_APPEND_THREADS; diff --git a/test/csuite/wt2323_join_visibility/main.c b/test/csuite/wt2323_join_visibility/main.c index bbf1626fe82..239a3f300d0 100644 --- a/test/csuite/wt2323_join_visibility/main.c +++ b/test/csuite/wt2323_join_visibility/main.c @@ -52,8 +52,6 @@ * of inserts set low as a default. 
*/ -void (*custom_die)(void) = NULL; - #define N_RECORDS 10000 #define N_INSERT 500000 #define N_INSERT_THREAD 2 @@ -96,6 +94,8 @@ main(int argc, char *argv[]) opts = &_opts; sharedopts = &_sharedopts; + if (testutil_disable_long_tests()) + return (0); memset(opts, 0, sizeof(*opts)); memset(sharedopts, 0, sizeof(*sharedopts)); @@ -225,7 +225,8 @@ test_join(TEST_OPTS *opts, SHARED_OPTS *sharedopts, bool bloom, testutil_check(session->close(session, NULL)); } -static void *thread_insert(void *arg) +static void * +thread_insert(void *arg) { SHARED_OPTS *sharedopts; TEST_OPTS *opts; @@ -239,7 +240,7 @@ static void *thread_insert(void *arg) threadargs = (THREAD_ARGS *)arg; opts = threadargs->testopts; sharedopts = threadargs->sharedopts; - testutil_check(__wt_random_init_seed(NULL, &rnd)); + __wt_random_init_seed(NULL, &rnd); testutil_check(opts->conn->open_session( opts->conn, NULL, NULL, &session)); diff --git a/test/csuite/wt2403_lsm_workload/main.c b/test/csuite/wt2403_lsm_workload/main.c new file mode 100644 index 00000000000..0c287484b9e --- /dev/null +++ b/test/csuite/wt2403_lsm_workload/main.c @@ -0,0 +1,241 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "test_util.h" + +static const char name[] = "lsm:test"; +#define NUM_DOCS 100000 +#define NUM_QUERIES (NUM_DOCS/100) + +static void +rand_str(uint64_t i, char *str) +{ + uint64_t x, y; + + y = strlen(str); + for (x = y; x > y - 8; x--) { + str[x - 1] = (char)(i % 10) + 48; + i = i / 10; + } +} + +static void +check_str(uint64_t i, char *str, bool mod) +{ + char str2[] = "0000000000000000"; + + rand_str(i, str2); + if (mod) + str2[0] = 'A'; + testutil_checkfmt(strcmp(str, str2), + "strcmp failed, got %s, expected %s", str, str2); +} + +static void +query_docs(WT_CURSOR *cursor, bool mod) +{ + WT_ITEM key, value; + int i; + + for (i = 0; i < NUM_QUERIES; i++) { + testutil_check(cursor->next(cursor)); + testutil_check(cursor->get_key(cursor, &key)); + testutil_check(cursor->get_value(cursor, &value)); + check_str((uint64_t)key.data, (char *)value.data, mod); + } + printf("%d documents read\n", NUM_QUERIES); +} + +static void * +compact_thread(void *args) +{ + WT_SESSION *session; + + session = (WT_SESSION *)args; + testutil_check(session->compact(session, name, NULL)); + return (NULL); +} + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *rcursor, *wcursor; + WT_ITEM key, value; + WT_SESSION *session, *session2; + pthread_t thread; + uint64_t i; + + char str[] = "0000000000000000"; + + /* + * Create a clean test directory for this run of the test program if the + * environment variable isn't already set (as is done by make check). 
+ */ + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + testutil_check(wiredtiger_open(opts->home, + NULL, "create,cache_size=200M", &opts->conn)); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session2)); + + testutil_check(session->create(session, name, + "key_format=Q,value_format=S")); + + /* Populate the table with some data. */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS; i++) { + wcursor->set_key(wcursor, i); + rand_str(i, str); + wcursor->set_value(wcursor, str); + testutil_check(wcursor->insert(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + printf("%d documents inserted\n", NUM_DOCS); + + /* Perform some random reads */ + testutil_check(session->open_cursor( + session, name, NULL, "next_random=true", &rcursor)); + query_docs(rcursor, false); + testutil_check(rcursor->close(rcursor)); + + /* Setup Transaction to pin the current values */ + testutil_check( + session2->begin_transaction(session2, "isolation=snapshot")); + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + + /* Perform updates in a txn to confirm that we see only the original. 
*/ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS; i++) { + rand_str(i, str); + str[0] = 'A'; + wcursor->set_key(wcursor, i); + wcursor->set_value(wcursor, str); + testutil_check(wcursor->update(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + printf("%d documents set to update\n", NUM_DOCS); + + /* Random reads, which should see the original values */ + query_docs(rcursor, false); + testutil_check(rcursor->close(rcursor)); + + /* Finish the txn */ + testutil_check(session2->rollback_transaction(session2, NULL)); + + /* Random reads, which should see the updated values */ + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + query_docs(rcursor, true); + testutil_check(rcursor->close(rcursor)); + + /* Setup a pre-delete txn */ + testutil_check( + session2->begin_transaction(session2, "isolation=snapshot")); + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + + /* Delete all but one document */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS - 1; i++) { + wcursor->set_key(wcursor, i); + testutil_check(wcursor->remove(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + printf("%d documents deleted\n", NUM_DOCS - 1); + + /* Random reads, which should not see the deletes */ + query_docs(rcursor, true); + testutil_check(rcursor->close(rcursor)); + + /* Rollback the txn so we can see the deletes */ + testutil_check(session2->rollback_transaction(session2, NULL)); + + /* Find the one remaining document 3 times */ + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + for (i = 0; i < 3; i++) { + testutil_check(rcursor->next(rcursor)); + testutil_check(rcursor->get_key(rcursor, &key)); + testutil_check(rcursor->get_value(rcursor, &value)); + /* There should only be one value available to us */ + 
testutil_assertfmt((uint64_t)key.data == NUM_DOCS - 1, + "expected %d and got %" PRIu64, + NUM_DOCS - 1, (uint64_t)key.data); + check_str((uint64_t)key.data, (char *)value.data, true); + } + printf("Found the deleted doc 3 times\n"); + testutil_check(rcursor->close(rcursor)); + + /* Repopulate the table for compact. */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS - 1; i++) { + wcursor->set_key(wcursor, i); + rand_str(i, str); + str[0] = 'A'; + wcursor->set_value(wcursor, str); + testutil_check(wcursor->insert(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + + /* Run random cursor queries while compact is running */ + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + testutil_check(pthread_create(&thread, NULL, compact_thread, session)); + query_docs(rcursor, true); + testutil_check(rcursor->close(rcursor)); + testutil_check(pthread_join(thread, NULL)); + + /* Delete everything. Check for infinite loops */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS; i++) { + wcursor->set_key(wcursor, i); + testutil_check(wcursor->remove(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + for (i = 0; i < 3; i++) + testutil_assert(rcursor->next(rcursor) == WT_NOTFOUND); + printf("Successfully got WT_NOTFOUND\n"); + + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} diff --git a/test/csuite/wt2447_join_main_table/main.c b/test/csuite/wt2447_join_main_table/main.c index bbae61e7ed5..1368e7c8c09 100644 --- a/test/csuite/wt2447_join_main_table/main.c +++ b/test/csuite/wt2447_join_main_table/main.c @@ -49,8 +49,6 @@ * table. 
*/ -void (*custom_die)(void) = NULL; - #define N_RECORDS 10000 static void diff --git a/test/csuite/wt2535_insert_race/main.c b/test/csuite/wt2535_insert_race/main.c index 5eaca3279b6..ae18760a829 100644 --- a/test/csuite/wt2535_insert_race/main.c +++ b/test/csuite/wt2535_insert_race/main.c @@ -36,8 +36,6 @@ * Failure mode: Check that the data is correct at the end of the run. */ -void (*custom_die)(void) = NULL; - void *thread_insert_race(void *); int @@ -52,6 +50,8 @@ main(int argc, char *argv[]) int i; opts = &_opts; + if (testutil_disable_long_tests()) + return (0); memset(opts, 0, sizeof(*opts)); opts->nthreads = 10; opts->nrecords = 1000; diff --git a/test/csuite/wt2592_join_schema/main.c b/test/csuite/wt2592_join_schema/main.c index 4ffc9194646..0ec1c765d99 100644 --- a/test/csuite/wt2592_join_schema/main.c +++ b/test/csuite/wt2592_join_schema/main.c @@ -36,12 +36,6 @@ * Failure mode: The failure seen in WT-2592 was that no items were returned * by a join. */ -#include <inttypes.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <wiredtiger.h> /* The C struct for the data we are storing in a WiredTiger table. */ typedef struct { @@ -66,8 +60,6 @@ static POP_RECORD pop_data[] = { { "", 0, 0 } }; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/csuite/wt2695_checksum/main.c b/test/csuite/wt2695_checksum/main.c index afb9d0788bd..db4fed5dc53 100644 --- a/test/csuite/wt2695_checksum/main.c +++ b/test/csuite/wt2695_checksum/main.c @@ -32,8 +32,6 @@ * Test case description: Smoke-test the CRC. */ -void (*custom_die)(void) = NULL; - static inline void check(uint32_t hw, uint32_t sw, size_t len, const char *msg) { @@ -61,7 +59,7 @@ main(int argc, char *argv[]) wiredtiger_open(opts->home, NULL, "create", &opts->conn)); /* Initialize the RNG. */ - testutil_check(__wt_random_init_seed(NULL, &rnd)); + __wt_random_init_seed(NULL, &rnd); /* Allocate aligned memory for the data. 
*/ data = dcalloc(DATASIZE, sizeof(uint8_t)); diff --git a/test/csuite/wt2719_reconfig/main.c b/test/csuite/wt2719_reconfig/main.c index b67dae6d647..0942cfc73b2 100644 --- a/test/csuite/wt2719_reconfig/main.c +++ b/test/csuite/wt2719_reconfig/main.c @@ -34,9 +34,7 @@ * Test case description: Fuzz testing for WiredTiger reconfiguration. */ -void (*custom_die)(void) = NULL; - -static const char *list[] = { +static const char * const list[] = { ",async=(enabled=0)", ",async=(enabled=1)", ",async=(ops_max=2048)", @@ -256,7 +254,7 @@ main(int argc, char *argv[]) session, opts->uri, "type=lsm,key_format=S,value_format=S")); /* Initialize the RNG. */ - testutil_check(__wt_random_init_seed(NULL, &rnd)); + __wt_random_init_seed(NULL, &rnd); /* Allocate memory for the config. */ len = WT_ELEMENTS(list) * 64; diff --git a/test/csuite/wt2834_join_bloom_fix/main.c b/test/csuite/wt2834_join_bloom_fix/main.c index 1e2d919d3c7..7c80496f1b6 100644 --- a/test/csuite/wt2834_join_bloom_fix/main.c +++ b/test/csuite/wt2834_join_bloom_fix/main.c @@ -39,8 +39,6 @@ * * Failure mode: We get results back from our join. 
*/ -void (*custom_die)(void) = NULL; - #define N_RECORDS 100000 #define N_INSERT 1000000 @@ -62,6 +60,8 @@ main(int argc, char *argv[]) char joinuri[256]; opts = &_opts; + if (testutil_disable_long_tests()) + return (0); memset(opts, 0, sizeof(*opts)); testutil_check(testutil_parse_opts(argc, argv, opts)); @@ -101,8 +101,8 @@ main(int argc, char *argv[]) &maincur)); maincur->set_key(maincur, N_RECORDS); maincur->set_value(maincur, 54321, 0, "", 0, N_RECORDS); - maincur->insert(maincur); - maincur->close(maincur); + testutil_check(maincur->insert(maincur)); + testutil_check(maincur->close(maincur)); testutil_check(session->close(session, NULL)); populate(opts); @@ -151,6 +151,7 @@ main(int argc, char *argv[]) key, key2, post, balance, flag); count++; } + testutil_assert(ret == WT_NOTFOUND); testutil_assert(count == 0); testutil_cleanup(opts); @@ -159,7 +160,8 @@ main(int argc, char *argv[]) return (0); } -void populate(TEST_OPTS *opts) +void +populate(TEST_OPTS *opts) { WT_CURSOR *maincur; WT_SESSION *session; @@ -167,7 +169,7 @@ void populate(TEST_OPTS *opts) int balance, i, flag, post; WT_RAND_STATE rnd; - testutil_check(__wt_random_init_seed(NULL, &rnd)); + __wt_random_init_seed(NULL, &rnd); testutil_check(opts->conn->open_session( opts->conn, NULL, NULL, &session)); @@ -194,6 +196,6 @@ void populate(TEST_OPTS *opts) testutil_check(maincur->insert(maincur)); testutil_check(session->commit_transaction(session, NULL)); } - maincur->close(maincur); - session->close(session, NULL); + testutil_check(maincur->close(maincur)); + testutil_check(session->close(session, NULL)); } diff --git a/test/csuite/wt2853_perf/main.c b/test/csuite/wt2853_perf/main.c index 67ba4a20ada..6cec9634cd1 100644 --- a/test/csuite/wt2853_perf/main.c +++ b/test/csuite/wt2853_perf/main.c @@ -42,8 +42,6 @@ * continues until the test ends (~30 seconds). 
*/ -void (*custom_die)(void) = NULL; - static void *thread_insert(void *); static void *thread_get(void *); @@ -201,7 +199,7 @@ thread_insert(void *arg) threadargs = (THREAD_ARGS *)arg; opts = threadargs->testopts; - testutil_check(__wt_random_init_seed(NULL, &rnd)); + __wt_random_init_seed(NULL, &rnd); (void)time(&prevtime); testutil_check(opts->conn->open_session( diff --git a/test/cursor_order/cursor_order.c b/test/cursor_order/cursor_order.c index aa351e6fea8..85b8c68e545 100644 --- a/test/cursor_order/cursor_order.c +++ b/test/cursor_order/cursor_order.c @@ -44,8 +44,6 @@ static void wt_shutdown(SHARED_CONFIG *); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/fops/t.c b/test/fops/t.c index bf0588d5a53..7b4a7cf8fca 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -51,8 +51,6 @@ static void wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/format/config.c b/test/format/config.c index 542adf33da2..839ff5058de 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -187,8 +187,17 @@ config_setup(void) /* Give in-memory configuration a final review. */ config_in_memory_check(); - /* Make the default maximum-run length 20 minutes. */ - if (!config_is_perm("timer")) + /* + * Run-length configured by a number of operations and a timer. If the + * operation count and the timer are both set by a configuration, there + * isn't anything to do. If only the operation count was configured, + * set a default maximum-run of 20 minutes. If only the timer is set, + * clear the operations count (which was set randomly). 
+ */ + if (config_is_perm("timer")) { + if (!config_is_perm("ops")) + config_single("ops=0", 0); + } else config_single("timer=20", 0); /* @@ -270,28 +279,33 @@ config_compression(const char *conf_name) */ switch (mmrand(NULL, 1, 20)) { #ifdef HAVE_BUILTIN_EXTENSION_LZ4 - case 1: case 2: case 3: case 4: /* 20% lz4 */ + case 1: case 2: /* 10% lz4 */ cstr = "lz4"; break; - case 5: /* 5% lz4-no-raw */ + case 3: /* 5% lz4-no-raw */ cstr = "lz4-noraw"; break; #endif #ifdef HAVE_BUILTIN_EXTENSION_SNAPPY - case 6: case 7: case 8: case 9: /* 30% snappy */ - case 10: case 11: + case 4: case 5: case 6: case 7: /* 30% snappy */ + case 8: case 9: cstr = "snappy"; break; #endif #ifdef HAVE_BUILTIN_EXTENSION_ZLIB - case 12: case 13: case 14: case 15: /* 20% zlib */ + case 10: case 11: case 12: case 13: /* 20% zlib */ cstr = "zlib"; break; - case 16: /* 5% zlib-no-raw */ + case 14: /* 5% zlib-no-raw */ cstr = "zlib-noraw"; break; #endif - case 17: case 18: case 19: case 20: /* 20% no compression */ +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD + case 15: case 16: case 17: /* 15% zstd */ + cstr = "zstd"; + break; +#endif + case 18: case 19: case 20: /* 15% no compression */ default: break; } @@ -748,6 +762,8 @@ config_map_compression(const char *s, u_int *vp) *vp = COMPRESS_ZLIB; else if (strcmp(s, "zlib-noraw") == 0) *vp = COMPRESS_ZLIB_NO_RAW; + else if (strcmp(s, "zstd") == 0) + *vp = COMPRESS_ZSTD; else testutil_die(EINVAL, "illegal compression configuration: %s", s); diff --git a/test/format/config.h b/test/format/config.h index 725bc7c5d97..9bfba3cd0df 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -58,7 +58,7 @@ typedef struct { } CONFIG; #define COMPRESSION_LIST \ - "(none | lz4 | lz4-noraw | snappy | zlib | zlib-noraw)" + "(none | lz4 | lz4-noraw | snappy | zlib | zlib-noraw | zstd)" static CONFIG c[] = { { "abort", diff --git a/test/format/format.h b/test/format/format.h index 363dcf9eea8..820bc020c9b 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ 
-48,6 +48,8 @@ EXTPATH "compressors/snappy/.libs/libwiredtiger_snappy.so" #define ZLIB_PATH \ EXTPATH "compressors/zlib/.libs/libwiredtiger_zlib.so" +#define ZSTD_PATH \ + EXTPATH "compressors/zstd/.libs/libwiredtiger_zstd.so" #define REVERSE_PATH \ EXTPATH "collators/reverse/.libs/libwiredtiger_reverse_collator.so" @@ -219,6 +221,7 @@ typedef struct { #define COMPRESS_SNAPPY 5 #define COMPRESS_ZLIB 6 #define COMPRESS_ZLIB_NO_RAW 7 +#define COMPRESS_ZSTD 8 u_int c_compression_flag; /* Compression flag value */ u_int c_logging_compression_flag; /* Log compression flag value */ diff --git a/test/format/lrt.c b/test/format/lrt.c index 937525522fa..69d6b22d71f 100644 --- a/test/format/lrt.c +++ b/test/format/lrt.c @@ -96,6 +96,22 @@ lrt(void *arg) pinned = 0; } else { /* + * Test named snapshots: create a snapshot, wait to + * give the transaction state time to move forward, + * then start a transaction with the named snapshot, + * drop it, then commit the transaction. This exercises + * most of the named snapshot logic under load. + */ + testutil_check(session->snapshot(session, "name=test")); + sleep(1); + testutil_check(session->begin_transaction( + session, "snapshot=test")); + testutil_check(session->snapshot( + session, "drop=(all)")); + testutil_check(session->commit_transaction( + session, NULL)); + + /* * Begin transaction: without an explicit transaction, * the snapshot is only kept around while a cursor is * positioned. As soon as the cursor loses its position diff --git a/test/format/t.c b/test/format/t.c index 12258af8e51..7701595776c 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -38,8 +38,6 @@ static void usage(void) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = format_die; /* Local death handler. */ - int main(int argc, char *argv[]) { @@ -47,6 +45,8 @@ main(int argc, char *argv[]) int ch, onerun, reps; const char *config, *home; + custom_die = format_die; /* Local death handler. 
*/ + config = NULL; #ifdef _WIN32 @@ -115,7 +115,7 @@ main(int argc, char *argv[]) argv += __wt_optind; /* Initialize the global RNG. */ - testutil_check(__wt_random_init_seed(NULL, &g.rnd)); + __wt_random_init_seed(NULL, &g.rnd); /* Set up paths. */ path_setup(home); diff --git a/test/format/util.c b/test/format/util.c index 667f6d6bcb1..a709aa93a2e 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -78,7 +78,7 @@ key_gen_setup(WT_ITEM *key) } static void -key_gen_common(WT_ITEM *key, uint64_t keyno, int suffix) +key_gen_common(WT_ITEM *key, uint64_t keyno, const char * const suffix) { int len; char *p; @@ -86,11 +86,15 @@ key_gen_common(WT_ITEM *key, uint64_t keyno, int suffix) p = key->mem; /* - * The key always starts with a 10-digit string (the specified cnt) + * The key always starts with a 10-digit string (the specified row) * followed by two digits, a random number between 1 and 15 if it's * an insert, otherwise 00. */ - len = sprintf(p, "%010" PRIu64 ".%02d", keyno, suffix); + u64_to_string_zf(keyno, key->mem, 11); + p[10] = '.'; + p[11] = suffix[0]; + p[12] = suffix[1]; + len = 13; /* * In a column-store, the key is only used for Berkeley DB inserts, @@ -118,13 +122,19 @@ key_gen_common(WT_ITEM *key, uint64_t keyno, int suffix) void key_gen(WT_ITEM *key, uint64_t keyno) { - key_gen_common(key, keyno, 0); + key_gen_common(key, keyno, "00"); } void key_gen_insert(WT_RAND_STATE *rnd, WT_ITEM *key, uint64_t keyno) { - key_gen_common(key, keyno, (int)mmrand(rnd, 1, 15)); + static const char * const suffix[15] = { + "01", "02", "03", "04", "05", + "06", "07", "08", "09", "10", + "11", "12", "13", "14", "15" + }; + + key_gen_common(key, keyno, suffix[mmrand(rnd, 1, 15) - 1]); } static uint32_t val_dup_data_len; /* Length of duplicate data items */ @@ -221,7 +231,7 @@ val_gen(WT_RAND_STATE *rnd, WT_ITEM *value, uint64_t keyno) p[10] = '/'; value->size = val_dup_data_len; } else { - (void)sprintf(p, "%010" PRIu64, keyno); + u64_to_string_zf(keyno, p, 11); 
p[10] = '/'; value->size = value_len(rnd, keyno, g.c_value_min, g.c_value_max); diff --git a/test/format/wts.c b/test/format/wts.c index 74c4bb902b3..23fdbce156c 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -50,6 +50,8 @@ compressor(uint32_t compress_flag) return ("zlib"); case COMPRESS_ZLIB_NO_RAW: return ("zlib-noraw"); + case COMPRESS_ZSTD: + return ("zstd"); default: break; } @@ -210,13 +212,14 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) /* Extensions. */ p += snprintf(p, REMAIN(p, end), ",extensions=[" - "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],", + "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],", g.c_reverse ? REVERSE_PATH : "", access(LZ4_PATH, R_OK) == 0 ? LZ4_PATH : "", access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "", access(ROTN_PATH, R_OK) == 0 ? ROTN_PATH : "", access(SNAPPY_PATH, R_OK) == 0 ? SNAPPY_PATH : "", access(ZLIB_PATH, R_OK) == 0 ? ZLIB_PATH : "", + access(ZSTD_PATH, R_OK) == 0 ? ZSTD_PATH : "", DATASOURCE("kvsbdb") ? KVS_BDB_PATH : ""); /* @@ -546,6 +549,7 @@ wts_stats(void) WT_DECL_RET; WT_SESSION *session; FILE *fp; + size_t len; char *stat_name; const char *pval, *desc; uint64_t v; @@ -582,8 +586,9 @@ wts_stats(void) /* Data source statistics. 
*/ fprintf(fp, "\n\n====== Data source statistics:\n"); - stat_name = dmalloc(strlen("statistics:") + strlen(g.uri) + 1); - sprintf(stat_name, "statistics:%s", g.uri); + len = strlen("statistics:") + strlen(g.uri) + 1; + stat_name = dmalloc(len); + snprintf(stat_name, len, "statistics:%s", g.uri); testutil_check(session->open_cursor( session, stat_name, NULL, NULL, &cursor)); free(stat_name); diff --git a/test/huge/huge.c b/test/huge/huge.c index 3aa61a9048e..17e2db353d5 100644 --- a/test/huge/huge.c +++ b/test/huge/huge.c @@ -159,8 +159,6 @@ run(CONFIG *cp, int bigkey, size_t bytes) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/java/com/wiredtiger/test/ConcurrentCloseTest.java b/test/java/com/wiredtiger/test/ConcurrentCloseTest.java index fece0353bf0..fead0b0bf38 100644 --- a/test/java/com/wiredtiger/test/ConcurrentCloseTest.java +++ b/test/java/com/wiredtiger/test/ConcurrentCloseTest.java @@ -34,6 +34,7 @@ import com.wiredtiger.db.WiredTigerException; import com.wiredtiger.db.wiredtiger; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; import java.io.BufferedReader; import java.io.File; @@ -69,7 +70,7 @@ class InsertThread extends Thread { Session session = conn.open_session(null); Cursor cursor = session.open_cursor("table:cclose", null, "overwrite"); - cursor.putKeyString("key"+threadId + "-" + i); + cursor.putKeyString("key" + threadId + "-" + i); cursor.putValueString("value1"); ret = cursor.insert(); cursor.close(); @@ -127,36 +128,36 @@ public class ConcurrentCloseTest { setup(); try { List<Thread> threads = new ArrayList<Thread>(); - int i, ret; + int i; - ret = session.create("table:cclose", "key_format=S,value_format=S"); + assertEquals(0, session.create("table:cclose", + "key_format=S,value_format=S")); Cursor cursor = session.open_cursor("table:cclose", null, "overwrite"); cursor.putKeyString("key1"); 
cursor.putValueString("value1"); - ret = cursor.insert(); + assertEquals(0, cursor.insert()); cursor.close(); - ret = session.close(null); + assertEquals(0, session.close(null)); for (i = 0; i < NUM_THREADS; i++) { Thread insertThread = new InsertThread(conn, i); - Thread scanThread = new InsertThread(conn, i); + Thread scanThread = new ScanThread(conn); insertThread.start(); scanThread.start(); threads.add(insertThread); threads.add(scanThread); } - for (Thread thread : threads) try { thread.join(); - ret = -1; } catch (InterruptedException ie) { + fail(); } - ret = conn.close(null); - System.exit(ret); + assertEquals(0, conn.close(null)); + System.exit(0); } catch (WiredTigerException wte) { System.err.println("Exception: " + wte); diff --git a/test/manydbs/manydbs.c b/test/manydbs/manydbs.c index c5c9a9a7ccd..7e986d47af3 100644 --- a/test/manydbs/manydbs.c +++ b/test/manydbs/manydbs.c @@ -68,8 +68,6 @@ usage(void) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - static WT_CONNECTION **connections = NULL; static WT_CURSOR **cursors = NULL; static WT_RAND_STATE rnd; diff --git a/test/packing/intpack-test.c b/test/packing/intpack-test.c index 76851b38e35..c84823b741b 100644 --- a/test/packing/intpack-test.c +++ b/test/packing/intpack-test.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - int main(void) { diff --git a/test/packing/intpack-test2.c b/test/packing/intpack-test2.c index a7d31329069..4e612808a35 100644 --- a/test/packing/intpack-test2.c +++ b/test/packing/intpack-test2.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - int main(void) { diff --git a/test/packing/intpack-test3.c b/test/packing/intpack-test3.c index aac0178578f..763b0255ecf 100644 --- a/test/packing/intpack-test3.c +++ b/test/packing/intpack-test3.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - void test_value(int64_t); void test_spread(int64_t, int64_t, int64_t); 
diff --git a/test/packing/packing-test.c b/test/packing/packing-test.c index f251c17eb67..919b0622806 100644 --- a/test/packing/packing-test.c +++ b/test/packing/packing-test.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - static void check(const char *fmt, ...) { diff --git a/test/readonly/readonly.c b/test/readonly/readonly.c index 7a131912c31..a4b79f5859f 100644 --- a/test/readonly/readonly.c +++ b/test/readonly/readonly.c @@ -158,8 +158,6 @@ open_dbs(int op, const char *dir, extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c index 16065cec29e..03e67e2f723 100644 --- a/test/recovery/random-abort.c +++ b/test/recovery/random-abort.c @@ -179,8 +179,6 @@ fill_db(uint32_t nth) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { @@ -245,7 +243,7 @@ main(int argc, char *argv[]) if (!verify_only) { testutil_make_work_dir(home); - testutil_assert(__wt_random_init_seed(NULL, &rnd) == 0); + __wt_random_init_seed(NULL, &rnd); if (rand_time) { timeout = __wt_random(&rnd) % MAX_TIME; if (timeout < MIN_TIME) diff --git a/test/recovery/truncated-log.c b/test/recovery/truncated-log.c index c0effa85e95..c265263d44c 100644 --- a/test/recovery/truncated-log.c +++ b/test/recovery/truncated-log.c @@ -258,8 +258,6 @@ fill_db(void) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index bad0167ca8e..b8553bbd72d 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -64,8 +64,6 @@ static int verbose; /* -v flag */ extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/suite/test_bug017.py b/test/suite/test_bug017.py new 
file mode 100644 index 00000000000..03e7b2ba714 --- /dev/null +++ b/test/suite/test_bug017.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import wiredtiger, wttest + +# test_bug017.py +# WT-2987: opening a cursor on an incomplete table drops core +class test_bug017(wttest.WiredTigerTestCase): + + def test_bug017_run(self): + self.session.create("table:bug17", + 'key_format=r,value_format=5sHQ,' + + 'columns=(id,country,year,population),colgroups=(main,population)') + + msg = '/column groups/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor("table:bug17(country)", None), + msg) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_cursor_random02.py b/test/suite/test_cursor_random02.py index 93aa97f2282..195480d703b 100644 --- a/test/suite/test_cursor_random02.py +++ b/test/suite/test_cursor_random02.py @@ -34,7 +34,10 @@ from wtscenario import make_scenarios # test_cursor_random02.py # Cursor next_random operations class test_cursor_random02(wttest.WiredTigerTestCase): - type = 'table:random' + types = [ + ('lsm', dict(type='lsm:random')), + ('table', dict(type='table:random')) + ] config = [ ('not-sample', dict(config='next_random=true')) ] @@ -46,26 +49,35 @@ class test_cursor_random02(wttest.WiredTigerTestCase): ('10000', dict(records=10000)), ('50000', dict(records=50000)), ] - scenarios = make_scenarios(config, records) + scenarios = make_scenarios(config, records, types) # Check that next_random works in the presence of a larger set of values, # where the values are in an insert list. def test_cursor_random_reasonable_distribution(self): uri = self.type num_entries = self.records + config = 'key_format=S' + if uri == 'table:random': + config = 'leaf_page_max=100MB,' + config # Set the leaf-page-max value, otherwise the page might split. 
- simple_populate(self, uri, - 'leaf_page_max=100MB,key_format=S', num_entries) + simple_populate(self, uri, config, num_entries) # Setup an array to track which keys are seen visitedKeys = [0] * (num_entries + 1) + # Setup a counter to see when we find a sequential key + sequentialKeys = 0 cursor = self.session.open_cursor(uri, None, 'next_random=true') + lastKey = None for i in range(0, num_entries): self.assertEqual(cursor.next(), 0) current = cursor.get_key() current = int(current) visitedKeys[current] = visitedKeys[current] + 1 + if lastKey != None: + if current == (lastKey + 1): + sequentialKeys += 1 + lastKey = current differentKeys = sum(x > 0 for x in visitedKeys) @@ -76,7 +88,10 @@ class test_cursor_random02(wttest.WiredTigerTestCase): str(num_entries) + ', ' + \ str((int)((differentKeys * 100) / num_entries)) + '%') ''' - + # Can't test for non-sequential data when there is 1 item in the table + if num_entries > 1: + self.assertGreater(num_entries - 1, sequentialKeys, + 'cursor is returning sequential data') self.assertGreater(differentKeys, num_entries / 4, 'next_random random distribution not adequate') diff --git a/test/suite/test_encrypt01.py b/test/suite/test_encrypt01.py index d314cbeadfd..746c9d13e96 100644 --- a/test/suite/test_encrypt01.py +++ b/test/suite/test_encrypt01.py @@ -57,6 +57,7 @@ class test_encrypt01(wttest.WiredTigerTestCase): ('lz4', dict(log_compress='lz4', block_compress='lz4')), ('snappy', dict(log_compress='snappy', block_compress='snappy')), ('zlib', dict(log_compress='zlib', block_compress='zlib')), + ('zstd', dict(log_compress='zstd', block_compress='zstd')), ('none-snappy', dict(log_compress=None, block_compress='snappy')), ('snappy-lz4', dict(log_compress='snappy', block_compress='lz4')), ] diff --git a/test/suite/test_inmem02.py b/test/suite/test_inmem02.py new file mode 100644 index 00000000000..9eb8330b2a3 --- /dev/null +++ b/test/suite/test_inmem02.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# +# Public Domain 
2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from time import sleep +from helper import simple_populate, simple_populate_check +from helper import key_populate, value_populate +from wtscenario import make_scenarios + +# test_inmem02.py +# Test in-memory with ignore-cache-size setting. +class test_inmem02(wttest.WiredTigerTestCase): + uri = 'table:inmem02' + conn_config = \ + 'cache_size=3MB,file_manager=(close_idle_time=0),in_memory=true' + table_config = 'key_format=S,value_format=S,memory_page_max=32k,leaf_page_max=4k' + + # Add more data than fits into the configured cache and verify it fails. 
+ def test_insert_over_allowed(self): + + # Create a new table that is allowed to exceed the cache size, do this + # before filling the cache so that the create succeeds + self.session.create( + self.uri + '_over', 'ignore_in_memory_cache_size=true') + + # Populate a table with enough data to fill the cache. + msg = '/WT_CACHE_FULL.*/' + self.assertRaisesHavingMessage(wiredtiger.WiredTigerError, + lambda:simple_populate( + self, self.uri, self.table_config, 10000000), msg) + + # Add some content to the new table + cursor = self.session.open_cursor(self.uri + '_over', None) + for i in range(1, 1000): + cursor[str('%015d' % i)] = str(i) + ': abcdefghijklmnopqrstuvwxyz' + cursor.close() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_stat02.py b/test/suite/test_stat02.py index 3d2a83d1c3c..047d2c74499 100644 --- a/test/suite/test_stat02.py +++ b/test/suite/test_stat02.py @@ -165,7 +165,7 @@ class test_stat_cursor_conn_error(wttest.WiredTigerTestCase): args = ['none', 'all', 'fast'] for i in list(itertools.permutations(args, 2)): config = 'create,statistics=(' + i[0] + ',' + i[1] + ')' - msg = '/only one statistics configuration value/' + msg = '/Only one of/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.wiredtiger_open('.', config), msg) @@ -188,10 +188,76 @@ class test_stat_cursor_dsrc_error(wttest.WiredTigerTestCase): args = ['all', 'fast'] for i in list(itertools.permutations(args, 2)): config = 'statistics=(' + i[0] + ',' + i[1] + ')' - msg = '/only one statistics configuration value/' + msg = '/Only one of/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.open_cursor( 'statistics:' + self.uri, None, config), msg) +# Test data-source cache walk statistics +class test_stat_cursor_dsrc_cache_walk(wttest.WiredTigerTestCase): + uri = 'file:test_stat_cursor_dsrc_cache_walk' + + conn_config = 'statistics=(none)' + + def test_stat_cursor_dsrc_cache_walk(self): + simple_populate(self, 
self.uri, 'key_format=S', 100) + # Ensure that it's an error to get cache_walk stats if none is set + msg = '/doesn\'t match the database statistics/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor( + 'statistics:' + self.uri, None, None), msg) + + # Test configurations that are valid but should not collect + # cache walk information. Do these first since the cache walk + # statistics are mostly marked as not cleared - so once they are + # populated the values will always be returned + self.conn.reconfigure('statistics=(cache_walk,fast,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(fast)') + self.assertEqual(c[stat.dsrc.cache_state_root_size][2], 0) + c.close() + + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(fast)') + self.assertEqual(c[stat.dsrc.cache_state_root_size][2], 0) + c.close() + + self.conn.reconfigure('statistics=(cache_walk,fast,clear)') + c = self.session.open_cursor('statistics:' + self.uri, None, None) + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + # Verify that cache_walk didn't imply tree_walk + self.assertEqual(c[stat.dsrc.btree_entries][2], 0) + c.close() + + self.conn.reconfigure('statistics=(cache_walk,tree_walk,fast,clear)') + c = self.session.open_cursor('statistics:' + self.uri, None, None) + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + # Verify that cache_walk didn't exclude tree_walk + self.assertGreater(c[stat.dsrc.btree_entries][2], 0) + c.close() + + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(all)') + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + self.assertGreater(c[stat.dsrc.btree_entries][2], 0) + c.close() + + # Verify that cache and tree walk can operate independently + self.conn.reconfigure('statistics=(all,clear)') + c = 
self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(cache_walk,fast)') + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + self.assertEqual(c[stat.dsrc.btree_entries][2], 0) + c.close() + + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(tree_walk,fast)') + # Don't check the cache walk stats for empty - they won't be cleared + self.assertGreater(c[stat.dsrc.btree_entries][2], 0) + c.close() + if __name__ == '__main__': wttest.run() diff --git a/test/thread/t.c b/test/thread/t.c index 5b53532e8a6..baadbf2adb9 100644 --- a/test/thread/t.c +++ b/test/thread/t.c @@ -52,8 +52,6 @@ static void wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/test/utility/misc.c b/test/utility/misc.c index 096bc752726..1491c9a6938 100644 --- a/test/utility/misc.c +++ b/test/utility/misc.c @@ -27,6 +27,8 @@ */ #include "test_util.h" +void (*custom_die)(void) = NULL; + /* * die -- * Report an error and quit. 
@@ -142,8 +144,6 @@ testutil_cleanup(TEST_OPTS *opts) if (!opts->preserve) testutil_clean_work_dir(opts->home); - free(opts->conn_config); - free(opts->table_config); free(opts->uri); free(opts->home); } diff --git a/test/utility/parse_opts.c b/test/utility/parse_opts.c index 08aeafa9617..74a1c021d5d 100644 --- a/test/utility/parse_opts.c +++ b/test/utility/parse_opts.c @@ -27,10 +27,6 @@ */ #include "test_util.h" -extern int __wt_opterr; /* if error message should be printed */ -extern int __wt_optind; /* index into parent argv vector */ -extern int __wt_optopt; /* character checked for validity */ -extern int __wt_optreset; /* reset getopt */ extern char *__wt_optarg; /* argument associated with option */ /* @@ -59,7 +55,7 @@ testutil_parse_opts(int argc, char * const *argv, TEST_OPTS *opts) opts->n_append_threads = (uint64_t)atoll(__wt_optarg); break; case 'h': /* Home directory */ - opts->home = __wt_optarg; + opts->home = dstrdup(__wt_optarg); break; case 'n': /* Number of records */ opts->nrecords = (uint64_t)atoll(__wt_optarg); @@ -116,12 +112,14 @@ testutil_parse_opts(int argc, char * const *argv, TEST_OPTS *opts) } /* - * Setup the home directory. It needs to be unique for every test - * or the auto make parallel tester gets upset. + * Setup the home directory if not explicitly specified. It needs to be + * unique for every test or the auto make parallel tester gets upset. 
*/ - len = strlen("WT_TEST.") + strlen(opts->progname) + 10; - opts->home = dmalloc(len); - snprintf(opts->home, len, "WT_TEST.%s", opts->progname); + if (opts->home == NULL) { + len = strlen("WT_TEST.") + strlen(opts->progname) + 10; + opts->home = dmalloc(len); + snprintf(opts->home, len, "WT_TEST.%s", opts->progname); + } /* Setup the default URI string */ len = strlen("table:") + strlen(opts->progname) + 10; diff --git a/test/utility/test_util.h b/test/utility/test_util.h index 3c1d0e2630a..f6a9cd68e02 100644 --- a/test/utility/test_util.h +++ b/test/utility/test_util.h @@ -68,10 +68,8 @@ typedef struct { * resources. */ WT_CONNECTION *conn; - char *conn_config; WT_SESSION *session; bool running; - char *table_config; char *uri; volatile uint64_t next_threadid; uint64_t max_inserted_id; @@ -87,6 +85,16 @@ typedef struct { } while (0) /* + * testutil_assertfmt -- + * Complain and quit if something isn't true. + */ +#define testutil_assertfmt(a, fmt, ...) do { \ + if (!(a)) \ + testutil_die(0, "%s/%d: %s: " fmt, \ + __func__, __LINE__, #a, __VA_ARGS__); \ +} while (0) + +/* * testutil_check -- * Complain and quit if a function call fails. */ @@ -108,6 +116,62 @@ typedef struct { __func__, __LINE__, #call, __VA_ARGS__); \ } while (0) +/* + * u64_to_string -- + * Convert a uint64_t to a text string. + * + * Algorithm from Andrei Alexandrescu's talk: "Three Optimization Tips for C++" + */ +static inline void +u64_to_string(uint64_t n, char **pp) +{ + static const char hundred_lookup[201] = + "0001020304050607080910111213141516171819" + "2021222324252627282930313233343536373839" + "4041424344454647484950515253545556575859" + "6061626364656667686970717273747576777879" + "8081828384858687888990919293949596979899"; + u_int i; + char *p; + + /* + * The argument pointer references the last element of a buffer (which + * must be large enough to hold any possible value). + * + * Nul-terminate the buffer. 
+ */ + for (p = *pp, *p-- = '\0'; n >= 100; n /= 100) { + i = (n % 100) * 2; + *p-- = hundred_lookup[i + 1]; + *p-- = hundred_lookup[i]; + } + + /* Handle the last two digits. */ + i = (u_int)n * 2; + *p = hundred_lookup[i + 1]; + if (n >= 10) + *--p = hundred_lookup[i]; + + /* Return a pointer to the first byte of the text string. */ + *pp = p; +} + +/* + * u64_to_string_zf -- + * Convert a uint64_t to a text string, zero-filling the buffer. + */ +static inline void +u64_to_string_zf(uint64_t n, char *buf, size_t len) +{ + char *p; + + p = buf + (len - 1); + u64_to_string(n, &p); + + while (p > buf) + *--p = '0'; +} + /* Allow tests to add their own death handling. */ extern void (*custom_die)(void); diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py index 8f47b86a23e..635e710c469 100644 --- a/tools/wtstats/stat_data.py +++ b/tools/wtstats/stat_data.py @@ -91,6 +91,24 @@ no_scale_per_second_list = [ 'btree: row-store leaf pages', 'cache: bytes currently in the cache', 'cache: overflow values cached in memory', + 'cache_walk: Average difference between current eviction generation when the page was last considered', + 'cache_walk: Average on-disk page image size seen', + 'cache_walk: Clean pages currently in cache', + 'cache_walk: Current eviction generation', + 'cache_walk: Dirty pages currently in cache', + 'cache_walk: Entries in the root page', + 'cache_walk: Internal pages currently in cache', + 'cache_walk: Leaf pages currently in cache', + 'cache_walk: Maximum difference between current eviction generation when the page was last considered', + 'cache_walk: Maximum page size seen', + 'cache_walk: Minimum on-disk page image size seen', + 'cache_walk: On-disk page image sizes smaller than a single allocation unit', + 'cache_walk: Pages created in memory and never written', + 'cache_walk: Pages currently queued for eviction', + 'cache_walk: Pages that could not be queued for eviction', + 'cache_walk: Refs skipped during cache traversal', + 
'cache_walk: Size of the root page', + 'cache_walk: Total number of pages currently in cache', 'LSM: bloom filters in the LSM tree', 'LSM: chunks in the LSM tree', 'LSM: highest merge generation in the LSM tree', @@ -162,6 +180,24 @@ no_clear_list = [ 'transaction: transaction range of IDs currently pinned by named snapshots', 'btree: btree checkpoint generation', 'cache: bytes currently in the cache', + 'cache_walk: Average difference between current eviction generation when the page was last considered', + 'cache_walk: Average on-disk page image size seen', + 'cache_walk: Clean pages currently in cache', + 'cache_walk: Current eviction generation', + 'cache_walk: Dirty pages currently in cache', + 'cache_walk: Entries in the root page', + 'cache_walk: Internal pages currently in cache', + 'cache_walk: Leaf pages currently in cache', + 'cache_walk: Maximum difference between current eviction generation when the page was last considered', + 'cache_walk: Maximum page size seen', + 'cache_walk: Minimum on-disk page image size seen', + 'cache_walk: On-disk page image sizes smaller than a single allocation unit', + 'cache_walk: Pages created in memory and never written', + 'cache_walk: Pages currently queued for eviction', + 'cache_walk: Pages that could not be queued for eviction', + 'cache_walk: Refs skipped during cache traversal', + 'cache_walk: Size of the root page', + 'cache_walk: Total number of pages currently in cache', 'session: open cursor count', ] prefix_list = [ @@ -169,6 +205,7 @@ prefix_list = [ 'reconciliation', 'LSM', 'log', + 'lock', 'cache', 'transaction', 'cursor', @@ -176,9 +213,10 @@ prefix_list = [ 'session', 'block-manager', 'thread-yield', + 'cache_walk', 'async', 'btree', 'thread-state', 'compression', ] -groups = {'cursor': ['cursor', 'session'], 'lsm': ['LSM', 'transaction'], 'system': ['connection', 'data-handle', 'session', 'thread-state'], 'evict': ['block-manager', 'cache', 'connection', 'thread-state'], 'memory': ['cache', 
'connection', 'reconciliation']}
\ No newline at end of file +groups = {'cursor': ['cursor', 'session'], 'lsm': ['LSM', 'transaction'], 'system': ['connection', 'data-handle', 'session', 'thread-state'], 'evict': ['block-manager', 'cache', 'cache_walk', 'connection', 'thread-state'], 'memory': ['cache', 'cache_walk', 'connection', 'reconciliation']}
\ No newline at end of file diff --git a/tools/wtstats/wtstats.py b/tools/wtstats/wtstats.py index 3549031c30f..bf5557d12f4 100755 --- a/tools/wtstats/wtstats.py +++ b/tools/wtstats/wtstats.py @@ -115,6 +115,9 @@ def parse_wtstats_file(file, result): # Parse file for line in open(file, 'rU'): month, day, time, v, title = line.strip('\n').split(" ", 4) + # The colon in the URI confuses parsing, strip it out. + if "cache_walk" in title: + title = title.replace("file:", "", 1) result[title].append((month + " " + day + " " + time, v)) |