diff options
95 files changed, 2548 insertions, 957 deletions
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index 1238c25502c..808e85eedae 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -54,6 +54,7 @@ static void config_opt_usage(void); int config_assign(CONFIG *dest, const CONFIG *src) { + CONFIG_QUEUE_ENTRY *conf_line, *tmp_line; size_t i, len; char *newstr, **pstr; @@ -96,6 +97,18 @@ config_assign(CONFIG *dest, const CONFIG *src) } TAILQ_INIT(&dest->stone_head); + TAILQ_INIT(&dest->config_head); + + /* Clone the config string information into the new cfg object */ + TAILQ_FOREACH(conf_line, &src->config_head, c) { + len = strlen(conf_line->string); + if ((tmp_line = calloc(sizeof(CONFIG_QUEUE_ENTRY), 1)) == NULL) + return (enomem(src)); + if ((tmp_line->string = calloc(len + 1, 1)) == NULL) + return (enomem(src)); + strncpy(tmp_line->string, conf_line->string, len); + TAILQ_INSERT_TAIL(&dest->config_head, tmp_line, c); + } return (0); } @@ -106,9 +119,17 @@ config_assign(CONFIG *dest, const CONFIG *src) void config_free(CONFIG *cfg) { + CONFIG_QUEUE_ENTRY *config_line; size_t i; char **pstr; + while (!TAILQ_EMPTY(&cfg->config_head)) { + config_line = TAILQ_FIRST(&cfg->config_head); + TAILQ_REMOVE(&cfg->config_head, config_line, c); + free(config_line->string); + free(config_line); + } + for (i = 0; i < sizeof(config_opts) / sizeof(config_opts[0]); i++) if (config_opts[i].type == STRING_TYPE || config_opts[i].type == CONFIG_STRING_TYPE) { @@ -569,16 +590,34 @@ err: if (fd != -1) int config_opt_line(CONFIG *cfg, const char *optstr) { + CONFIG_QUEUE_ENTRY *config_line; WT_CONFIG_ITEM k, v; WT_CONFIG_PARSER *scan; + size_t len; int ret, t_ret; + char *string_copy; + len = strlen(optstr); if ((ret = wiredtiger_config_parser_open( - NULL, optstr, strlen(optstr), &scan)) != 0) { + NULL, optstr, len, &scan)) != 0) { lprintf(cfg, ret, 0, "Error in config_scan_begin"); return (ret); } + /* + * Append the current line to our copy of the config. 
The config is + * stored in the order it is processed, so added options will be after + * any parsed from the original config. We allocate len + 1 to allow for + * a null byte to be added. + */ + if ((string_copy = calloc(len + 1, 1)) == NULL) + return (enomem(cfg)); + + strncpy(string_copy, optstr, len); + config_line = calloc(sizeof(CONFIG_QUEUE_ENTRY), 1); + config_line->string = string_copy; + TAILQ_INSERT_TAIL(&cfg->config_head, config_line, c); + while (ret == 0) { if ((ret = scan->next(scan, &k, &v)) != 0) { /* Any parse error has already been reported. */ @@ -653,6 +692,90 @@ config_sanity(CONFIG *cfg) } /* + * config_consolidate -- + * Consolidate repeated configuration settings so that it only appears + * once in the configuration output file. + */ +void +config_consolidate(CONFIG *cfg) +{ + CONFIG_QUEUE_ENTRY *conf_line, *test_line, *tmp; + char *string_key; + + /* + * This loop iterates over the config queue and for entry checks if an + * entry later in the queue has the same key. If a match is found then + * the current queue entry is removed and we continue. + */ + conf_line = TAILQ_FIRST(&cfg->config_head); + while (conf_line != NULL) { + string_key = strchr(conf_line->string, '='); + tmp = test_line = TAILQ_NEXT(conf_line, c); + while (test_line != NULL) { + /* + * The + 1 here forces the '=' sign to be matched + * ensuring we don't match keys that have a common + * prefix such as "table_count" and "table_count_idle" + * as being the same key. + */ + if (strncmp(conf_line->string, test_line->string, + (size_t)(string_key - conf_line->string + 1)) + == 0) { + TAILQ_REMOVE(&cfg->config_head, conf_line, c); + free(conf_line->string); + free(conf_line); + break; + } + test_line = TAILQ_NEXT(test_line, c); + } + conf_line = tmp; + } +} + +/* + * config_to_file -- + * Write the final config used in this execution to a file. 
+ */ +void +config_to_file(CONFIG *cfg) +{ + CONFIG_QUEUE_ENTRY *config_line; + FILE *fp; + size_t req_len; + char *path; + + fp = NULL; + + /* Backup the config */ + req_len = strlen(cfg->home) + 100; + if ((path = calloc(req_len, 1)) == NULL) { + (void)enomem(cfg); + goto err; + } + + snprintf(path, req_len + 14, "%s/CONFIG.wtperf", cfg->home); + if ((fp = fopen(path, "w")) == NULL) { + lprintf(cfg, errno, 0, "%s", path); + goto err; + } + + /* Print the config dump */ + fprintf(fp,"# Warning. This config includes " + "unwritten, implicit configuration defaults.\n" + "# Changes to those values may cause differences in behavior.\n"); + config_consolidate(cfg); + config_line = TAILQ_FIRST(&cfg->config_head); + while (config_line != NULL) { + fprintf(fp, "%s\n", config_line->string); + config_line = TAILQ_NEXT(config_line, c); + } + +err: free(path); + if (fp != NULL) + (void)fclose(fp); +} + +/* * config_print -- * Print out the configuration in verbose mode. */ diff --git a/bench/wtperf/runners/btree-split-stress.wtperf b/bench/wtperf/runners/btree-split-stress.wtperf new file mode 100644 index 00000000000..deb8c70d12f --- /dev/null +++ b/bench/wtperf/runners/btree-split-stress.wtperf @@ -0,0 +1,10 @@ +conn_config="cache_size=2GB,statistics=[fast,clear],statistics_log=(wait=10),eviction=(threads_max=4,threads_min=4)" +table_config="type=file,leaf_page_max=8k,internal_page_max=8k,memory_page_max=2MB,split_deepen_min_child=250" +icount=200000 +report_interval=5 +run_time=300 +reopen_connection=false +populate_threads=2 +value_sz=256 +read_range=100 +threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1)) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 955f605c0b3..5386096d9b7 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -60,6 +60,7 @@ static const CONFIG default_cfg = { 0, /* total seconds running */ 0, /* has truncate */ {NULL, NULL}, /* the truncate queue */ + {NULL, NULL}, /* the config queue */ #define 
OPT_DEFINE_DEFAULT #include "wtperf_opt.i" @@ -371,6 +372,53 @@ err: cfg->error = cfg->stop = 1; return (NULL); } +/* + * do_range_reads -- + * If configured to execute a sequence of next operations after each + * search do them. Ensuring the keys we see are always in order. + */ +static int +do_range_reads(CONFIG *cfg, WT_CURSOR *cursor) +{ + size_t range; + uint64_t next_val, prev_val; + char *range_key_buf; + char buf[512]; + int ret; + + ret = 0; + + if (cfg->read_range == 0) + return (0); + + memset(&buf[0], 0, 512 * sizeof(char)); + range_key_buf = &buf[0]; + + /* Save where the first key is for comparisons. */ + cursor->get_key(cursor, &range_key_buf); + extract_key(range_key_buf, &next_val); + + for (range = 0; range < cfg->read_range; ++range) { + prev_val = next_val; + ret = cursor->next(cursor); + /* We are done if we reach the end. */ + if (ret != 0) + break; + + /* Retrieve and decode the key */ + cursor->get_key(cursor, &range_key_buf); + extract_key(range_key_buf, &next_val); + if (next_val < prev_val) { + lprintf(cfg, EINVAL, 0, + "Out of order keys %" PRIu64 + " came before %" PRIu64, + prev_val, next_val); + return (EINVAL); + } + } + return (0); +} + static void * worker(void *arg) { @@ -381,8 +429,8 @@ worker(void *arg) WT_CONNECTION *conn; WT_CURSOR **cursors, *cursor, *tmp_cursor; WT_SESSION *session; - int64_t ops, ops_per_txn, throttle_ops; size_t i; + int64_t ops, ops_per_txn, throttle_ops; uint64_t next_val, usecs; uint8_t *op, *op_end; int measure_latency, ret, truncated; @@ -533,7 +581,14 @@ worker(void *arg) "get_value in read."); goto err; } + /* + * If we want to read a range, then call next + * for several operations, confirming that the + * next key is in the correct order. 
+ */ + ret = do_range_reads(cfg, cursor); } + if (ret == 0 || ret == WT_NOTFOUND) break; goto op_err; @@ -2103,6 +2158,8 @@ main(int argc, char *argv[]) if (config_assign(cfg, &default_cfg)) goto err; + TAILQ_INIT(&cfg->config_head); + /* Do a basic validation of options, and home is needed before open. */ while ((ch = __wt_getopt("wtperf", argc, argv, opts)) != EOF) switch (ch) { @@ -2308,6 +2365,9 @@ main(int argc, char *argv[]) if ((ret = config_sanity(cfg)) != 0) goto err; + /* Write a copy of the config. */ + config_to_file(cfg); + /* Display the configuration. */ if (cfg->verbose > 1) config_print(cfg); @@ -2333,7 +2393,7 @@ start_threads(CONFIG *cfg, WORKLOAD *workp, CONFIG_THREAD *base, u_int num, void *(*func)(void *)) { CONFIG_THREAD *thread; - u_int i, j; + u_int i; int ret; /* Initialize the threads. */ @@ -2342,15 +2402,13 @@ start_threads(CONFIG *cfg, thread->workload = workp; /* - * We don't want the threads executing in lock-step, move each - * new RNG state further along in the sequence. + * We don't want the threads executing in lock-step, seed each + * one differently. 
*/ - if (i == 0) - __wt_random_init(&thread->rnd); - else - thread->rnd = (thread - 1)->rnd; - for (j = 0; j < 1000; ++j) - (void)__wt_random(&thread->rnd); + if ((ret = __wt_random_init_seed(NULL, &thread->rnd)) != 0) { + lprintf(cfg, ret, 0, "Error initializing RNG"); + return (ret); + } /* * Every thread gets a key/data buffer because we don't bother diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index b26e978c13b..361b135ced7 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -127,6 +127,12 @@ struct __truncate_queue_entry { }; typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY; +struct __config_queue_entry { + char *string; + TAILQ_ENTRY(__config_queue_entry) c; +}; +typedef struct __config_queue_entry CONFIG_QUEUE_ENTRY; + #define LOG_PARTIAL_CONFIG ",log=(enabled=false)" /* * NOTE: If you add any fields to this structure here, you must also add @@ -181,6 +187,9 @@ struct __config { /* Configuration structure */ /* Queue head for use with the Truncate Logic */ TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head; + /* Queue head to save a copy of the config to be output */ + TAILQ_HEAD(__config_qh, __config_queue_entry) config_head; + /* Fields changeable on command line are listed in wtperf_opt.i */ #define OPT_DECLARE_STRUCT #include "wtperf_opt.i" @@ -189,6 +198,7 @@ struct __config { /* Configuration structure */ #define ELEMENTS(a) (sizeof(a) / sizeof(a[0])) +#define READ_RANGE_OPS 10 #define THROTTLE_OPS 100 #define THOUSAND (1000ULL) @@ -271,6 +281,8 @@ void config_free(CONFIG *); int config_opt_file(CONFIG *, const char *); int config_opt_line(CONFIG *, const char *); int config_opt_str(CONFIG *, const char *, const char *); +void config_to_file(CONFIG *); +void config_consolidate(CONFIG *); void config_print(CONFIG *); int config_sanity(CONFIG *); void latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *); @@ -305,4 +317,10 @@ generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno) sprintf(key_buf, 
"%0*" PRIu64, cfg->key_sz - 1, keyno); } +static inline void +extract_key(char *key_buf, uint64_t *keynop) +{ + sscanf(key_buf, "%" SCNu64, keynop); +} + #endif diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index a9d3c033b74..3c122e4d186 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -140,6 +140,7 @@ DEF_OPT_AS_UINT32(random_range, 0, "if non zero choose a value from within this range as the key for " "insert operations") DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value") +DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search") DEF_OPT_AS_BOOL(reopen_connection, 1, "close and reopen the connection between populate and workload phases") DEF_OPT_AS_UINT32(report_interval, 2, diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index de2f8963629..5949fb0509c 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -103,7 +103,7 @@ esac # Linux requires buffers aligned to 4KB boundaries for O_DIRECT to work. BUFFER_ALIGNMENT=0 -if test "$ac_cv_func_posix_memalign" = "yes" ; then +if test "$ax_cv_func_posix_memalign_works" = "yes" ; then case "$host_os" in linux*) BUFFER_ALIGNMENT=4096 ;; esac diff --git a/build_win/filelist.win b/build_win/filelist.win index af6ddf98da9..b845c45823e 100644 --- a/build_win/filelist.win +++ b/build_win/filelist.win @@ -121,6 +121,7 @@ src/os_win/os_map.c src/os_win/os_mtx_cond.c src/os_win/os_once.c src/os_win/os_open.c +src/os_win/os_pagesize.c src/os_win/os_path.c src/os_win/os_priv.c src/os_win/os_remove.c diff --git a/dist/api_data.py b/dist/api_data.py index f58a48b4a0b..ff6d3f3ccb5 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -814,21 +814,19 @@ methods = { 'WT_SESSION.open_cursor' : Method(cursor_runtime_config + [ Config('bulk', 'false', r''' - configure the cursor for bulk-loading, a fast, initial load - path (see @ref tune_bulk_load for more information). 
Bulk-load - may only be used for newly created objects and cursors - configured for bulk-load only support the WT_CURSOR::insert - and WT_CURSOR::close methods. When bulk-loading row-store - objects, keys must be loaded in sorted order. The value is - usually a true/false flag; when bulk-loading fixed-length - column store objects, the special value \c bitmap allows - chunks of a memory resident bitmap to be loaded directly into - a file by passing a \c WT_ITEM to WT_CURSOR::set_value where - the \c size field indicates the number of records in the - bitmap (as specified by the object's \c value_format - configuration). Bulk-loaded bitmap values must end on a byte - boundary relative to the bit count (except for the last set - of values loaded)'''), + configure the cursor for bulk-loading, a fast, initial load path + (see @ref tune_bulk_load for more information). Bulk-load may + only be used for newly created objects and applications should + use the WT_CURSOR::insert method to insert rows. When + bulk-loading, rows must be loaded in sorted order. The value + is usually a true/false flag; when bulk-loading fixed-length + column store objects, the special value \c bitmap allows chunks + of a memory resident bitmap to be loaded directly into a file + by passing a \c WT_ITEM to WT_CURSOR::set_value where the \c + size field indicates the number of records in the bitmap (as + specified by the object's \c value_format configuration). + Bulk-loaded bitmap values must end on a byte boundary relative + to the bit count (except for the last set of values loaded)'''), Config('checkpoint', '', r''' the name of a checkpoint to open (the reserved name "WiredTigerCheckpoint" opens the most recent internal @@ -843,12 +841,20 @@ methods = { with the @ref util_dump and @ref util_load commands''', choices=['hex', 'json', 'print']), Config('next_random', 'false', r''' - configure the cursor to return a pseudo-random record from - the object; valid only for row-store cursors. 
Cursors - configured with \c next_random=true only support the - WT_CURSOR::next and WT_CURSOR::close methods. See @ref - cursor_random for details''', + configure the cursor to return a pseudo-random record from the + object when the WT_CURSOR::next method is called; valid only for + row-store cursors. See @ref cursor_random for details''', type='boolean'), + Config('next_random_sample_size', '0', r''' + cursors configured by \c next_random to return pseudo-random + records from the object randomly select from the entire object, + by default. Setting \c next_random_sample_size to a non-zero + value sets the number of samples the application expects to take + using the \c next_random cursor. A cursor configured with both + \c next_random and \c next_random_sample_size attempts to divide + the object into \c next_random_sample_size equal-sized pieces, + and each retrieval returns a record from one of those pieces. See + @ref cursor_random for details'''), Config('raw', 'false', r''' ignore the encodings for the key and value, manage data as if the formats were \c "u". 
See @ref cursor_raw for details''', diff --git a/dist/filelist b/dist/filelist index 52af87c2a68..dde090e5a85 100644 --- a/dist/filelist +++ b/dist/filelist @@ -119,6 +119,7 @@ src/os_posix/os_mtx_cond.c src/os_posix/os_mtx_rw.c src/os_posix/os_once.c src/os_posix/os_open.c +src/os_posix/os_pagesize.c src/os_posix/os_path.c src/os_posix/os_priv.c src/os_posix/os_remove.c diff --git a/dist/flags.py b/dist/flags.py index 1965dfb7dbe..7d237dd39a4 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -37,6 +37,7 @@ flags = { 'READ_NO_WAIT', 'READ_PREV', 'READ_SKIP_INTL', + 'READ_SKIP_LEAF', 'READ_TRUNCATE', 'READ_WONT_NEED', ], diff --git a/dist/log.py b/dist/log.py index feeb053db3e..6d35bf2e718 100644 --- a/dist/log.py +++ b/dist/log.py @@ -8,14 +8,15 @@ import log_data tmp_file = '__tmp' # Map log record types to: -# (C type, pack type, printf format, printf arg(s), printf setup) +# (C type, pack type, printf format, printf arg(s), list of setup functions) field_types = { - 'string' : ('const char *', 'S', '%s', 'arg', ''), + 'string' : ('const char *', 'S', '%s', 'arg', [ '' ]), 'item' : ('WT_ITEM *', 'u', '%s', 'escaped', - 'WT_ERR(__logrec_jsonify_str(session, &escaped, &arg));'), - 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', ''), - 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', ''), - 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', ''), + [ 'WT_ERR(__logrec_make_json_str(session, &escaped, &arg));', + 'WT_ERR(__logrec_make_hex_str(session, &escaped, &arg));']), + 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', [ '' ]), + 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', [ '' ]), + 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', [ '' ]), } def cintype(f): @@ -38,15 +39,13 @@ def clocaltype(f): return type def escape_decl(fields): - for f in fields: - if 'escaped' in field_types[f[0]][4]: - return '\n\tchar *escaped;' - return '' + return '\n\tchar *escaped;' if has_escape(fields) else '' def has_escape(fields): for f in fields: - if 'escaped' in 
field_types[f[0]][4]: - return True + for setup in field_types[f[0]][4]: + if 'escaped' in setup: + return True return False def pack_fmt(fields): @@ -65,10 +64,38 @@ def printf_arg(f): arg = field_types[f[0]][3].replace('arg', f[1]) return ' ' + arg -def printf_setup(f): - stmt = field_types[f[0]][4].replace('arg', f[1]) - return '' if stmt == '' else stmt + '\n\t' - +def printf_setup(f, i, nl_indent): + stmt = field_types[f[0]][4][i].replace('arg', f[1]) + return '' if stmt == '' else stmt + nl_indent + +def n_setup(f): + return len(field_types[f[0]][4]) + +# Create a printf line, with an optional setup function. +# ishex indicates that the the field name in the output is modified +# (to add "-hex"), and that the setup and printf are conditional +# in the generated code. +def printf_line(f, optype, i, ishex): + ifbegin = '' + ifend = '' + nl_indent = '\n\t' + name = f[1] + postcomma = '' if i + 1 == len(optype.fields) else ',\\n' + precomma = '' + if ishex > 0: + name += '-hex' + ifend = nl_indent + '}' + nl_indent += '\t' + ifbegin = 'if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {' + nl_indent + if postcomma == '': + precomma = ',\\n' + body = '%s%s(__wt_fprintf(out,' % ( + printf_setup(f, ishex, nl_indent), + 'WT_ERR' if has_escape(optype.fields) else 'WT_RET') + \ + '%s "%s \\"%s\\": \\"%s\\"%s",%s));' % ( + nl_indent, precomma, name, printf_fmt(f), postcomma, + printf_arg(f)) + return ifbegin + body + ifend ##################################################################### # Update log.h with #defines for types @@ -176,7 +203,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src, } static int -__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) { \tsize_t needed; @@ -185,6 +212,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) \t(void)__logrec_json_unpack_str(*destp, needed, item->data, item->size); \treturn (0); 
} + +static int +__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +{ +\tsize_t needed; + +\tneeded = item->size * 2 + 1; +\tWT_RET(__wt_realloc(session, NULL, needed, destp)); +\t__wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL); +\treturn (0); +} ''') # Emit code to read, write and print log operations (within a log record) @@ -255,11 +293,12 @@ __wt_logop_%(name)s_unpack( tfile.write(''' int __wt_logop_%(name)s_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { %(arg_ret)s\t%(arg_decls)s -\t%(arg_init)sWT_RET(__wt_logop_%(name)s_unpack( +\t%(arg_unused)s%(arg_init)sWT_RET(__wt_logop_%(name)s_unpack( \t session, pp, end%(arg_addrs)s)); \tWT_RET(__wt_fprintf(out, " \\"optype\\": \\"%(name)s\\",\\n")); @@ -272,27 +311,22 @@ __wt_logop_%(name)s_print( 'arg_decls' : ('\n\t'.join('%s%s%s;' % (clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1]) for f in optype.fields)) + escape_decl(optype.fields), + 'arg_unused' : ('' if has_escape(optype.fields) + else 'WT_UNUSED(flags);\n\t'), 'arg_init' : ('escaped = NULL;\n\t' if has_escape(optype.fields) else ''), 'arg_fini' : ('\nerr:\t__wt_free(session, escaped);\n\treturn (ret);' if has_escape(optype.fields) else '\treturn (0);'), 'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields), - 'print_args' : '\n\t'.join( - '%s%s(__wt_fprintf(out,\n\t " \\"%s\\": \\"%s\\",\\n",%s));' % - (printf_setup(f), - 'WT_ERR' if has_escape(optype.fields) else 'WT_RET', - f[1], printf_fmt(f), printf_arg(f)) - for f in optype.fields[:-1]) + str( - '\n\t%s%s(__wt_fprintf(out,\n\t " \\"%s\\": \\"%s\\"",%s));' % - (printf_setup(last_field), - 'WT_ERR' if has_escape(optype.fields) else 'WT_RET', - last_field[1], printf_fmt(last_field), printf_arg(last_field))), + 'print_args' : '\n\t'.join(printf_line(f, optype, i, s) + for i,f in enumerate(optype.fields) 
for s in range(0, n_setup(f))) }) # Emit the printlog entry point tfile.write(''' int __wt_txn_op_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { \tuint32_t optype, opsize; @@ -308,7 +342,8 @@ for optype in log_data.optypes: tfile.write(''' \tcase %(macro)s: -\t\tWT_RET(%(print_func)s(session, pp, end, out)); +\t\tWT_RET(%(print_func)s(session, pp, end, out, +\t\t flags)); \t\tbreak; ''' % { 'macro' : optype.macro_name(), diff --git a/dist/s_copyright b/dist/s_copyright index 020be6ae33d..0816274a367 100755 --- a/dist/s_copyright +++ b/dist/s_copyright @@ -6,6 +6,7 @@ c1=__wt.copyright.1 c2=__wt.copyright.2 c3=__wt.copyright.3 c4=__wt.copyright.4 +c5=__wt.copyright.5 check() { @@ -34,6 +35,9 @@ check() if `sed -e 1,3p -e 4q -e d $1 | diff - dist/$c4 > /dev/null` ; then return; fi + if `sed -e 2,7p -e 8q -e d $1 | diff - dist/$c5 > /dev/null` ; then + return; + fi echo "$1: copyright information is incorrect" exit 1 @@ -81,6 +85,16 @@ cat > $c4 <<ENDOFTEXT # This is free and unencumbered software released into the public domain. ENDOFTEXT +cat > $c5 <<ENDOFTEXT + * Copyright (c) 2014-$year MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +ENDOFTEXT + # Search for files, skipping some well-known 3rd party directories. (cd .. && find [a-z]* -name '*.[chi]' \ -o -name '*.cxx' \ diff --git a/dist/s_funcs b/dist/s_funcs index 3769ccc4aa7..5fee03b5615 100755 --- a/dist/s_funcs +++ b/dist/s_funcs @@ -6,7 +6,7 @@ trap 'rm -f $t; exit 0' 0 1 2 3 13 15 # List of files to search. 
l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist` -l="$l `echo ../src/*/*.i ../src/utilities/*.c`" +l="$l `echo ../src/*/*.i ../src/utilities/*.c ../bench/wtperf/*.c`" ( # Copy out the functions we don't use, but it's OK. diff --git a/dist/s_longlines b/dist/s_longlines index 15ca5603385..decedb58f44 100755 --- a/dist/s_longlines +++ b/dist/s_longlines @@ -8,10 +8,11 @@ l=`(cd .. && find bench/wtperf examples ext src test -name '*.[chisy]' && find dist -name '*.py' && find src -name '*.in') | - sed -e '/include\/extern\.h/d'\ - -e '/support\/stat\.c/d'` + sed -e '/dist\/stat_data\.py/d' \ + -e '/support\/stat\.c/d' \ + -e '/include\/extern\.h/d'` for f in $l ; do expand -t8 < ../$f | awk -- \ - "{if(length(\$0) > 80) printf(\"%s:%d\\n\", \"$f\", NR)}" + "{if(length(\$0) > 80) printf(\"%s:%d\\n\", \"$f\", NR)}" done diff --git a/dist/s_string.ok b/dist/s_string.ok index b408888970b..27583402259 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -144,6 +144,7 @@ INIT INITIALIZER INMEM INTL +ISA ITEMs Inline Intra @@ -180,6 +181,7 @@ LevelDB Levyx Llqr Llqrt +LoadLoad LockFile Lookaside Lookup @@ -187,6 +189,7 @@ MALLOC MEM MEMALIGN MERCHANTABILITY +MONGODB MSVC MULTIBLOCK MUTEX @@ -283,10 +286,13 @@ Spinlock Spinlocks Split's Stoica +StoreLoad +StoreStore TAILQ TCMalloc TODO TORTIOUS +TSO TXN TXNC Timespec @@ -301,6 +307,7 @@ ULINE URI URIs UTF +UltraSparc Unbuffered UnixLib Unmap @@ -371,6 +378,7 @@ automake bInheritHandle basecfg basho +bcr bdb beginthreadex bigram @@ -412,6 +420,7 @@ bzip calloc cas catfmt +ccr cd centric cfg @@ -423,6 +432,7 @@ checkpointer checkpointing checksum checksums +children's chk chongo cip @@ -711,6 +721,7 @@ lookaside lookup lookups lossy +lr lrt lru lseek @@ -719,6 +730,7 @@ lsn lsnappy lt lu +lwsync lz lzo madvise @@ -726,6 +738,8 @@ majorp malloc marshall marshalled +mbll +mbss mem memalign membar @@ -802,6 +816,7 @@ os ovfl ownp packv +pagesize parens pareto parserp @@ -1022,6 +1037,7 @@ variable's vectorized versa 
vfprintf +vm vpack vprintf vrfy diff --git a/dist/s_win b/dist/s_win index cdfc71a8a1e..1eb4702d517 100755 --- a/dist/s_win +++ b/dist/s_win @@ -62,6 +62,7 @@ win_filelist() -e 's;os_posix/os_mtx_cond.c;os_win/os_mtx_cond.c;' \ -e 's;os_posix/os_once.c;os_win/os_once.c;' \ -e 's;os_posix/os_open.c;os_win/os_open.c;' \ + -e 's;os_posix/os_pagesize.c;os_win/os_pagesize.c;' \ -e 's;os_posix/os_path.c;os_win/os_path.c;' \ -e 's;os_posix/os_priv.c;os_win/os_priv.c;' \ -e 's;os_posix/os_remove.c;os_win/os_remove.c;' \ diff --git a/dist/stat.py b/dist/stat.py index d62fda3fcb9..6dcfccfeab5 100644 --- a/dist/stat.py +++ b/dist/stat.py @@ -171,9 +171,7 @@ __wt_stat_''' + name + '''_aggregate_single( { ''') for l in sorted(list): - if 'no_aggregate' in l.flags: - o = '\tto->' + l.name + ' = from->' + l.name + ';\n' - elif 'max_aggregate' in l.flags: + if 'max_aggregate' in l.flags: o = '\tif (from->' + l.name + ' > to->' + l.name + ')\n' +\ '\t\tto->' + l.name + ' = from->' + l.name + ';\n' else: @@ -197,12 +195,12 @@ __wt_stat_''' + name + '''_aggregate( f.write('\tint64_t v;\n\n') break; for l in sorted(list): - if 'no_aggregate' in l.flags: - o = '\tto->' + l.name + ' = from[0]->' + l.name + ';\n' - elif 'max_aggregate' in l.flags: - o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) >\n' +\ - '\t to->' + l.name + ')\n' +\ - '\t\tto->' + l.name + ' = v;\n' + if 'max_aggregate' in l.flags: + o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) > ' +\ + 'to->' + l.name + ')\n' + if len(o) > 72: # Account for the leading tab. + o = o.replace(' > ', ' >\n\t ') + o +='\t\tto->' + l.name + ' = v;\n' else: o = '\tto->' + l.name + ' += WT_STAT_READ(from, ' + l.name + ');\n' if len(o) > 72: # Account for the leading tab. diff --git a/dist/stat_data.py b/dist/stat_data.py index 3a23071a3f2..41a93961079 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -8,20 +8,13 @@ # NOTE: All statistics descriptions must have a prefix string followed by ':'. 
# # Data-source statistics are normally aggregated across the set of underlying -# objects. Additional optionaly configuration flags are available: -# no_aggregate Ignore the value when aggregating statistics +# objects. Additional optional configuration flags are available: # max_aggregate Take the maximum value when aggregating statistics -# -# Optional configuration flags: # no_clear Value not cleared when statistics cleared # no_scale Don't scale value per second in the logging tool script # -# The no_clear flag is a little complicated: it means we don't clear the values -# when resetting statistics after each run (necessary when the WiredTiger engine -# is updating values that persist over multiple runs, for example the count of -# cursors), but it also causes the underlying display routines to not treat the -# change between displays as relative to the number of seconds, that is, it's an -# absolute value. The no_clear flag should be set in either case. +# The no_clear and no_scale flags are normally always set together (values that +# are maintained over time are normally not scaled per second). 
from operator import attrgetter import sys @@ -129,13 +122,11 @@ connection_stats = [ # Async API statistics ########################################## AsyncStat('async_alloc_race', 'number of allocation state races'), - AsyncStat('async_alloc_view', - 'number of operation slots viewed for allocation'), + AsyncStat('async_alloc_view', 'number of operation slots viewed for allocation'), AsyncStat('async_cur_queue', 'current work queue length'), AsyncStat('async_flush', 'number of flush calls'), AsyncStat('async_full', 'number of times operation allocation failed'), - AsyncStat('async_max_queue', - 'maximum work queue length', 'no_clear,no_scale'), + AsyncStat('async_max_queue', 'maximum work queue length', 'no_clear,no_scale'), AsyncStat('async_nowork', 'number of times worker found no work'), AsyncStat('async_op_alloc', 'total allocations'), AsyncStat('async_op_compact', 'total compact calls'), @@ -158,89 +149,59 @@ connection_stats = [ ########################################## # Cache and eviction statistics ########################################## - CacheStat('cache_bytes_dirty', - 'tracked dirty bytes in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_internal', - 'tracked bytes belonging to internal pages in the cache', - 'no_clear,no_scale'), - CacheStat('cache_bytes_inuse', - 'bytes currently in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_leaf', - 'tracked bytes belonging to leaf pages in the cache', - 'no_clear,no_scale'), - CacheStat('cache_bytes_max', - 'maximum bytes configured', 'no_clear,no_scale'), - CacheStat('cache_bytes_overflow', - 'tracked bytes belonging to overflow pages in the cache', - 'no_clear,no_scale'), + CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale'), + CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in the cache', 'no_clear,no_scale'), + CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale'), + 
CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale'), + CacheStat('cache_bytes_max', 'maximum bytes configured', 'no_clear,no_scale'), + CacheStat('cache_bytes_overflow', 'tracked bytes belonging to overflow pages in the cache', 'no_clear,no_scale'), CacheStat('cache_bytes_read', 'bytes read into cache'), CacheStat('cache_bytes_write', 'bytes written from cache'), CacheStat('cache_eviction_app', 'pages evicted by application threads'), CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), - CacheStat('cache_eviction_deepen', - 'page split during eviction deepened the tree'), + CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'), CacheStat('cache_eviction_dirty', 'modified pages evicted'), - CacheStat('cache_eviction_fail', - 'pages selected for eviction unable to be evicted'), - CacheStat('cache_eviction_force', - 'pages evicted because they exceeded the in-memory maximum'), - CacheStat('cache_eviction_force_delete', - 'pages evicted because they had chains of deleted items'), - CacheStat('cache_eviction_force_fail', - 'failed eviction of pages that exceeded the in-memory maximum'), + CacheStat('cache_eviction_fail', 'pages selected for eviction unable to be evicted'), + CacheStat('cache_eviction_force', 'pages evicted because they exceeded the in-memory maximum'), + CacheStat('cache_eviction_force_delete', 'pages evicted because they had chains of deleted items'), + CacheStat('cache_eviction_force_fail', 'failed eviction of pages that exceeded the in-memory maximum'), CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'), CacheStat('cache_eviction_internal', 'internal pages evicted'), - CacheStat('cache_eviction_maximum_page_size', - 'maximum page size at eviction', 'no_clear,no_scale'), - CacheStat('cache_eviction_queue_empty', - 'eviction server candidate queue empty when 
topping up'), - CacheStat('cache_eviction_queue_not_empty', - 'eviction server candidate queue not empty when topping up'), - CacheStat('cache_eviction_server_evicting', - 'eviction server evicting pages'), - CacheStat('cache_eviction_server_not_evicting', - 'eviction server populating queue, but not evicting pages'), - CacheStat('cache_eviction_slow', - 'eviction server unable to reach eviction goal'), - CacheStat('cache_eviction_split_internal', - 'internal pages split during eviction'), + CacheStat('cache_eviction_maximum_page_size', 'maximum page size at eviction', 'no_clear,no_scale'), + CacheStat('cache_eviction_queue_empty', 'eviction server candidate queue empty when topping up'), + CacheStat('cache_eviction_queue_not_empty', 'eviction server candidate queue not empty when topping up'), + CacheStat('cache_eviction_server_evicting', 'eviction server evicting pages'), + CacheStat('cache_eviction_server_not_evicting', 'eviction server populating queue, but not evicting pages'), + CacheStat('cache_eviction_slow', 'eviction server unable to reach eviction goal'), + CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'), CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'), CacheStat('cache_eviction_walk', 'pages walked for eviction'), - CacheStat('cache_eviction_worker_evicting', - 'eviction worker thread evicting pages'), + CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'), CacheStat('cache_inmem_split', 'in-memory page splits'), - CacheStat('cache_inmem_splittable', - 'in-memory page passed criteria to be split'), + CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'), CacheStat('cache_lookaside_insert', 'lookaside table insert calls'), CacheStat('cache_lookaside_remove', 'lookaside table remove calls'), CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'), - CacheStat('cache_pages_dirty', - 'tracked dirty pages in the cache', 
'no_clear,no_scale'), - CacheStat('cache_pages_inuse', - 'pages currently held in the cache', 'no_clear,no_scale'), + CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_clear,no_scale'), + CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'), CacheStat('cache_read', 'pages read into cache'), - CacheStat('cache_read_lookaside', - 'pages read into cache requiring lookaside entries'), + CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'), CacheStat('cache_write', 'pages written from cache'), - CacheStat('cache_write_lookaside', - 'page written requiring lookaside records'), - CacheStat('cache_write_restore', - 'pages written requiring in-memory restoration'), + CacheStat('cache_write_lookaside', 'page written requiring lookaside records'), + CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## # Dhandle statistics ########################################## - DhandleStat('dh_conn_handle_count', - 'connection data handles currently active', 'no_clear,no_scale'), + DhandleStat('dh_conn_handle_count', 'connection data handles currently active', 'no_clear,no_scale'), + DhandleStat('dh_session_handles', 'session dhandles swept'), + DhandleStat('dh_session_sweeps', 'session sweep attempts'), DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'), - DhandleStat('dh_sweep_remove', - 'connection sweep dhandles removed from hash list'), DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'), + DhandleStat('dh_sweep_remove', 'connection sweep dhandles removed from hash list'), DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'), DhandleStat('dh_sweeps', 'connection sweeps'), - DhandleStat('dh_session_handles', 'session dhandles swept'), - DhandleStat('dh_session_sweeps', 'session sweep attempts'), ########################################## # Logging statistics @@ -257,10 
+218,8 @@ connection_stats = [ LogStat('log_flush', 'log flush operations'), LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'), LogStat('log_prealloc_files', 'pre-allocated log files prepared'), - LogStat('log_prealloc_max', - 'number of pre-allocated log files to create', 'no_clear,no_scale'), - LogStat('log_prealloc_missed', - 'pre-allocated log files not ready and missed'), + LogStat('log_prealloc_max', 'number of pre-allocated log files to create', 'no_clear,no_scale'), + LogStat('log_prealloc_missed', 'pre-allocated log files not ready and missed'), LogStat('log_prealloc_used', 'pre-allocated log files used'), LogStat('log_release_write_lsn', 'log release advances write LSN'), LogStat('log_scan_records', 'records processed by log scan'), @@ -283,46 +242,32 @@ connection_stats = [ ########################################## # Reconciliation statistics ########################################## - RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_page_delete', 'pages deleted'), RecStat('rec_page_delete_fast', 'fast-path pages deleted'), + RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), - RecStat('rec_split_stashed_bytes', - 'split bytes currently awaiting free', 'no_clear,no_scale'), - RecStat('rec_split_stashed_objects', - 'split objects currently awaiting free', 'no_clear,no_scale'), + RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale'), + RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'), ########################################## # Transaction statistics ########################################## TxnStat('txn_begin', 'transaction begins'), TxnStat('txn_checkpoint', 'transaction checkpoints'), - TxnStat('txn_checkpoint_generation', - 'transaction checkpoint generation', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_running', - 'transaction checkpoint 
currently running', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_max', - 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_min', - 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_recent', - 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_total', - 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_min', 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_recent', 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), TxnStat('txn_commit', 'transactions committed'), - TxnStat('txn_fail_cache', - 'transaction failures due to cache overflow'), - TxnStat('txn_pinned_checkpoint_range', - 'transaction range of IDs currently pinned by a checkpoint', - 'no_clear,no_scale'), - TxnStat('txn_pinned_range', - 'transaction range of IDs currently pinned', 'no_clear,no_scale'), - TxnStat('txn_pinned_snapshot_range', - 'transaction range of IDs currently pinned by named snapshots', - 'no_clear,no_scale'), + TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), + TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'), + TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), + TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), 
+ TxnStat('txn_rollback', 'transactions rolled back'), TxnStat('txn_snapshots_created', 'number of named snapshots created'), TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'), - TxnStat('txn_rollback', 'transactions rolled back'), TxnStat('txn_sync', 'transaction sync calls'), ########################################## @@ -331,23 +276,18 @@ connection_stats = [ LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'), LSMStat('lsm_rows_merged', 'rows merged in an LSM tree'), - LSMStat('lsm_work_queue_app', - 'application work units currently queued', 'no_clear,no_scale'), - LSMStat('lsm_work_queue_manager', - 'merge work units currently queued', 'no_clear,no_scale'), + LSMStat('lsm_work_queue_app', 'application work units currently queued', 'no_clear,no_scale'), + LSMStat('lsm_work_queue_manager', 'merge work units currently queued', 'no_clear,no_scale'), LSMStat('lsm_work_queue_max', 'tree queue hit maximum'), - LSMStat('lsm_work_queue_switch', - 'switch work units currently queued', 'no_clear,no_scale'), + LSMStat('lsm_work_queue_switch', 'switch work units currently queued', 'no_clear,no_scale'), LSMStat('lsm_work_units_created', 'tree maintenance operations scheduled'), - LSMStat('lsm_work_units_discarded', - 'tree maintenance operations discarded'), + LSMStat('lsm_work_units_discarded', 'tree maintenance operations discarded'), LSMStat('lsm_work_units_done', 'tree maintenance operations executed'), ########################################## # Session operations ########################################## - SessionStat('session_cursor_open', - 'open cursor count', 'no_clear,no_scale'), + SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), SessionStat('session_open', 'open session count', 'no_clear,no_scale'), ########################################## @@ -385,8 +325,7 @@ dsrc_stats = [ # Session operations 
########################################## SessionStat('session_compact', 'object compaction'), - SessionStat('session_cursor_open', - 'open cursor count', 'no_clear,no_scale'), + SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), ########################################## # Cursor operations @@ -394,8 +333,7 @@ dsrc_stats = [ CursorStat('cursor_create', 'create calls'), CursorStat('cursor_insert', 'insert calls'), CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), - CursorStat('cursor_insert_bytes', - 'cursor-insert key and value bytes inserted'), + CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted'), CursorStat('cursor_next', 'next calls'), CursorStat('cursor_prev', 'prev calls'), CursorStat('cursor_remove', 'remove calls'), @@ -411,33 +349,21 @@ dsrc_stats = [ ########################################## # Btree statistics ########################################## - BtreeStat('btree_checkpoint_generation', - 'btree checkpoint generation', 'no_clear,no_scale'), - BtreeStat('btree_column_deleted', - 'column-store variable-size deleted values', 'no_scale'), - BtreeStat('btree_column_fix', - 'column-store fixed-size leaf pages', 'no_scale'), - BtreeStat('btree_column_internal', - 'column-store internal pages', 'no_scale'), - BtreeStat('btree_column_rle', - 'column-store variable-size RLE encoded values', 'no_scale'), - BtreeStat('btree_column_variable', - 'column-store variable-size leaf pages', 'no_scale'), + BtreeStat('btree_checkpoint_generation', 'btree checkpoint generation', 'no_clear,no_scale'), + BtreeStat('btree_column_deleted', 'column-store variable-size deleted values', 'no_scale'), + BtreeStat('btree_column_fix', 'column-store fixed-size leaf pages', 'no_scale'), + BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale'), + BtreeStat('btree_column_rle', 'column-store variable-size RLE encoded values', 'no_scale'), + BtreeStat('btree_column_variable', 
'column-store variable-size leaf pages', 'no_scale'), BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'), BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale'), - BtreeStat('btree_fixed_len', 'fixed-record size', 'no_aggregate,no_scale'), - BtreeStat('btree_maximum_depth', - 'maximum tree depth', 'max_aggregate,no_scale'), - BtreeStat('btree_maxintlkey', - 'maximum internal page key size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxintlpage', - 'maximum internal page size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxleafkey', - 'maximum leaf page key size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxleafpage', - 'maximum leaf page size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxleafvalue', - 'maximum leaf page value size', 'max_aggregate,no_scale'), + BtreeStat('btree_fixed_len', 'fixed-record size', 'max_aggregate,no_scale'), + BtreeStat('btree_maximum_depth', 'maximum tree depth', 'max_aggregate,no_scale'), + BtreeStat('btree_maxintlkey', 'maximum internal page key size', 'max_aggregate,no_scale'), + BtreeStat('btree_maxintlpage', 'maximum internal page size', 'max_aggregate,no_scale'), + BtreeStat('btree_maxleafkey', 'maximum leaf page key size', 'max_aggregate,no_scale'), + BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'max_aggregate,no_scale'), + BtreeStat('btree_maxleafvalue', 'maximum leaf page value size', 'max_aggregate,no_scale'), BtreeStat('btree_overflow', 'overflow pages', 'no_scale'), BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale'), BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale'), @@ -454,26 +380,21 @@ dsrc_stats = [ LSMStat('bloom_size', 'total size of bloom filters', 'no_scale'), LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'), - LSMStat('lsm_generation_max', - 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), - 
LSMStat('lsm_lookup_no_bloom', - 'queries that could have benefited ' + - 'from a Bloom filter that did not exist'), + LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), + LSMStat('lsm_lookup_no_bloom', 'queries that could have benefited from a Bloom filter that did not exist'), LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'), ########################################## # Block manager statistics ########################################## - BlockStat('allocation_size', - 'file allocation unit size', 'no_aggregate,no_scale'), + BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale'), BlockStat('block_alloc', 'blocks allocated'), BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'), BlockStat('block_extension', 'allocations requiring file extension'), BlockStat('block_free', 'blocks freed'), - BlockStat('block_magic', 'file magic number', 'no_aggregate,no_scale'), - BlockStat('block_major', - 'file major version number', 'no_aggregate,no_scale'), - BlockStat('block_minor', 'minor version number', 'no_aggregate,no_scale'), + BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'), + BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'), + BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'), BlockStat('block_reuse_bytes', 'file bytes available for reuse'), BlockStat('block_size', 'file size in bytes', 'no_scale'), @@ -484,44 +405,33 @@ dsrc_stats = [ CacheStat('cache_bytes_write', 'bytes written from cache'), CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), - CacheStat('cache_eviction_deepen', - 'page split during eviction deepened the tree'), + CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'), CacheStat('cache_eviction_dirty', 'modified pages evicted'), - 
CacheStat('cache_eviction_fail', - 'data source pages selected for eviction unable to be evicted'), + CacheStat('cache_eviction_fail', 'data source pages selected for eviction unable to be evicted'), CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'), CacheStat('cache_eviction_internal', 'internal pages evicted'), - CacheStat('cache_eviction_split_internal', - 'internal pages split during eviction'), + CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'), CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'), CacheStat('cache_inmem_split', 'in-memory page splits'), - CacheStat('cache_inmem_splittable', - 'in-memory page passed criteria to be split'), - CacheStat('cache_overflow_value', - 'overflow values cached in memory', 'no_scale'), + CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'), + CacheStat('cache_overflow_value', 'overflow values cached in memory', 'no_scale'), CacheStat('cache_read', 'pages read into cache'), - CacheStat('cache_read_lookaside', - 'pages read into cache requiring lookaside entries'), + CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), - CacheStat('cache_write_lookaside', - 'page written requiring lookaside records'), - CacheStat('cache_write_restore', - 'pages written requiring in-memory restoration'), + CacheStat('cache_write_lookaside', 'page written requiring lookaside records'), + CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## # Compression statistics ########################################## - CompressStat('compress_raw_fail', - 'raw compression call failed, no additional data available'), - CompressStat('compress_raw_fail_temporary', - 'raw compression call failed, additional data 
available'), + CompressStat('compress_raw_fail', 'raw compression call failed, no additional data available'), + CompressStat('compress_raw_fail_temporary', 'raw compression call failed, additional data available'), CompressStat('compress_raw_ok', 'raw compression call succeeded'), CompressStat('compress_read', 'compressed pages read'), CompressStat('compress_write', 'compressed pages written'), CompressStat('compress_write_fail', 'page written failed to compress'), - CompressStat('compress_write_too_small', - 'page written was too small to compress'), + CompressStat('compress_write_too_small', 'page written was too small to compress'), ########################################## # Reconciliation statistics @@ -529,8 +439,7 @@ dsrc_stats = [ RecStat('rec_dictionary', 'dictionary matches'), RecStat('rec_multiblock_internal', 'internal page multi-block writes'), RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'), - RecStat('rec_multiblock_max', - 'maximum blocks required for a page', 'max_aggregate,no_scale'), + RecStat('rec_multiblock_max', 'maximum blocks required for a page', 'max_aggregate,no_scale'), RecStat('rec_overflow_key_internal', 'internal-page overflow keys'), RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'), RecStat('rec_overflow_value', 'overflow values written'), @@ -539,10 +448,8 @@ dsrc_stats = [ RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), - RecStat('rec_prefix_compression', - 'leaf page key bytes discarded using prefix compression'), - RecStat('rec_suffix_compression', - 'internal page key bytes discarded using suffix compression'), + RecStat('rec_prefix_compression', 'leaf page key bytes discarded using prefix compression'), + RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression'), ########################################## # Transaction statistics diff 
--git a/src/block/block_compact.c b/src/block/block_compact.c index d45d0a96da7..cd304b848d4 100644 --- a/src/block/block_compact.c +++ b/src/block/block_compact.c @@ -8,7 +8,7 @@ #include "wt_internal.h" -static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *); +static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool); /* * __wt_block_compact_start -- @@ -22,8 +22,6 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) /* Switch to first-fit allocation. */ __wt_block_configure_first_fit(block, true); - block->compact_pct_tenths = 0; - return (0); } @@ -34,14 +32,21 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) { + WT_DECL_RET; + WT_UNUSED(session); /* Restore the original allocation plan. */ __wt_block_configure_first_fit(block, false); - block->compact_pct_tenths = 0; + /* Dump the results of the compaction pass. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + __wt_spin_lock(session, &block->live_lock); + ret = __block_dump_avail(session, block, false); + __wt_spin_unlock(session, &block->live_lock); + } - return (0); + return (ret); } /* @@ -70,12 +75,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) if (fh->size <= WT_MEGABYTE) return (0); + /* + * Reset the compaction state information. This is done here, not in the + * compaction "start" routine, because this function is called first to + * determine if compaction is useful. + */ + block->compact_pct_tenths = 0; + block->compact_pages_reviewed = 0; + block->compact_pages_skipped = 0; + block->compact_pages_written = 0; + __wt_spin_lock(session, &block->live_lock); + /* Dump the current state of the file. */ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) - WT_ERR(__block_dump_avail(session, block)); + WT_ERR(__block_dump_avail(session, block, true)); - /* Sum the available bytes in the first 80% and 90% of the file. 
*/ + /* Sum the available bytes in the initial 80% and 90% of the file. */ avail_eighty = avail_ninety = 0; ninety = fh->size - fh->size / 10; eighty = fh->size - ((fh->size / 10) * 2); @@ -88,23 +104,6 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) avail_eighty += ext->size; } - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "80%% of the file", - block->name, - (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty)); - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "90%% of the file", - block->name, - (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety)); - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " - "90%% of the file to perform compaction, compaction %s", - block->name, - (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, - *skipp ? "skipped" : "proceeding")); - /* * Skip files where we can't recover at least 1MB. * @@ -127,6 +126,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) block->compact_pct_tenths = 1; } + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " + "80%% of the file", + block->name, + (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty)); + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " + "90%% of the file", + block->name, + (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety)); + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " + "90%% of the file to perform compaction, compaction %s", + block->name, + (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, + *skipp ? 
"skipped" : "proceeding")); + err: __wt_spin_unlock(session, &block->live_lock); return (ret); @@ -177,6 +193,14 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session, } __wt_spin_unlock(session, &block->live_lock); + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + ++block->compact_pages_reviewed; + if (*skipp) + ++block->compact_pages_skipped; + else + ++block->compact_pages_written; + } + return (ret); } @@ -185,7 +209,7 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session, * Dump out the avail list so we can see what compaction will look like. */ static int -__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) +__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start) { WT_EXTLIST *el; WT_EXT *ext; @@ -196,6 +220,20 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) size = block->fh->size; WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "============ %s", + start ? "testing for compaction" : "ending compaction pass")); + + if (!start) { + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages reviewed: %" PRIuMAX, + block->compact_pages_reviewed)); + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages skipped: %" PRIuMAX, block->compact_pages_skipped)); + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages written: %" PRIuMAX, block->compact_pages_written)); + } + + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX "%% space available %" PRIuMAX "MB (%" PRIuMAX ")", (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size, @@ -219,6 +257,10 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) } #ifdef __VERBOSE_OUTPUT_PERCENTILE + /* + * The verbose output always displays 10% buckets, running this code + * as well also displays 1% buckets. 
+ */ for (i = 0; i < WT_ELEMENTS(percentile); ++i) { v = percentile[i] * 512; WT_RET(__wt_verbose(session, WT_VERB_COMPACT, diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 7260cab75d9..f9f66e05d7f 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -221,6 +221,18 @@ __bm_free(WT_BM *bm, } /* + * __bm_is_mapped -- + * Return if the file is mapped into memory. + */ +static bool +__bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session) +{ + WT_UNUSED(session); + + return (bm->map == NULL ? false : true); +} + +/* * __bm_stat -- * Block-manager statistics. */ @@ -357,6 +369,7 @@ __bm_method_set(WT_BM *bm, bool readonly) (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->free = (int (*)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly; + bm->is_mapped = __bm_is_mapped; bm->preload = __wt_bm_preload; bm->read = __wt_bm_read; bm->salvage_end = (int (*) @@ -367,6 +380,7 @@ __bm_method_set(WT_BM *bm, bool readonly) (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->salvage_valid = (int (*)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly; + bm->size = __wt_block_manager_size; bm->stat = __bm_stat; bm->sync = (int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly; @@ -391,12 +405,14 @@ __bm_method_set(WT_BM *bm, bool readonly) bm->compact_skip = __bm_compact_skip; bm->compact_start = __bm_compact_start; bm->free = __bm_free; + bm->is_mapped = __bm_is_mapped; bm->preload = __wt_bm_preload; bm->read = __wt_bm_read; bm->salvage_end = __bm_salvage_end; bm->salvage_next = __bm_salvage_next; bm->salvage_start = __bm_salvage_start; bm->salvage_valid = __bm_salvage_valid; + bm->size = __wt_block_manager_size; bm->stat = __bm_stat; bm->sync = __bm_sync; bm->verify_addr = __bm_verify_addr; diff --git a/src/block/block_open.c b/src/block/block_open.c index 7cf12d36066..ff70b765d1f 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -405,27 +405,37 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK 
*block, WT_DSRC_STATS *stats) * Reading from the live system's structure normally requires locking, * but it's an 8B statistics read, there's no need. */ - stats->allocation_size = block->allocsize; - stats->block_checkpoint_size = (int64_t)block->live.ckpt_size; - stats->block_magic = WT_BLOCK_MAGIC; - stats->block_major = WT_BLOCK_MAJOR_VERSION; - stats->block_minor = WT_BLOCK_MINOR_VERSION; - stats->block_reuse_bytes = (int64_t)block->live.avail.bytes; - stats->block_size = block->fh->size; + WT_STAT_WRITE(stats, allocation_size, block->allocsize); + WT_STAT_WRITE( + stats, block_checkpoint_size, (int64_t)block->live.ckpt_size); + WT_STAT_WRITE(stats, block_magic, WT_BLOCK_MAGIC); + WT_STAT_WRITE(stats, block_major, WT_BLOCK_MAJOR_VERSION); + WT_STAT_WRITE(stats, block_minor, WT_BLOCK_MINOR_VERSION); + WT_STAT_WRITE( + stats, block_reuse_bytes, (int64_t)block->live.avail.bytes); + WT_STAT_WRITE(stats, block_size, block->fh->size); } /* * __wt_block_manager_size -- - * Set the size statistic for a file. + * Return the size of a live block handle. */ int -__wt_block_manager_size( - WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats) +__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep) { - wt_off_t filesize; - - WT_RET(__wt_filesize_name(session, filename, false, &filesize)); - stats->block_size = filesize; + WT_UNUSED(session); + *sizep = bm->block->fh == NULL ? 0 : bm->block->fh->size; return (0); } + +/* + * __wt_block_manager_named_size -- + * Return the size of a named file. 
+ */ +int +__wt_block_manager_named_size( + WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep) +{ + return (__wt_filesize_name(session, name, false, sizep)); +} diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index b2c9e4b67f8..8935d39b696 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -17,9 +17,11 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_BM *bm; WT_DECL_RET; + WT_MULTI *multi; WT_PAGE *page; WT_PAGE_MODIFY *mod; size_t addr_size; + uint32_t i; const uint8_t *addr; *skipp = true; /* Default skip. */ @@ -41,29 +43,46 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* * If the page is clean, test the original addresses. - * If the page is a 1-to-1 replacement, test the replacement addresses. + * If the page is a replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. */ if (mod == NULL || mod->rec_result == 0) { - WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) return (0); - WT_RET( + return ( bm->compact_page_skip(bm, session, addr, addr_size, skipp)); - } else if (mod->rec_result == WT_PM_REC_REPLACE) { - /* - * The page's modification information can change underfoot if - * the page is being reconciled, serialize with reconciliation. - */ + } + + /* + * The page's modification information can change underfoot if the page + * is being reconciled, serialize with reconciliation. 
+ */ + if (mod->rec_result == WT_PM_REC_REPLACE || + mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_RET(__wt_fair_lock(session, &page->page_lock)); + if (mod->rec_result == WT_PM_REC_REPLACE) ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + if (multi->disk_image != NULL) + continue; + if ((ret = bm->compact_page_skip(bm, session, + multi->addr.addr, multi->addr.size, skipp)) != 0) + break; + if (!*skipp) + break; + } + + if (mod->rec_result == WT_PM_REC_REPLACE || + mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_TRET(__wt_fair_unlock(session, &page->page_lock)); - WT_RET(ret); - } - return (0); + + return (ret); } /* @@ -130,7 +149,7 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) * read, set its generation to a low value so it is evicted * quickly. */ - WT_ERR(__wt_tree_walk(session, &ref, NULL, + WT_ERR(__wt_tree_walk(session, &ref, WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED)); if (ref == NULL) break; @@ -139,7 +158,8 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) if (skip) continue; - session->compaction = true; + session->compact_state = WT_COMPACT_SUCCESS; + /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); @@ -182,7 +202,7 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * address, the page isn't on disk, but we have to read internal pages * to walk the tree regardless; throw up our hands and read it. 
*/ - WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type)); + __wt_ref_info(ref, &addr, &addr_size, &type); if (addr == NULL) return (0); diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 3c96bad39d7..6573bc60165 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -389,6 +389,14 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) */ cbt->page_deleted_count = 0; +#ifdef HAVE_DIAGNOSTIC + /* + * If starting a new iteration, clear the last-key returned, it doesn't + * apply. + */ + cbt->lastkey->size = 0; + cbt->lastrecno = WT_RECNO_OOB; +#endif /* * If we don't have a search page, then we're done, we're starting at * the beginning or end of the tree, not as a result of a search. @@ -430,6 +438,104 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) } } +#ifdef HAVE_DIAGNOSTIC +/* + * __cursor_key_order_check_col -- + * Check key ordering for column-store cursor movements. + */ +static int +__cursor_key_order_check_col( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + int cmp; + + cmp = 0; /* -Werror=maybe-uninitialized */ + + if (cbt->lastrecno != WT_RECNO_OOB) { + if (cbt->lastrecno < cbt->recno) + cmp = -1; + if (cbt->lastrecno > cbt->recno) + cmp = 1; + } + + if (cbt->lastrecno == WT_RECNO_OOB || + (next && cmp < 0) || (!next && cmp > 0)) { + cbt->lastrecno = cbt->recno; + return (0); + } + + WT_PANIC_RET(session, EINVAL, + "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then " + "key %" PRIu64, + next ? "next" : "prev", cbt->lastrecno, cbt->recno); +} + +/* + * __cursor_key_order_check_row -- + * Check key ordering for row-store cursor movements. 
+ */ +static int +__cursor_key_order_check_row( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + WT_BTREE *btree; + WT_ITEM *key; + WT_DECL_RET; + WT_DECL_ITEM(a); + WT_DECL_ITEM(b); + int cmp; + + btree = S2BT(session); + key = &cbt->iface.key; + cmp = 0; /* -Werror=maybe-uninitialized */ + + if (cbt->lastkey->size != 0) + WT_RET(__wt_compare( + session, btree->collator, cbt->lastkey, key, &cmp)); + + if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0)) + return (__wt_buf_set(session, cbt->lastkey, + cbt->iface.key.data, cbt->iface.key.size)); + + WT_ERR(__wt_scr_alloc(session, 512, &a)); + WT_ERR(__wt_buf_set_printable( + session, a, cbt->lastkey->data, cbt->lastkey->size)); + + WT_ERR(__wt_scr_alloc(session, 512, &b)); + WT_ERR(__wt_buf_set_printable(session, b, key->data, key->size)); + + WT_PANIC_ERR(session, EINVAL, + "WT_CURSOR.%s out-of-order returns: returned key %.*s then " + "key %.*s", + next ? "next" : "prev", + (int)a->size, (const char *)a->data, + (int)b->size, (const char *)b->data); + +err: __wt_scr_free(session, &a); + __wt_scr_free(session, &b); + + return (ret); +} + +/* + * __wt_cursor_key_order_check -- + * Check key ordering for cursor movements. + */ +int +__wt_cursor_key_order_check( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + switch (cbt->ref->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + return (__cursor_key_order_check_col(session, cbt, next)); + case WT_PAGE_ROW_LEAF: + return (__cursor_key_order_check_row(session, cbt, next)); + WT_ILLEGAL_VALUE(session); + } +} +#endif + /* * __wt_btcur_next -- * Move to the next record in the tree. 
@@ -527,10 +633,15 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) __wt_page_evict_soon(page); cbt->page_deleted_count = 0; - WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags)); + WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); +#endif + err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index b7cea561b48..1e4b1daa090 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -615,9 +615,13 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) __wt_page_evict_soon(page); cbt->page_deleted_count = 0; - WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags)); + WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); +#endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 69512f45933..28b51fd2865 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -62,8 +62,18 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) static inline int __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) { - return (btree->type == BTREE_COL_FIX && - !F_ISSET(cbt, WT_CBT_MAX_RECORD)); + /* + * When there's no exact match, column-store search returns the key + * nearest the searched-for key (continuing past keys smaller than the + * searched-for key to return the next-largest key). Therefore, if the + * returned comparison is -1, the searched-for key was larger than any + * row on the page's standard information or column-store insert list. + * + * If the returned comparison is NOT -1, there was a row equal to or + * larger than the searched-for key, and we implicitly create missing + * rows. 
+ */ + return (btree->type == BTREE_COL_FIX && cbt->compare != -1); } /* @@ -502,19 +512,14 @@ retry: WT_RET(__cursor_func_init(cbt, true)); case BTREE_COL_VAR: /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring - * the application's record number). First we search for the - * maximum possible record number so the search ends on the - * last page. The real record number is assigned by the - * serialized append operation. + * the application's record number). The real record number + * is assigned by the serialized append operation. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = UINT64_MAX; + cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); - if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = WT_RECNO_OOB; - /* * If not overwriting, fail if the key exists. Creating a * record past the end of the tree in a fixed-length @@ -816,7 +821,12 @@ err: if (ret == WT_RESTART) { /* * __wt_btcur_next_random -- - * Move to a random record in the tree. + * Move to a random record in the tree. There are two algorithms, one + * where we select a record at random from the whole tree on each + * retrieval and one where we first select a record at random from the + * whole tree, and then subsequently sample forward from that location. + * The sampling approach allows us to select reasonably uniform random + * points from unbalanced trees. 
*/ int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) @@ -825,6 +835,8 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; + wt_off_t size; + uint64_t skip; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; @@ -839,11 +851,65 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_STAT_FAST_CONN_INCR(session, cursor_next); WT_STAT_FAST_DATA_INCR(session, cursor_next); - WT_RET(__cursor_func_init(cbt, true)); + /* + * If retrieving random values without sampling, or we don't have a + * page reference, pick a roughly random leaf page in the tree. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { + /* + * Skip past the sample size of the leaf pages in the tree + * between each random key return to compensate for unbalanced + * trees. + * + * Use the underlying file size divided by its block allocation + * size as our guess of leaf pages in the file (this can be + * entirely wrong, as it depends on how many pages are in this + * particular checkpoint, how large the leaf and internal pages + * really are, and other factors). Then, divide that value by + * the configured sample size and increment the final result to + * make sure tiny files don't leave us with a skip value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_sample_size != 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Choose a leaf page from the tree. + */ + WT_ERR(__cursor_func_init(cbt, true)); + WT_WITH_PAGE_INDEX( + session, ret = __wt_row_random_descent(session, cbt)); + WT_ERR(ret); + } else { + /* + * Read through the tree, skipping leaf pages. Be cautious about + * the skip count: if the last leaf page skipped was also the + * last leaf page in the tree, it may be set to zero on return + * with the end-of-walk condition. 
+ * + * Pages read for data sampling aren't "useful"; don't update + * the read generation of pages already in memory, and if a page + * is read, set its generation to a low value so it is evicted + * quickly. + */ + for (skip = + cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | + WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + } - WT_WITH_PAGE_INDEX(session, - ret = __wt_row_random(session, cbt)); - WT_ERR(ret); + /* + * Select a random entry from the leaf page. If it's not valid, move to + * the next entry, if that doesn't work, move to the previous entry. + */ + WT_ERR(__wt_row_random_leaf(session, cbt)); if (__cursor_valid(cbt, &upd)) WT_ERR(__wt_kv_return(session, cbt, upd)); else { @@ -851,9 +917,9 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) ret = __wt_btcur_prev(cbt, false); WT_ERR(ret); } + return (0); -err: if (ret != 0) - WT_TRET(__cursor_reset(cbt)); +err: WT_TRET(__cursor_reset(cbt)); return (ret); } @@ -1167,6 +1233,11 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) { cbt->row_key = &cbt->_row_key; cbt->tmp = &cbt->_tmp; + +#ifdef HAVE_DIAGNOSTIC + cbt->lastkey = &cbt->_lastkey; + cbt->lastrecno = WT_RECNO_OOB; +#endif } /* @@ -1192,6 +1263,9 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); +#ifdef HAVE_DIAGNOSTIC + __wt_buf_free(session, &cbt->_lastkey); +#endif return (ret); } diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 0f47c060daf..393f869ece9 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -43,7 +43,7 @@ static int __debug_page_col_var(WT_DBG *, WT_PAGE *); static int __debug_page_metadata(WT_DBG *, WT_PAGE *); static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t); static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *); -static int __debug_ref(WT_DBG *, WT_REF *); +static void __debug_ref(WT_DBG *, WT_REF *); static void __debug_row_skip(WT_DBG *, 
WT_INSERT_HEAD *); static int __debug_tree( WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t); @@ -74,9 +74,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v) static inline void __debug_hex_byte(WT_DBG *ds, uint8_t v) { - static const char hex[] = "0123456789abcdef"; - - __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]); + __dmsg(ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]); } /* @@ -769,7 +767,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) WT_INTL_FOREACH_BEGIN(session, page, ref) { __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno); - WT_RET(__debug_ref(ds, ref)); + __debug_ref(ds, ref); } WT_INTL_FOREACH_END; if (LF_ISSET(WT_DEBUG_TREE_WALK)) @@ -843,7 +841,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) WT_INTL_FOREACH_BEGIN(session, page, ref) { __wt_ref_key(page, ref, &p, &len); __debug_item(ds, "K", p, len); - WT_RET(__debug_ref(ds, ref)); + __debug_ref(ds, ref); } WT_INTL_FOREACH_END; if (LF_ISSET(WT_DEBUG_TREE_WALK)) @@ -965,7 +963,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) * __debug_ref -- * Dump a WT_REF structure. */ -static int +static void __debug_ref(WT_DBG *ds, WT_REF *ref) { WT_SESSION_IMPL *session; @@ -994,14 +992,14 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) case WT_REF_SPLIT: __dmsg(ds, "split"); break; - WT_ILLEGAL_VALUE(session); + default: + __dmsg(ds, "INVALID"); + break; } - WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + __wt_ref_info(ref, &addr, &addr_size, NULL); __dmsg(ds, " %s\n", __wt_addr_string(session, addr, addr_size, ds->tmp)); - - return (0); } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 294cc399d65..a6330326954 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -601,7 +601,7 @@ __btree_preload(WT_SESSION_IMPL *session) /* Pre-load the second-level internal pages. 
*/ WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) { - WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr != NULL) WT_RET(bm->preload(bm, session, addr, addr_size)); } WT_INTL_FOREACH_END; @@ -622,7 +622,7 @@ __btree_get_last_recno(WT_SESSION_IMPL *session) btree = S2BT(session); next_walk = NULL; - WT_RET(__wt_tree_walk(session, &next_walk, NULL, WT_READ_PREV)); + WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV)); if (next_walk == NULL) return (WT_NOTFOUND); diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c index d9ff9616072..a34e57796a8 100644 --- a/src/btree/bt_huffman.c +++ b/src/btree/bt_huffman.c @@ -332,11 +332,17 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, for (tp = table, lineno = 1; (ret = fscanf(fp, "%" SCNi64 " %" SCNi64, &symbol, &frequency)) != EOF; ++tp, ++lineno) { - if (lineno > entries) + /* + * Entries is 0-based, that is, there are (entries +1) possible + * values that can be configured. The line number is 1-based, so + * adjust the test for too many entries, and report (entries +1) + * in the error as the maximum possible number of entries. 
+ */ + if (lineno > entries + 1) WT_ERR_MSG(session, EINVAL, "Huffman table file %.*s is corrupted, " "more than %" PRIu32 " entries", - (int)ip->len, ip->str, entries); + (int)ip->len, ip->str, entries + 1); if (ret != 2) WT_ERR_MSG(session, EINVAL, "line %u of Huffman table file %.*s is corrupted: " diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c index d2b16bb5d21..a60499ef8b7 100644 --- a/src/btree/bt_misc.c +++ b/src/btree/bt_misc.c @@ -101,7 +101,7 @@ __wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf) return (buf->data); } - (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL); + __wt_ref_info(ref, &addr, &addr_size, NULL); return (__wt_addr_string(session, addr, addr_size, buf)); } diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 8808f0b1a85..fdccf033828 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) const WT_PAGE_HEADER *dsk; WT_PAGE_INDEX *pindex; WT_REF **refp, *ref; - uint32_t i; + uint32_t hint, i; btree = S2BT(session); dsk = page->dsk; @@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) */ pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; + hint = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp++; ref->home = page; + ref->pindex_hint = hint++; __wt_cell_unpack(cell, unpack); ref->addr = cell; @@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) const WT_PAGE_HEADER *dsk; WT_PAGE_INDEX *pindex; WT_REF *ref, **refp; - uint32_t i; + uint32_t hint, i; bool overflow_keys; btree = S2BT(session); @@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; overflow_keys = false; + hint = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp; ref->home = page; + ref->pindex_hint = hint++; __wt_cell_unpack(cell, unpack); switch 
(unpack->type) { diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 77215474359..c50f97bbe14 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -375,7 +375,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) * Get the address: if there is no address, the page was deleted, but a * subsequent search or insert is forcing re-creation of the name space. */ - WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) { WT_ASSERT(session, previous_state == WT_REF_DELETED); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 756ffd98f3a..b5c299b9ea9 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1807,7 +1807,7 @@ err: if (page != NULL) */ static int __slvg_row_build_internal( - WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) + WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) { WT_ADDR *addr; WT_DECL_RET; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 631aca0d5c0..69c787c9385 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -15,6 +15,22 @@ } while (0) /* + * A note on error handling: main split functions first allocate/initialize new + * structures; failures during that period are handled by discarding the memory + * and returning an error code, the caller knows the split didn't happen and + * proceeds accordingly. Second, split functions update the tree, and a failure + * in that period is catastrophic, any partial update to the tree requires a + * panic, we can't recover. Third, once the split is complete and the tree has + * been fully updated, we have to ignore most errors, the split is complete and + * correct, callers have to proceed accordingly. + */ +typedef enum { + WT_ERR_IGNORE, /* Ignore minor errors */ + WT_ERR_PANIC, /* Panic on all errors */ + WT_ERR_RETURN /* Clean up and return error */ +} WT_SPLIT_ERROR_PHASE; + +/* * __split_oldest_gen -- * Calculate the oldest active split generation. 
*/ @@ -190,6 +206,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) case WT_PAGE_COL_INT: recno = 0; /* Less than any valid record number. */ WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->home == page); + WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; } WT_INTL_FOREACH_END; @@ -202,6 +220,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) first = true; WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->home == page); + __wt_ref_key(page, ref, &next->data, &next->size); if (last->size == 0) { if (first) @@ -328,7 +348,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, /* * If there's no address (the page has never been written), or the * address has been instantiated, there's no work to do. Otherwise, - * get the address from the on-page cell. + * instantiate the address in-memory, from the on-page cell. */ addr = ref->addr; if (addr != NULL && !__wt_off_page(from_home, addr)) { @@ -363,65 +383,101 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_child_block_evict_and_split -- - * Ensure the newly created child isn't evicted or split for now. + * __split_ref_step1 -- + * Prepare a set of WT_REFs for a move. */ static void -__split_child_block_evict_and_split(WT_PAGE *child) +__split_ref_step1( + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { + WT_PAGE *child; + WT_REF *child_ref, *ref; + uint32_t i, j; + + /* The newly created subtree is complete. */ + WT_WRITE_BARRIER(); + /* - * Once the split is live, newly created internal pages might be evicted - * and their WT_REF structures freed. If that happens before all threads - * exit the index of the page which previously "owned" the WT_REF, a - * thread might see a freed WT_REF. 
To ensure that doesn't happen, the - * newly created page's modify structure has a field with a transaction - * ID that's checked before any internal page is evicted. Unfortunately, - * we don't know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but the act - * of updating the original page's index is what allows the eviction to - * happen. - * - * Once the split is live, newly created internal pages might themselves - * split. The split itself is not the problem: if a page splits before - * we fix up its WT_REF (in other words, a WT_REF we move is then moved - * again, before we reset the underlying page's parent reference), it's - * OK because the test we use to find a WT_REF and WT_PAGE that require - * fixing up is only that the WT_REF points to the wrong parent, not it - * points to a specific wrong parent. The problem is our fix up of the - * WT_REFs in the created page could race with the subsequent fix of the - * same WT_REFs (in a different created page), we'd have to acquire some - * lock to prevent that race, and that's going to be difficult at best. - * - * For now, block eviction and splits in newly created pages until they - * have been fixed up. + * Update the moved WT_REFs so threads moving through them start looking + * at the created children's page index information. Because we've not + * yet updated the page index of the parent page into which we are going + * to split this subtree, a cursor moving through these WT_REFs will + * ascend into the created children, but eventually fail as that parent + * page won't yet know about the created children pages. That's OK, we + * spin there until the parent's page index is updated. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; + child = ref->page; + + /* + * Block eviction and splits in newly created pages. 
+ * + * Once the split is live, newly created internal pages might be + * evicted and their WT_REF structures freed. If that happened + * before all threads exit the index of the page that previously + * "owned" the WT_REF, a thread might see a freed WT_REF. To + * ensure that doesn't happen, the newly created page's modify + * structure has a field with a transaction ID that's checked + * before any internal page is evicted. Unfortunately, we don't + * know the correct value until we update the original page's + * index (we need a transaction ID from after that update), but + * the act of updating the original page's index is what allows + * the eviction to happen. + * + * Split blocking was because historic versions of the split + * code didn't update the WT_REF.home field until after the + * split was live, so the WT_REF.home fields being updated could + * split again before the update, there's a race between splits + * as to which would update them first. The current code updates + * the WT_REF.home fields before going live (in this function), + * this shouldn't be an issue, but for now splits remain turned + * off. + */ + F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + + /* + * We use a page flag to prevent the child from splitting from + * underneath us, but the split-generation error checks don't + * know about that flag; use the standard macros to ensure that + * reading the child's page index structure is safe. + */ + j = 0; + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + child_ref->home = child; + child_ref->pindex_hint = j++; + } WT_INTL_FOREACH_END; + WT_LEAVE_PAGE_INDEX(session); + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); +#endif + } } /* - * __split_ref_move_final -- - * Finalize the moved WT_REF structures after the split succeeds. + * __split_ref_step2 -- + * Allow the newly created children to be evicted or split. 
*/ static int -__split_ref_move_final( - WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) +__split_ref_step2( + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; WT_PAGE *child; - WT_REF *ref, *child_ref; + WT_REF *ref; uint32_t i; /* - * The WT_REF structures moved to newly allocated child pages reference - * the wrong parent page and we have to fix that up. The problem is - * revealed when a thread of control searches for the child page's - * reference structure slot, and fails to find it because the parent - * page being searched no longer references the child. When that failure - * happens the thread waits for the reference's home page to be updated, - * which we do here: walk the children and fix them up. + * The split has gone live, enable eviction and splits on the newly + * created internal pages. */ - for (i = 0; i < entries; ++i, ++refp) { - ref = *refp; + WT_WRITE_BARRIER(); + + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; /* * We don't hold hazard pointers on created pages, they cannot @@ -441,42 +497,18 @@ __split_ref_move_final( WT_ERR(ret); child = ref->page; + + /* The child can now be evicted or split. */ + F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, child)); #endif - /* - * We use a page flag to prevent the child from splitting from - * underneath us, but the split-generation error checks don't - * know about that flag; use the standard macros to ensure that - * reading the child's page index structure is safe. - */ - WT_ENTER_PAGE_INDEX(session); - WT_INTL_FOREACH_BEGIN(session, child, child_ref) { - /* - * The page's home reference may not be wrong, as we - * opened up access from the top of the tree already, - * disk pages may have been read in since then, and - * those pages would have correct parent references. 
- */ - if (child_ref->home != child) { - child_ref->home = child; - child_ref->pindex_hint = 0; - } - } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); WT_ERR(__wt_hazard_clear(session, child)); } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); return (0); err: /* Something really bad just happened. */ @@ -496,11 +528,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; WT_REF **child_refp, *ref, **root_refp; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; - bool complete; void *p; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); @@ -511,7 +543,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = false; + complete = WT_ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -589,16 +621,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* Ensure the page isn't evicted or split for now. */ - __split_child_block_evict_and_split(child); - /* * The newly allocated child's page index references the same * structures as the root. (We cannot move WT_REF structures, * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for - * the page that has an page index entry for the WT_REF is about + * the page that has a page index entry for the WT_REF is about * to change. 
*/ child_pindex = WT_INTL_INDEX_GET_SAFE(child); @@ -615,31 +644,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ASSERT(session, root_refp - pindex->index == (ptrdiff_t)pindex->entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + + /* Prepare the WT_REFs for the move. */ + __split_ref_step1(session, alloc_index, false); + /* * Confirm the root page's index hasn't moved, then update it, which - * makes the split visible to threads descending the tree. From this - * point on, we're committed to the split. - * - * A note on error handling: until this point, there's no problem with - * unwinding on error. We allocated a new page index, a new set of - * WT_REFs and a new set of child pages -- if an error occurred, the - * root remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the root page, attention - * needs to be paid. However, subsequent failures are relatively benign, - * the split is OK and complete. For that reason, we ignore errors past - * this point unless there's a panic. + * makes the split visible to threads descending the tree. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); - complete = true; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, root)); #endif - /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index, alloc_index->entries)); + /* Finalize the WT_REFs we moved. */ + WT_ERR(__split_ref_step2(session, alloc_index, false)); + + /* The split is complete and correct, ignore benign errors. */ + complete = WT_ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. 
*/ alloc_index = NULL; @@ -664,24 +690,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_cache_page_inmem_decr(session, root, root_decr); __wt_page_modify_set(session, root); -err: /* - * If complete is true, we saw an error after opening up the tree to - * descent through the root page's new index. There is nothing we - * can do, there are threads potentially active in both versions of - * the tree. - * - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) +err: switch (complete) { + case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); - - if (ret != 0 && ret != WT_PANIC) + break; + case WT_ERR_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during root page split to " - "deepen the tree"); - return (ret == WT_PANIC || !complete ? ret : 0); + "fatal error during root page split to deepen the tree"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during root page split " + "to deepen the tree"); + ret = 0; + } + break; + } + return (ret); } /* @@ -698,19 +725,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; + WT_SPLIT_ERROR_PHASE complete; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; + uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; - bool complete, empty_parent; + bool empty_parent; parent = ref->home; alloc_index = pindex = NULL; parent_decr = 0; parent_entries = 0; - complete = empty_parent = false; + empty_parent = false; + complete = WT_ERR_RETURN; /* The parent page will be marked dirty, make sure that will succeed. 
*/ WT_RET(__wt_page_modify_init(session, parent)); @@ -728,7 +757,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * array anyway. Switch them to the special split state, so that any * reading thread will restart. */ - WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); @@ -768,28 +797,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. + * + * Update the WT_REF's page-index hint as we go. This can race with a + * thread setting the hint based on an older page-index, and the change + * isn't backed out in the case of an error, so there ways for the hint + * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, + hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; + ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) + else if (next_ref->state != WT_REF_SPLIT) { /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; + } } /* Check that we filled in all the entries. 
*/ WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -830,16 +871,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_FULL_BARRIER(); - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; + /* The split is complete and correct, ignore benign errors. */ + complete = WT_ERR_IGNORE; WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 @@ -923,7 +956,8 @@ err: __wt_scr_free(session, &scr); * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) { + switch (complete) { + case WT_ERR_RETURN: for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) @@ -931,20 +965,28 @@ err: __wt_scr_free(session, &scr); } __wt_free_ref_index(session, NULL, alloc_index, false); - /* * The split couldn't proceed because the parent would be empty, * return EBUSY so our caller knows to unlock the WT_REF that's * being deleted, but don't be noisy, there's nothing wrong. 
*/ if (empty_parent) - return (EBUSY); + ret = EBUSY; + break; + case WT_ERR_PANIC: + __wt_err(session, ret, "fatal error during parent page split"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during parent page " + "split"); + ret = 0; + } + break; } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); + return (ret); } /* @@ -960,11 +1002,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; WT_REF **alloc_refp; WT_REF **child_refp, *page_ref, **page_refp, *ref; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; - bool complete; void *p; WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); @@ -977,7 +1019,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = false; + complete = WT_ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1074,9 +1116,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* Ensure the page isn't evicted or split for now. */ - __split_child_block_evict_and_split(child); - /* * The newly allocated child's page index references the same * structures as the parent. 
(We cannot move WT_REF structures, @@ -1100,22 +1139,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, page_refp - pindex->index == (ptrdiff_t)pindex->entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + + /* Prepare the WT_REFs for the move. */ + __split_ref_step1(session, alloc_index, true); + /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, alloc_index->entries, parent_incr, false, false)); - /* - * A note on error handling: until this point, there's no problem with - * unwinding on error. We allocated a new page index, a new set of - * WT_REFs and a new set of child pages -- if an error occurred, the - * page remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the parent page, attention - * needs to be paid. However, subsequent failures are relatively benign, - * the split is OK and complete. For that reason, we ignore errors past - * this point unless there's a panic. - */ - complete = true; - /* Confirm the page's index hasn't moved, then update it. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); @@ -1127,9 +1160,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __split_verify_intl_key_order(session, page)); #endif - /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index + 1, alloc_index->entries - 1)); + /* Finalize the WT_REFs we moved. */ + WT_ERR(__split_ref_step2(session, alloc_index, true)); + + /* The split is complete and correct, ignore benign errors. */ + complete = WT_ERR_IGNORE; + + /* + * Push out the changes: not required for correctness, but no reason + * to wait. 
+ */ + WT_FULL_BARRIER(); /* * We don't care about the page-index we allocated, all we needed was @@ -1158,24 +1199,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_cache_page_inmem_decr(session, page, page_decr); __wt_page_modify_set(session, page); -err: /* - * If complete is true, we saw an error after opening up the tree to - * descent through the page's new index. There is nothing we can do, - * there are threads potentially active in both versions of the tree. - * - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) { +err: switch (complete) { + case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); - } - - if (ret != 0 && ret != WT_PANIC) + break; + case WT_ERR_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during internal page split"); - return (ret == WT_PANIC || !complete ? 
ret : 0); + "fatal error during internal page split"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during internal page " + "split"); + ret = 0; + } + break; + } + return (ret); } /* diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 2f8759b9d82..ef70160aa72 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -35,10 +35,10 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth); - WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey); - WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); + WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey); + WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); /* Everything else is really, really expensive. 
*/ @@ -59,8 +59,8 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_row_leaf, 0); next_walk = NULL; - while ((ret = __wt_tree_walk(session, &next_walk, NULL, 0)) == 0 && - next_walk != NULL) { + while ((ret = __wt_tree_walk( + session, &next_walk, 0)) == 0 && next_walk != NULL) { WT_WITH_PAGE_INDEX(session, ret = __stat_page(session, next_walk->page, stats)); WT_RET(ret); diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 07bb2eb3a01..86607d8f187 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -58,7 +58,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { - WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); + WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; @@ -124,7 +124,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { - WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); + WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index c7d83d8dfff..abb18529041 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -69,16 +69,78 @@ retry: WT_INTL_INDEX_GET(session, ref->home, pindex); } /* - * __wt_tree_walk -- + * __ref_is_leaf -- + * Check if a reference is for a leaf page. + */ +static inline bool +__ref_is_leaf(WT_REF *ref) +{ + size_t addr_size; + u_int type; + const uint8_t *addr; + + /* + * If the page has a disk address, we can crack it to figure out if + * this page is a leaf page or not. If there's no address, the page + * isn't on disk and we don't know the page type. + */ + __wt_ref_info(ref, &addr, &addr_size, &type); + return (addr == NULL ? + false : type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO); +} + +/* + * __page_ascend -- + * Ascend the tree one level. 
+ */ +static void +__page_ascend(WT_SESSION_IMPL *session, + WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp) +{ + WT_REF *parent_ref, *ref; + + /* + * Ref points to the first/last slot on an internal page from which we + * are ascending the tree, moving to the parent page. This is tricky + * because the internal page we're on may be splitting into its parent. + * Find a stable configuration where the page we start from and the + * page we're moving to are connected. The tree eventually stabilizes + * into that configuration, keep trying until we succeed. + */ + for (ref = *refp;;) { + /* + * Find our parent slot on the next higher internal page, the + * slot from which we move to a next/prev slot, checking that + * we haven't reached the root. + */ + parent_ref = ref->home->pg_intl_parent_ref; + if (__wt_ref_is_root(parent_ref)) + break; + __page_refp(session, parent_ref, pindexp, slotp); + + /* + * When internal pages split, the WT_REF structures being moved + * are updated first. If the WT_REF we started with references + * the same page as we found on our search of the parent, there + * is a consistent view. + */ + if (ref->home == parent_ref->page) + break; + } + + *refp = parent_ref; +} + +/* + * __tree_walk_internal -- * Move to the next/previous page in the tree. */ -int -__wt_tree_walk(WT_SESSION_IMPL *session, - WT_REF **refp, uint64_t *walkcntp, uint32_t flags) +static inline int +__tree_walk_internal(WT_SESSION_IMPL *session, + WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; - WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; bool empty_internal, prev, skip; @@ -153,7 +215,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, goto descend; } -ascend: /* + /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. 
*/ @@ -167,13 +229,14 @@ ascend: /* for (;;) { /* - * If we're at the last/first slot on the page, return this page - * in post-order traversal. Otherwise we move to the next/prev - * slot and left/right-most element in its subtree. + * If we're at the last/first slot on the internal page, return + * it in post-order traversal. Otherwise move to the next/prev + * slot and left/right-most element in that subtree. */ - if ((prev && slot == 0) || + while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { - ref = ref->home->pg_intl_parent_ref; + /* Ascend to the parent. */ + __page_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and @@ -185,40 +248,37 @@ ascend: /* empty_internal = false; } - /* Optionally skip internal pages. */ - if (LF_ISSET(WT_READ_SKIP_INTL)) - goto ascend; - /* - * We've ascended the tree and are returning an internal - * page. If it's the root, discard our hazard pointer, - * otherwise, swap our hazard pointer for the page we'll - * return. + * If at the root and returning internal pages, return + * the root page, otherwise we're done. Regardless, no + * hazard pointer is required, release the one we hold. */ - if (__wt_ref_is_root(ref)) + if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); - else { - /* - * Locate the reference to our parent page then - * swap our child hazard pointer for the parent. - * We don't handle restart or not-found returns. - * It would require additional complexity and is - * not a possible return: we're moving to the - * parent of the current child page, our parent - * reference can't have split or been evicted. - */ - __page_refp(session, ref, &pindex, &slot); + if (!LF_ISSET(WT_READ_SKIP_INTL)) + *refp = ref; + goto done; + } + + /* + * Optionally return internal pages. Swap our previous + * hazard pointer for the page we'll return. 
We don't + * handle restart or not-found returns, it would require + * additional complexity and is not a possible return: + * we're moving to the parent of the current child page, + * the parent can't have been evicted. + */ + if (!LF_ISSET(WT_READ_SKIP_INTL)) { if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } + *refp = ref; + goto done; } - - *refp = ref; - goto done; } if (prev) @@ -304,6 +364,31 @@ ascend: /* break; } + /* + * Optionally skip leaf pages: skip all leaf pages if + * WT_READ_SKIP_LEAF is set, when the skip-leaf-count + * variable is non-zero, skip some count of leaf pages. + * If this page is disk-based, crack the cell to figure + * out it's a leaf page without reading it. + * + * If skipping some number of leaf pages, decrement the + * count of pages to zero, and then take the next leaf + * page we can. Be cautious around the page decrement, + * if for some reason don't take this particular page, + * we can take the next one, and, there are additional + * tests/decrements when we're about to return a leaf + * page. + */ + if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) + if (__ref_is_leaf(ref)) { + if (LF_ISSET(WT_READ_SKIP_LEAF)) + break; + if (*skipleafcntp > 0) { + --*skipleafcntp; + break; + } + } + ret = __wt_page_swap(session, couple, ref, flags); /* @@ -359,13 +444,29 @@ ascend: /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ -descend: couple = ref; - page = ref->page; - if (WT_PAGE_IS_INTERNAL(page)) { - WT_INTL_INDEX_GET(session, page, pindex); + if (WT_PAGE_IS_INTERNAL(ref->page)) { +descend: couple = ref; + WT_INTL_INDEX_GET(session, ref->page, pindex); slot = prev ? pindex->entries - 1 : 0; empty_internal = true; } else { + /* + * Optionally skip leaf pages, the second half. 
+ * We didn't have an on-page cell to figure out + * if it was a leaf page, we had to acquire the + * hazard pointer and look at the page. + */ + if (skipleafcntp != NULL || + LF_ISSET(WT_READ_SKIP_LEAF)) { + couple = ref; + if (LF_ISSET(WT_READ_SKIP_LEAF)) + break; + if (*skipleafcntp > 0) { + --*skipleafcntp; + break; + } + } + *refp = ref; goto done; } @@ -376,3 +477,37 @@ done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); } + +/* + * __wt_tree_walk -- + * Move to the next/previous page in the tree. + */ +int +__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) +{ + return (__tree_walk_internal(session, refp, NULL, NULL, flags)); +} + +/* + * __wt_tree_walk_count -- + * Move to the next/previous page in the tree, tracking how many + * references were visited to get there. + */ +int +__wt_tree_walk_count(WT_SESSION_IMPL *session, + WT_REF **refp, uint64_t *walkcntp, uint32_t flags) +{ + return (__tree_walk_internal(session, refp, walkcntp, NULL, flags)); +} + +/* + * __wt_tree_walk_skip -- + * Move to the next/previous page in the tree, skipping a certain number + * of leaf pages before returning. + */ +int +__wt_tree_walk_skip(WT_SESSION_IMPL *session, + WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) +{ + return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags)); +} diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index e9fa570f97b..c5e2abbe440 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -9,12 +9,60 @@ #include "wt_internal.h" /* + * __check_leaf_key_range -- + * Check the search key is in the leaf page's key range. + */ +static inline int +__check_leaf_key_range(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) +{ + WT_PAGE_INDEX *pindex; + uint32_t indx; + + /* + * There are reasons we can't do the fast checks, and we continue with + * the leaf page search in those cases, only skipping the complete leaf + * page search if we know it's not going to work. 
+ */ + cbt->compare = 0; + + /* + * Check if the search key is smaller than the parent's starting key for + * this page. + */ + if (recno < leaf->key.recno) { + cbt->compare = 1; /* page keys > search key */ + return (0); + } + + /* + * Check if the search key is greater than or equal to the starting key + * for the parent's next page. + * + * !!! + * Check that "indx + 1" is a valid page-index entry first, because it + * also checks that "indx" is a valid page-index entry, and we have to + * do that latter check before looking at the indx slot of the array + * for a match to leaf (in other words, our page hint might be wrong). + */ + WT_INTL_INDEX_GET(session, leaf->home, pindex); + indx = leaf->pindex_hint; + if (indx + 1 < pindex->entries && pindex->index[indx] == leaf) + if (recno >= pindex->index[indx + 1]->key.recno) { + cbt->compare = -1; /* page keys < search key */ + return (0); + } + + return (0); +} + +/* * __wt_col_search -- * Search a column-store tree for a specific record-based key. */ int __wt_col_search(WT_SESSION_IMPL *session, - uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) + uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_COL *cip; @@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; + uint64_t recno; uint32_t base, indx, limit; int depth; @@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session, __cursor_pos_clear(cbt); - /* We may only be searching a single leaf page, not the full tree. */ + /* + * When appending a new record, the search record number will be an + * out-of-band value, search for the largest key in the table instead. + */ + if ((recno = search_recno) == WT_RECNO_OOB) + recno = UINT64_MAX; + + /* + * We may be searching only a single leaf page, not the full tree. 
In + * the normal case where the page links to a parent, check the page's + * parent keys before doing the full search, it's faster when the + * cursor is being re-positioned. (One case where the page doesn't + * have a parent is if it is being re-instantiated in memory as part + * of a split). + */ if (leaf != NULL) { + WT_ASSERT(session, search_recno != WT_RECNO_OOB); + + if (leaf->home != NULL) { + WT_RET(__check_leaf_key_range( + session, recno, leaf, cbt)); + if (cbt->compare != 0) { + /* + * !!! + * WT_CURSOR.search_near uses the slot value to + * decide if there was an on-page match. + */ + cbt->slot = 0; + return (0); + } + } + current = leaf; goto leaf_only; } @@ -120,7 +199,17 @@ leaf_only: page = current->page; cbt->ref = current; cbt->recno = recno; - cbt->compare = 0; + + /* + * Don't bother searching if the caller is appending a new record where + * we'll allocate the record number; we're not going to find a match by + * definition, and we figure out the record number and position when we + * do the work. + */ + if (search_recno == WT_RECNO_OOB) { + cbt->compare = -1; + return (0); + } /* * Set the on-page slot to an impossible value larger than any possible @@ -142,6 +231,7 @@ leaf_only: * that's impossibly large for the page. We do have additional setup to * do in that case, the record may be appended to the page. */ + cbt->compare = 0; if (page->type == WT_PAGE_COL_FIX) { if (recno < page->pg_fix_recno) { cbt->compare = 1; @@ -190,18 +280,10 @@ past_end: * This is a rarely used path: we normally find exact matches, because * column-store files are dense, but in this case the caller searched * past the end of the table. - * - * Don't bother searching if the caller is appending a new record where - * we'll allocate the record number; we're not going to find a match by - * definition, and we figure out the position when we do the work. 
*/ cbt->ins_head = WT_COL_APPEND(page); - if (recno == UINT64_MAX) - cbt->ins = NULL; - else - cbt->ins = __col_insert_search( - cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno); - if (cbt->ins == NULL) + if ((cbt->ins = __col_insert_search( + cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL) cbt->compare = -1; else { cbt->recno = WT_INSERT_RECNO(cbt->ins); @@ -212,14 +294,5 @@ past_end: else cbt->compare = -1; } - - /* - * Note if the record is past the maximum record in the tree, the cursor - * search functions need to know for fixed-length column-stores because - * appended records implicitly create any skipped records, and cursor - * search functions have to handle that case. - */ - if (cbt->compare == -1) - F_SET(cbt, WT_CBT_MAX_RECORD); return (0); } diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index d2d8a4640ca..e98d30152ab 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -132,6 +132,76 @@ __wt_search_insert( } /* + * __check_leaf_key_range -- + * Check the search key is in the leaf page's key range. + */ +static inline int +__check_leaf_key_range(WT_SESSION_IMPL *session, + WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_COLLATOR *collator; + WT_ITEM *item; + WT_PAGE_INDEX *pindex; + uint32_t indx; + int cmp; + + btree = S2BT(session); + collator = btree->collator; + item = cbt->tmp; + + /* + * There are reasons we can't do the fast checks, and we continue with + * the leaf page search in those cases, only skipping the complete leaf + * page search if we know it's not going to work. + */ + cbt->compare = 0; + + /* + * First, confirm we have the right parent page-index slot, and quit if + * we don't. We don't search for the correct slot, that would make this + * cheap test expensive. 
+ */ + WT_INTL_INDEX_GET(session, leaf->home, pindex); + indx = leaf->pindex_hint; + if (indx >= pindex->entries || pindex->index[indx] != leaf) + return (0); + + /* + * Check if the search key is smaller than the parent's starting key for + * this page. + * + * We can't compare against slot 0 on a row-store internal page because + * reconciliation doesn't build it, it may not be a valid key. + */ + if (indx != 0) { + __wt_ref_key(leaf->home, leaf, &item->data, &item->size); + WT_RET(__wt_compare(session, collator, srch_key, item, &cmp)); + if (cmp < 0) { + cbt->compare = 1; /* page keys > search key */ + return (0); + } + } + + /* + * Check if the search key is greater than or equal to the starting key + * for the parent's next page. + */ + ++indx; + if (indx < pindex->entries) { + __wt_ref_key( + leaf->home, pindex->index[indx], &item->data, &item->size); + WT_RET(__wt_compare(session, collator, srch_key, item, &cmp)); + if (cmp >= 0) { + cbt->compare = -1; /* page keys < search key */ + return (0); + } + } + + return (0); +} + +/* * __wt_row_search -- * Search a row-store tree for a specific key. */ @@ -179,8 +249,29 @@ __wt_row_search(WT_SESSION_IMPL *session, append_check = insert && cbt->append_tree; descend_right = true; - /* We may only be searching a single leaf page, not the full tree. */ + /* + * We may be searching only a single leaf page, not the full tree. In + * the normal case where the page links to a parent, check the page's + * parent keys before doing the full search, it's faster when the + * cursor is being re-positioned. (One case where the page doesn't + * have a parent is if it is being re-instantiated in memory as part + * of a split). + */ if (leaf != NULL) { + if (leaf->home != NULL) { + WT_RET(__check_leaf_key_range( + session, srch_key, leaf, cbt)); + if (cbt->compare != 0) { + /* + * !!! + * WT_CURSOR.search_near uses the slot value to + * decide if there was an on-page match. 
+ */ + cbt->slot = 0; + return (0); + } + } + current = leaf; goto leaf_only; } @@ -196,15 +287,6 @@ restart_page: page = current->page; WT_INTL_INDEX_GET(session, page, pindex); - /* - * Fast-path internal pages with one child, a common case for - * the root page in new trees. - */ - if (pindex->entries == 1) { - descent = pindex->index[0]; - goto descend; - } - /* Fast-path appends. */ if (append_check) { descent = pindex->index[pindex->entries - 1]; @@ -536,19 +618,163 @@ err: /* } /* - * __wt_row_random -- - * Return a random key from a row-store tree. + * __wt_row_random_leaf -- + * Return a random key from a row-store leaf page. */ int -__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; + WT_PAGE *page; + uint32_t choice, entries, i; + int level; + + page = cbt->ref->page; + + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + /* If the page has disk-based entries, select from them. */ + if (page->pg_row_entries != 0) { + cbt->compare = 0; + cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; + + /* + * The real row-store search function builds the key, so we + * have to as well. + */ + return (__wt_row_leaf_key(session, + page, page->pg_row_d + cbt->slot, cbt->tmp, false)); + } + + /* + * If the tree is new (and not empty), it might have a large insert + * list. + */ + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + + /* + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. 
+ */ + for (ins_head = cbt->ins_head, + level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) + break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. + */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + while (level > 0) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. + * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. 
+ */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. + */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. */ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. + */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; + cbt->compare = 0; + + return (0); +} + +/* + * __wt_row_random_descent -- + * Find a random leaf page in a row-store tree. + */ +int +__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_DECL_RET; - WT_INSERT *p, *t; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *current, *descent; - uint32_t cnt; btree = S2BT(session); @@ -585,43 +811,6 @@ restart_root: return (ret); } - if (page->pg_row_entries != 0) { - cbt->ref = current; - cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; - - /* - * The real row-store search function builds the key, so we - * have to as well. 
- */ - return (__wt_row_leaf_key(session, - page, page->pg_row_d + cbt->slot, cbt->tmp, false)); - } - - /* - * If the tree is new (and not empty), it might have a large insert - * list. Count how many records are in the list. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - WT_ERR(WT_NOTFOUND); - for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt) - if ((p = WT_SKIP_NEXT(p)) == NULL) - break; - - /* - * Select a random number from 0 to (N - 1), return that record. - */ - cnt = __wt_random(&session->rnd) % cnt; - for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p) - if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL) - break; cbt->ref = current; - cbt->compare = 0; - cbt->ins = t; - return (0); - -err: WT_TRET(__wt_page_release(session, current, 0)); - return (ret); } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index d3a0265c13a..e943f01236e 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -18,6 +18,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS **cstats; WT_DSRC_STATS **dstats; + int64_t v; conn = S2C(session); @@ -37,10 +38,10 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) dstats = ((WT_CURSOR_BTREE *) conn->las_session->las_cursor)->btree->dhandle->stats; - WT_STAT_SET(session, cstats, - cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); - WT_STAT_SET(session, cstats, - cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); + v = WT_STAT_READ(dstats, cursor_insert); + WT_STAT_SET(session, cstats, cache_lookaside_insert, v); + v = WT_STAT_READ(dstats, cursor_remove); + WT_STAT_SET(session, cstats, cache_lookaside_remove, v); } /* diff --git a/src/config/config_def.c b/src/config/config_def.c index d79ce6853e6..9d12e953498 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -323,6 +323,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = { NULL, 
"choices=[\"hex\",\"json\",\"print\"]", NULL, 0 }, { "next_random", "boolean", NULL, NULL, NULL, 0 }, + { "next_random_sample_size", "string", NULL, NULL, NULL, 0 }, { "overwrite", "boolean", NULL, NULL, NULL, 0 }, { "raw", "boolean", NULL, NULL, NULL, 0 }, { "readonly", "boolean", NULL, NULL, NULL, 0 }, @@ -920,9 +921,10 @@ static const WT_CONFIG_ENTRY config_entries[] = { NULL, 0 }, { "WT_SESSION.open_cursor", - "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0" - ",readonly=0,skip_sort_check=0,statistics=,target=", - confchk_WT_SESSION_open_cursor, 11 + "append=0,bulk=0,checkpoint=,dump=,next_random=0," + "next_random_sample_size=0,overwrite=,raw=0,readonly=0," + "skip_sort_check=0,statistics=,target=", + confchk_WT_SESSION_open_cursor, 12 }, { "WT_SESSION.reconfigure", "isolation=read-committed", diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index bd14e1bf4fd..ee9935828e2 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -2003,6 +2003,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_sweep_config(session, cfg)); WT_ERR(__wt_verbose_config(session, cfg)); + /* Initialize the OS page size for mmap */ + conn->page_size = __wt_get_vm_pagesize(); + /* Now that we know if verbose is configured, output the version. 
*/ WT_ERR(__wt_verbose( session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING)); diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index c6d5b535b86..0821238fbd7 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -147,12 +147,14 @@ __conn_dhandle_mark_dead(WT_SESSION_IMPL *session) int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) { + WT_BM *bm; WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; bool marked_dead, no_schema_lock; btree = S2BT(session); + bm = btree->bm; dhandle = session->dhandle; marked_dead = false; @@ -191,7 +193,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) */ if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { - if (force && (btree->bm == NULL || btree->bm->map == NULL)) { + if (force && (bm == NULL || !bm->is_mapped(bm, session))) { WT_ERR(__conn_dhandle_mark_dead(session)); marked_dead = true; } diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 63f77248ca8..b955b292292 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -455,14 +455,24 @@ __wt_curfile_create(WT_SESSION_IMPL *session, } /* - * random_retrieval - * Random retrieval cursors only support next, reset and close. + * Random retrieval, row-store only. + * Random retrieval cursors support a limited set of methods. */ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval)); if (cval.val != 0) { + if (WT_CURSOR_RECNO(cursor)) + WT_ERR_MSG(session, ENOTSUP, + "next_random configuration not supported for " + "column-store objects"); + __wt_cursor_set_notsup(cursor); cursor->next = __curfile_next_random; cursor->reset = __curfile_reset; + + WT_ERR(__wt_config_gets_def( + session, cfg, "next_random_sample_size", 0, &cval)); + if (cval.val != 0) + cbt->next_random_sample_size = (u_int)cval.val; } /* Underlying btree initialization. 
*/ diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index 8f858a5012f..3270be07de4 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -313,7 +313,6 @@ size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode) { char abbrev; - u_char h; if (!force_unicode) { if (isprint(ch) && ch != '\\' && ch != '"') { @@ -354,16 +353,8 @@ __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode) *buf++ = 'u'; *buf++ = '0'; *buf++ = '0'; - h = (((u_char)ch) >> 4) & 0xF; - if (h >= 10) - *buf++ = 'A' + (h - 10); - else - *buf++ = '0' + h; - h = ((u_char)ch) & 0xF; - if (h >= 10) - *buf++ = 'A' + (h - 10); - else - *buf++ = '0' + h; + *buf++ = __wt_hex[(ch & 0xf0) >> 4]; + *buf++ = __wt_hex[ch & 0x0f]; } return (6); } diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index e1d5b8eb91a..652dec364fb 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -384,6 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, { WT_DATA_HANDLE *dhandle; WT_DECL_RET; + wt_off_t size; const char *filename; /* @@ -395,8 +396,8 @@ __curstat_file_init(WT_SESSION_IMPL *session, if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); - WT_RET(__wt_block_manager_size( - session, filename, &cst->u.dsrc_stats)); + WT_RET(__wt_block_manager_named_size(session, filename, &size)); + cst->u.dsrc_stats.block_size = size; __wt_curstat_dsrc_final(cst); return (0); } @@ -662,7 +663,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, /* * We return the statistics field's offset as the key, and a string - * description, a string value, and a uint64_t value as the value + * description, a string value, and a uint64_t value as the value * columns. 
*/ cursor->key_format = "i"; diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index f92426355ef..da38988b6c2 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -40,11 +40,11 @@ void __wt_cursor_set_notsup(WT_CURSOR *cursor) { /* - * Set all of the cursor methods (except for close and reset), to fail. - * Close is unchanged so the cursor can be discarded, reset defaults to + * Set cursor methods other than close, reconfigure and reset, to fail. + * Close is unchanged so the cursor can be discarded; reset is set to * a no-op because session transactional operations reset all of the - * cursors in a session, and random cursors shouldn't block transactions - * or checkpoints. + * cursors in a session. Reconfigure is left open in case it's possible + * in the future to change these configurations. */ cursor->compare = (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index dca72a16ee5..e746ccd5871 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -968,8 +968,11 @@ __wt_curtable_open(WT_SESSION_IMPL *session, WT_ERR(__wt_strdup(session, tmp->data, &ctable->cfg[1])); if (0) { -err: WT_TRET(__curtable_close(cursor)); - *cursorp = NULL; +err: if (*cursorp != NULL) { + WT_TRET(__wt_cursor_close(*cursorp)); + *cursorp = NULL; + } + WT_TRET(__curtable_close(cursor)); } __wt_scr_free(session, &tmp); diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index 745c5051be3..e2b376d5e3f 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -32,7 +32,7 @@ on success and non-zero on error. The \c wt tool supports several commands. If configured in the underlying database, some commands will run recovery when opening the database. If -the user wants to force recovery on any command, use the \c -r option. +the user wants to force recovery on any command, use the \c -R option. 
In general, commands that modify the database or tables will run recovery by default and commands that only read data will not run recovery. @@ -46,7 +46,7 @@ opened as a WiredTiger database. See @ref backup for more information, and @ref file_permissions for specifics on the copied file permissions. @subsection util_backup_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code> @subsection util_backup_options Options The following are command-specific options for the \c backup command: @@ -64,7 +64,7 @@ The \c compact command attempts to rewrite the specified table or file to consume less disk space. @subsection util_compact_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] compact uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code> @subsection util_compact_options Options The \c compact command has no command-specific options. @@ -78,7 +78,7 @@ configuration. It is equivalent to a call to WT_SESSION::create with the specified string arguments. @subsection util_create_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code> @subsection util_create_options Options The following are command-specific options for the \c create command: @@ -94,7 +94,7 @@ The \c drop command drops the specified \c uri. It is equivalent to a call to WT_SESSION::drop with the "force" configuration argument. @subsection util_drop_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] drop uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] drop uri</code> @subsection util_drop_options Options The \c drop command has no command-specific options. 
@@ -109,7 +109,7 @@ which can be re-loaded into a new table using the \c load command. See @subpage dump_formats for details of the dump file formats. @subsection util_dump_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code> @subsection util_dump_options Options The following are command-specific options for the \c dump command: @@ -143,7 +143,7 @@ the database. If a URI is specified as an argument, only information about that data source is printed. @subsection util_list_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code> @subsection util_list_options Options The following are command-specific options for the \c list command: @@ -170,7 +170,7 @@ table will be overwritten by the new data (use the \c -n option to make an attempt to overwrite existing data return an error). @subsection util_load_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code> @subsection util_load_options Options The following are command-specific options for the \c load command: @@ -244,7 +244,7 @@ row-store table or file already exists, data in the table or file will be overwritten by the new data. @subsection util_loadtext_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code> @subsection util_loadtext_options Options The following are command-specific options for the \c loadtext command: @@ -260,7 +260,7 @@ Display the database log. 
The \c printlog command outputs the database log. @subsection util_printlog_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] printlog [-p] [-f output]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] printlog [-x] [-f output]</code> @subsection util_printlog_options Options The following are command-specific options for the \c printlog command: @@ -269,8 +269,9 @@ The following are command-specific options for the \c printlog command: By default, the \c printlog command output is written to the standard output; the \c -f option re-directs the output to the specified file. -@par <code>-p</code> -Display the log in a printable format. +@par <code>-x</code> +Keys and value items in the log are printed in hex format in addition +to the default string format. <hr> @section util_read wt read @@ -283,7 +284,7 @@ with string or record number keys and string values. The \c read command exits non-zero if a specified record is not found. @subsection util_read_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code> @subsection util_read_options Options The \c read command has no command-specific options. @@ -295,7 +296,7 @@ Rename a table or file. The \c rename command renames the specified table or file. @subsection util_rename_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code> @subsection util_rename_options Options The \c rename command has no command-specific options. @@ -309,7 +310,7 @@ data that cannot be recovered. Underlying files are re-written in place, overwriting the original file contents. 
@subsection util_salvage_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code> @subsection util_salvage_options Options The following are command-specific options for the \c salvage command: @@ -327,7 +328,7 @@ The \c stat command outputs run-time statistics for the WiredTiger engine, or, if specified, for the URI on the command-line. @subsection util_stat_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code> @subsection util_stat_options Options The following are command-specific options for the \c stat command: @@ -345,7 +346,7 @@ success if the data source is up-to-date, and failure if the data source cannot be upgraded. @subsection util_upgrade_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code> @subsection util_upgrade_options Options The \c upgrade command has no command-specific options. @@ -359,7 +360,7 @@ success if the data source is correct, and failure if the data source is corrupted. @subsection util_verify_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> @subsection util_verify_options Options The \c verify command has no command-specific options. @@ -381,9 +382,9 @@ Attempting to overwrite an already existing record will fail. @subsection util_write_synopsis Synopsis <code> -wt [-rVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ... +wt [-RVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ... <br> -wt [-rVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ... 
+wt [-RVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ...
</code>

@subsection util_write_options Options
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index 446981e3192..a0a3212be6d 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -2,6 +2,27 @@
 The \c next_random configuration to the WT_SESSION::open_cursor method
 configures the cursor to return a pseudo-random record from a row-store
-object (the configuration is not supported on other types of objects).
+object (the \c next_random configuration is not supported on other types
+of objects).
+
+Applications should use the WT_CURSOR::next method to retrieve records
+from the object, most other cursor methods are not supported. For
+example, it's not possible to update using a cursor configured for
+random retrieval.
+
+By default, each returned record is pseudo-randomly selected from the
+underlying object as a whole. That can lead to skewed results when the
+underlying tree structure is unbalanced or records are not uniformly
+distributed. In such cases, the \c next_random_sample_size configuration
+can also be specified. Setting \c next_random_sample_size configures the
+number of samples the application expects to take using the cursor. A
+cursor configured using \c next_random_sample_size divides the object
+into \c next_random_sample_size pieces, and each subsequent retrieval
+returns a record from the next one of those pieces.
+
+For example, setting \c next_random_sample_size to \c 10 would cause
+the cursor to sequentially return records from each tenth part of the
+object. Setting \c next_random_sample_size to \c 1000 would cause the
+cursor to sequentially return records from each .1% of the object. 
*/ diff --git a/src/docs/license.dox b/src/docs/license.dox index f34ebad19a7..febced2c6af 100644 --- a/src/docs/license.dox +++ b/src/docs/license.dox @@ -13,6 +13,19 @@ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the <b>GNU General Public License</b></a> for details. +Additionally, portions of the WiredTiger distribution are distributed +under the terms of the +<a href="http://www.opensource.org/licenses/BSD-3-Clause"> +BSD-3-Clause License</a>. These files have +<a href="http://www.opensource.org/licenses/BSD-3-Clause"> +BSD-3-Clause License</a> +copyright notices, and may be freely used and redistributed under the +terms of that notice. + +Additionally, portions of the WiredTiger distribution are public domain +software. Public domain files have notices releasing the software into +the public domain and may be freely used and redistributed. + For a license to use the WiredTiger software under conditions other than those described above, or for technical support for this software, please contact MongoDB, Inc. at @@ -28,7 +41,7 @@ of the WiredTiger library should comply with these copyrights. 
@hrow{Distribution Files, Copyright Holder, License} @row{\c src/include/bitstring.i, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} @row{\c src/include/queue.h, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} -@row{\c src/os_posix/getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} +@row{\c src/os_posix/os_getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} @row{\c src/support/hash_city.c, Google\, Inc., <a href="http://www.opensource.org/licenses/MIT">The MIT License</a>} @row{\c src/support/hash_fnv.c, Authors, Public Domain} </table> @@ -63,10 +76,4 @@ selected portions of the WiredTiger sources, please review the copyright notices and LICENSE files included in the WiredTiger distribution for the terms and conditions of such redistribution. -@section license_public_domain Public domain software - -Many portions of the WiredTiger distribution are public domain software. -Public domain files have notices releasing the software into the public -domain and may be freely used and redistributed. 
- */ diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index f3bdd64cfda..339bf740265 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -206,6 +206,8 @@ if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value +@par read_range (unsigned int, default=0) +scan a range of keys after each search @par reopen_connection (boolean, default=true) close and reopen the connection between populate and workload phases @par report_interval (unsigned int, default=2) diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 2b2117ad9fd..c5f6ae3d4d1 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -31,8 +31,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* Walk the tree, discarding pages. */ next_ref = NULL; - WT_ERR(__wt_tree_walk(session, &next_ref, NULL, - WT_READ_CACHE | WT_READ_NO_EVICT)); + WT_ERR(__wt_tree_walk( + session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); while ((ref = next_ref) != NULL) { page = ref->page; @@ -68,8 +68,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * the reconciliation, the next walk call could miss a page in * the tree. 
*/ - WT_ERR(__wt_tree_walk(session, &next_ref, NULL, - WT_READ_CACHE | WT_READ_NO_EVICT)); + WT_ERR(__wt_tree_walk(session, + &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { case WT_SYNC_CLOSE: diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index a8979fa6231..0e2b33c35ec 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1229,7 +1229,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) */ for (evict = start, pages_walked = 0; evict < end && !enough && (ret == 0 || ret == WT_NOTFOUND); - ret = __wt_tree_walk( + ret = __wt_tree_walk_count( session, &btree->evict_ref, &pages_walked, walk_flags)) { enough = pages_walked > cache->evict_max_refs_per_file; if ((ref = btree->evict_ref) == NULL) { @@ -1336,8 +1336,9 @@ fast: /* If the page can't be evicted, give up. */ if (__wt_ref_is_root(ref)) WT_RET(__evict_clear_walk(session)); else if (ref->page->read_gen == WT_READGEN_OLDEST) - WT_RET_NOTFOUND_OK(__wt_tree_walk(session, - &btree->evict_ref, &pages_walked, walk_flags)); + WT_RET_NOTFOUND_OK(__wt_tree_walk_count( + session, &btree->evict_ref, + &pages_walked, walk_flags)); } WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked); @@ -1617,7 +1618,7 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) next_walk = NULL; session->dhandle = dhandle; - while (__wt_tree_walk(session, &next_walk, NULL, + while (__wt_tree_walk(session, &next_walk, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && next_walk != NULL) { page = next_walk->page; diff --git a/src/include/block.h b/src/include/block.h index 4bff6c82783..804eec24874 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -173,6 +173,7 @@ struct __wt_bm { int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *); int (*compact_start)(WT_BM *, WT_SESSION_IMPL *); int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *); int (*preload)(WT_BM *, WT_SESSION_IMPL *, const 
uint8_t *, size_t); int (*read) (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); @@ -182,6 +183,7 @@ struct __wt_bm { int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *); int (*salvage_valid) (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool); + int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *); int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats); int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool); int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); @@ -244,7 +246,10 @@ struct __wt_block { bool ckpt_inprogress;/* Live checkpoint in progress */ /* Compaction support */ - int compact_pct_tenths; /* Percent to compact */ + int compact_pct_tenths; /* Percent to compact */ + uint64_t compact_pages_reviewed;/* Pages reviewed */ + uint64_t compact_pages_skipped; /* Pages skipped */ + uint64_t compact_pages_written; /* Pages rewritten */ /* Salvage support */ wt_off_t slvg_off; /* Salvage file offset */ diff --git a/src/include/btmem.h b/src/include/btmem.h index 6ee74c61a38..12a736c56a2 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -478,7 +478,7 @@ struct __wt_page { #define pg_row_ins u.row.ins #undef pg_row_upd #define pg_row_upd u.row.upd -#define pg_row_entries u.row.entries +#undef pg_row_entries #define pg_row_entries u.row.entries /* Fixed-length column-store leaf page. */ @@ -1049,7 +1049,7 @@ struct __wt_insert_head { uint64_t __prev_split_gen = (session)->split_gen; \ if (__prev_split_gen == 0) \ do { \ - WT_PUBLISH((session)->split_gen, \ + WT_PUBLISH((session)->split_gen, \ S2C(session)->split_gen); \ } while ((session)->split_gen != S2C(session)->split_gen) diff --git a/src/include/btree.i b/src/include/btree.i index 3e2e7158e04..23e0dfea2cd 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -948,9 +948,8 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) * __wt_ref_info -- * Return the addr/size and type triplet for a reference. 
*/ -static inline int -__wt_ref_info(WT_SESSION_IMPL *session, - WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep) +static inline void +__wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep) { WT_ADDR *addr; WT_CELL_UNPACK *unpack, _unpack; @@ -984,7 +983,9 @@ __wt_ref_info(WT_SESSION_IMPL *session, case WT_ADDR_LEAF_NO: *typep = WT_CELL_ADDR_LEAF_NO; break; - WT_ILLEGAL_VALUE(session); + default: + *typep = 0; + break; } } else { __wt_cell_unpack((WT_CELL *)addr, unpack); @@ -993,7 +994,6 @@ __wt_ref_info(WT_SESSION_IMPL *session, if (typep != NULL) *typep = unpack->type; } - return (0); } /* @@ -1009,7 +1009,7 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) if (ref->addr == NULL) return (0); - WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + __wt_ref_info(ref, &addr, &addr_size, NULL); WT_RET(__wt_btree_block_free(session, addr, addr_size)); /* Clear the address (so we don't free it twice). */ diff --git a/src/include/column.i b/src/include/column.i index fc1f372b2a9..9388e07d0d8 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -176,6 +176,16 @@ __col_insert_search(WT_INSERT_HEAD *inshead, continue; } + /* + * When no exact match is found, the search returns the smallest + * key larger than the searched-for key, or the largest key + * smaller than the searched-for key, if there is no larger key. + * Our callers depend on that: specifically, the fixed-length + * column store cursor code interprets returning a key smaller + * than the searched-for key to mean the searched-for key is + * larger than any key on the page. Don't change that behavior, + * things will break. + */ ins_recno = WT_INSERT_RECNO(ret_ins); cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? 
-1 : 1; @@ -282,7 +292,17 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop) start_recno = repeat->recno + repeat->rle; } - if (recno >= start_recno + (page->pg_var_entries - start_indx)) + /* + * !!! + * The test could be written more simply as: + * + * (recno >= start_recno + (page->pg_var_entries - start_indx)) + * + * It's split into two parts because the simpler test will overflow if + * searching for large record numbers. + */ + if (recno >= start_recno && + recno - start_recno >= page->pg_var_entries - start_indx) return (NULL); return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno)); diff --git a/src/include/connection.h b/src/include/connection.h index 2367f5a0035..1c1cb9b8987 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -415,6 +415,7 @@ struct __wt_connection_impl { uint32_t direct_io; uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */ bool mmap; /* mmap configuration */ + int page_size; /* OS page size for mmap alignment */ uint32_t verbose; uint32_t flags; diff --git a/src/include/cursor.h b/src/include/cursor.h index 43bbfcf5b05..4f232ce4fd0 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -104,6 +104,14 @@ struct __wt_cursor_btree { uint64_t recno; /* Record number */ /* + * Next-random cursors can optionally be configured to step through a + * percentage of the total leaf pages to their next value. Note the + * configured value and the calculated number of leaf pages to skip. + */ + uint64_t next_random_leaf_skip; + u_int next_random_sample_size; + + /* * The search function sets compare to: * < 1 if the found key is less than the specified key * 0 if the found key matches the specified key @@ -192,18 +200,23 @@ struct __wt_cursor_btree { uint8_t append_tree; /* Cursor appended to the tree */ +#ifdef HAVE_DIAGNOSTIC + /* Check that cursor next/prev never returns keys out-of-order. 
*/ + WT_ITEM *lastkey, _lastkey; + uint64_t lastrecno; +#endif + #define WT_CBT_ACTIVE 0x01 /* Active in the tree */ #define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */ #define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */ #define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */ -#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */ -#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor +#define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor (e.g. on a checkpoint) */ -#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */ +#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST) + WT_CBT_SEARCH_SMALLEST) uint8_t flags; }; diff --git a/src/include/extern.h b/src/include/extern.h index bd32e067a58..7338f8dae3b 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -51,7 +51,8 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); -extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats); +extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep); +extern int __wt_block_manager_named_size( WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep); extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, 
WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset); @@ -91,6 +92,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config); extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); +extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt); @@ -167,9 +169,11 @@ extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok); extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf); -extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags); +extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags); +extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags); +extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove); -extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); +extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); extern int 
__wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate); @@ -184,7 +188,8 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert); -extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); extern int __wt_las_create(WT_SESSION_IMPL *session); extern int __wt_las_destroy(WT_SESSION_IMPL *session); @@ -360,23 +365,23 @@ extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep); extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value); extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep); -extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno); extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop); -extern int 
__wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop); extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp); -extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value); extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep); -extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key); extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp); -extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode); extern int 
__wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep); -extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); -extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); +extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced); extern int __wt_log_slot_new(WT_SESSION_IMPL *session); @@ -466,7 +471,7 @@ extern int __wt_meta_track_init(WT_SESSION_IMPL *session); extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session); extern int __wt_turtle_init(WT_SESSION_IMPL *session); extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep); -extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value); +extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value); extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); @@ -512,6 +517,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp); extern int __wt_once(void (*init_routine)(void)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, bool ok_create, bool exclusive, int dio_type, WT_FH **fhp); extern int __wt_close(WT_SESSION_IMPL *session, 
WT_FH **fhp); +extern int __wt_get_vm_pagesize(void); extern bool __wt_absolute_path(const char *path); extern const char *__wt_path_separator(void); extern bool __wt_has_priv(void); @@ -653,6 +659,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp ); extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page); extern void __wt_hazard_close(WT_SESSION_IMPL *session); +extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp); extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern int __wt_hex2byte(const u_char *from, u_char *to); @@ -670,6 +677,7 @@ extern uint32_t __wt_log2_int(uint32_t n); extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); +extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); @@ -731,7 +739,7 @@ extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp); extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); -extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); +extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags); extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval); diff --git a/src/include/flags.h b/src/include/flags.h index 064349125cc..bafff92fbc0 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -45,8 +45,9 @@ #define WT_READ_NO_WAIT 0x00000020 #define WT_READ_PREV 0x00000040 #define WT_READ_SKIP_INTL 0x00000080 -#define WT_READ_TRUNCATE 0x00000100 -#define WT_READ_WONT_NEED 0x00000200 +#define WT_READ_SKIP_LEAF 0x00000100 +#define WT_READ_TRUNCATE 0x00000200 +#define WT_READ_WONT_NEED 0x00000400 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_INTERNAL 0x00000004 diff --git a/src/include/gcc.h b/src/include/gcc.h index 01e33792d73..bb80f8b738b 100644 --- a/src/include/gcc.h +++ b/src/include/gcc.h @@ -156,8 +156,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) #if defined(x86_64) || defined(__x86_64__) /* Pause instruction to prevent excess processor bus usage */ -#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") - +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") #define WT_FULL_BARRIER() do { \ __asm__ volatile ("mfence" ::: "memory"); \ } while (0) @@ -169,7 +168,7 @@ __wt_atomic_cas_ptr(void *vp, void 
*old, void *new) } while (0) #elif defined(i386) || defined(__i386__) -#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") #define WT_FULL_BARRIER() do { \ __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \ } while (0) @@ -177,23 +176,58 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) #define WT_WRITE_BARRIER() WT_FULL_BARRIER() #elif defined(__PPC64__) || defined(PPC64) +/* ori 0,0,0 is the PPC64 noop instruction */ #define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory") -#define WT_FULL_BARRIER() do { +#define WT_FULL_BARRIER() do { \ __asm__ volatile ("sync" ::: "memory"); \ } while (0) -#define WT_READ_BARRIER() WT_FULL_BARRIER() -#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +/* TODO: ISA 2.07 Elemental Memory Barriers would be better, + specifically mbll, and mbss, but they are not supported by POWER 8 */ +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("lwsync" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("lwsync" ::: "memory"); \ +} while (0) #elif defined(__aarch64__) #define WT_PAUSE() __asm__ volatile("yield" ::: "memory") #define WT_FULL_BARRIER() do { \ - __asm__ volatile ("dsb sy" ::: "memory"); \ + __asm__ volatile ("dsb sy" ::: "memory"); \ +} while (0) +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("dsb ld" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("dsb st" ::: "memory"); \ +} while (0) + +#elif defined(__s390x__) +#define WT_PAUSE() __asm__ volatile("lr 0,0" ::: "memory") +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("bcr 15,0\n" ::: "memory"); \ } while (0) +#define WT_READ_BARRIER() WT_FULL_BARRIER() +#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +#elif defined(__sparc__) +#define WT_PAUSE() __asm__ volatile("rd %%ccr, %%g0" ::: "memory") + +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("membar #StoreLoad" ::: "memory"); \ +} while (0) + +/* + * 
On UltraSparc machines, TSO is used, and so there is no need for membar. + * READ_BARRIER = #LoadLoad, and WRITE_BARRIER = #StoreStore are noop. + */ #define WT_READ_BARRIER() do { \ - __asm__ volatile ("dsb ld" ::: "memory"); \ + __asm__ volatile ("" ::: "memory"); \ } while (0) + #define WT_WRITE_BARRIER() do { \ - __asm__ volatile ("dsb st" ::: "memory"); \ + __asm__ volatile ("" ::: "memory"); \ } while (0) #else diff --git a/src/include/log.h b/src/include/log.h index 521de567fc0..e7737e12663 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -267,6 +267,11 @@ struct __wt_log_desc { }; /* + * Flags for __wt_txn_op_printlog. + */ +#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */ + +/* * WT_LOG_REC_DESC -- * A descriptor for a log record type. */ diff --git a/src/include/misc.h b/src/include/misc.h index e542baec642..898e44eb8e0 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -268,3 +268,6 @@ union __wt_rand_state { uint32_t w, z; } x; }; + +/* Shared array for converting to hex */ +extern const u_char __wt_hex[]; diff --git a/src/include/session.h b/src/include/session.h index 5c3bcfb8ed0..1eca49f2c40 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -74,7 +74,10 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { TAILQ_HEAD(__cursors, __wt_cursor) cursors; WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ - WT_COMPACT *compact; /* Compact state */ + + WT_COMPACT *compact; /* Compaction information */ + enum { WT_COMPACT_NONE=0, + WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state; /* * Lookaside table cursor, sweep and eviction worker threads only. 
@@ -134,8 +137,6 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); - bool compaction; /* Compaction did some work */ - uint32_t flags; /* diff --git a/src/include/stat.h b/src/include/stat.h index dfe7ee5c6cd..a554607b7d5 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -139,8 +139,8 @@ __wt_stats_clear(void *stats_arg, int slot) */ #define WT_STAT_READ(stats, fld) \ __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)) -#define WT_STAT_WRITE(session, stats, fld) \ - ((stats)[WT_STATS_SLOT_ID(session)]->fld); +#define WT_STAT_WRITE(stats, fld, v) \ + (stats)->fld = (int64_t)(v) #define WT_STAT_DECRV(session, stats, fld, value) \ (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value) diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 08f73386090..bdd8bb65910 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -896,18 +896,17 @@ struct __wt_session { * boolean flag; default \c false.} * @config{bulk, configure the cursor for bulk-loading\, a fast\, * initial load path (see @ref tune_bulk_load for more information). - * Bulk-load may only be used for newly created objects and cursors - * configured for bulk-load only support the WT_CURSOR::insert and - * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys - * must be loaded in sorted order. The value is usually a true/false - * flag; when bulk-loading fixed-length column store objects\, the - * special value \c bitmap allows chunks of a memory resident bitmap to - * be loaded directly into a file by passing a \c WT_ITEM to - * WT_CURSOR::set_value where the \c size field indicates the number of - * records in the bitmap (as specified by the object's \c value_format - * configuration). 
Bulk-loaded bitmap values must end on a byte boundary - * relative to the bit count (except for the last set of values - * loaded)., a string; default \c false.} + * Bulk-load may only be used for newly created objects and applications + * should use the WT_CURSOR::insert method to insert rows. When + * bulk-loading\, rows must be loaded in sorted order. The value is + * usually a true/false flag; when bulk-loading fixed-length column + * store objects\, the special value \c bitmap allows chunks of a memory + * resident bitmap to be loaded directly into a file by passing a \c + * WT_ITEM to WT_CURSOR::set_value where the \c size field indicates the + * number of records in the bitmap (as specified by the object's \c + * value_format configuration). Bulk-loaded bitmap values must end on a + * byte boundary relative to the bit count (except for the last set of + * values loaded)., a string; default \c false.} * @config{checkpoint, the name of a checkpoint to open (the reserved * name "WiredTigerCheckpoint" opens the most recent internal checkpoint * taken for the object). The cursor does not support data @@ -921,10 +920,19 @@ struct __wt_session { * string\, chosen from the following options: \c "hex"\, \c "json"\, \c * "print"; default empty.} * @config{next_random, configure the cursor to return a pseudo-random - * record from the object; valid only for row-store cursors. Cursors - * configured with \c next_random=true only support the WT_CURSOR::next - * and WT_CURSOR::close methods. See @ref cursor_random for details., a - * boolean flag; default \c false.} + * record from the object when the WT_CURSOR::next method is called; + * valid only for row-store cursors. See @ref cursor_random for + * details., a boolean flag; default \c false.} + * @config{next_random_sample_size, cursors configured by \c next_random + * to return pseudo-random records from the object randomly select from + * the entire object\, by default. 
Setting \c next_random_sample_size + * to a non-zero value sets the number of samples the application + * expects to take using the \c next_random cursor. A cursor configured + * with both \c next_random and \c next_random_sample_size attempts to + * divide the object into \c next_random_sample_size equal-sized + * pieces\, and each retrieval returns a record from one of those + * pieces. See @ref cursor_random for details., a string; default \c + * 0.} * @config{overwrite, configures whether the cursor's insert\, update * and remove methods check the existing state of the record. If \c * overwrite is \c false\, WT_CURSOR::insert fails with diff --git a/src/log/log_auto.c b/src/log/log_auto.c index 5a1d03b1976..54df01d01ab 100644 --- a/src/log/log_auto.c +++ b/src/log/log_auto.c @@ -69,7 +69,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src, } static int -__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) { size_t needed; @@ -79,6 +79,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) return (0); } +static int +__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +{ + size_t needed; + + needed = item->size * 2 + 1; + WT_RET(__wt_realloc(session, NULL, needed, destp)); + __wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL); + return (0); +} + int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, @@ -121,7 +132,8 @@ __wt_logop_col_put_unpack( int __wt_logop_col_put_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -138,9 +150,14 @@ __wt_logop_col_put_print( " \"fileid\": \"%" PRIu32 "\",\n", fileid)); WT_ERR(__wt_fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno)); - 
WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + WT_ERR(__logrec_make_json_str(session, &escaped, &value)); WT_ERR(__wt_fprintf(out, " \"value\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + ",\n \"value-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -188,11 +205,13 @@ __wt_logop_col_remove_unpack( int __wt_logop_col_remove_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t fileid; uint64_t recno; + WT_UNUSED(flags); WT_RET(__wt_logop_col_remove_unpack( session, pp, end, &fileid, &recno)); @@ -246,12 +265,14 @@ __wt_logop_col_truncate_unpack( int __wt_logop_col_truncate_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t fileid; uint64_t start; uint64_t stop; + WT_UNUSED(flags); WT_RET(__wt_logop_col_truncate_unpack( session, pp, end, &fileid, &start, &stop)); @@ -307,7 +328,8 @@ __wt_logop_row_put_unpack( int __wt_logop_row_put_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -322,12 +344,22 @@ __wt_logop_row_put_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(out, " \"key\": \"%s\",\n", escaped)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &key)); + 
WT_ERR(__wt_fprintf(out, + " \"key-hex\": \"%s\",\n", escaped)); + } + WT_ERR(__logrec_make_json_str(session, &escaped, &value)); WT_ERR(__wt_fprintf(out, " \"value\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + ",\n \"value-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -375,7 +407,8 @@ __wt_logop_row_remove_unpack( int __wt_logop_row_remove_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -389,9 +422,14 @@ __wt_logop_row_remove_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(out, " \"key\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + ",\n \"key-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -439,7 +477,8 @@ __wt_logop_row_truncate_unpack( int __wt_logop_row_truncate_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -455,12 +494,22 @@ __wt_logop_row_truncate_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &start)); + WT_ERR(__logrec_make_json_str(session, &escaped, &start)); WT_ERR(__wt_fprintf(out, " \"start\": \"%s\",\n", escaped)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &stop)); + if 
(LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &start)); + WT_ERR(__wt_fprintf(out, + " \"start-hex\": \"%s\",\n", escaped)); + } + WT_ERR(__logrec_make_json_str(session, &escaped, &stop)); WT_ERR(__wt_fprintf(out, " \"stop\": \"%s\",\n", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &stop)); + WT_ERR(__wt_fprintf(out, + " \"stop-hex\": \"%s\",\n", escaped)); + } WT_ERR(__wt_fprintf(out, " \"mode\": \"%" PRIu32 "\"", mode)); @@ -470,7 +519,8 @@ err: __wt_free(session, escaped); int __wt_txn_op_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t optype, opsize; @@ -480,27 +530,33 @@ __wt_txn_op_printlog( switch (optype) { case WT_LOGOP_COL_PUT: - WT_RET(__wt_logop_col_put_print(session, pp, end, out)); + WT_RET(__wt_logop_col_put_print(session, pp, end, out, + flags)); break; case WT_LOGOP_COL_REMOVE: - WT_RET(__wt_logop_col_remove_print(session, pp, end, out)); + WT_RET(__wt_logop_col_remove_print(session, pp, end, out, + flags)); break; case WT_LOGOP_COL_TRUNCATE: - WT_RET(__wt_logop_col_truncate_print(session, pp, end, out)); + WT_RET(__wt_logop_col_truncate_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_PUT: - WT_RET(__wt_logop_row_put_print(session, pp, end, out)); + WT_RET(__wt_logop_row_put_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_REMOVE: - WT_RET(__wt_logop_row_remove_print(session, pp, end, out)); + WT_RET(__wt_logop_row_remove_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_TRUNCATE: - WT_RET(__wt_logop_row_truncate_print(session, pp, end, out)); + WT_RET(__wt_logop_row_truncate_print(session, pp, end, out, + flags)); break; WT_ILLEGAL_VALUE(session); diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index c1eb7a2a389..7c53990a2a2 100644 --- a/src/lsm/lsm_stat.c 
+++ b/src/lsm/lsm_stat.c @@ -91,7 +91,7 @@ __curstat_lsm_init( * top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - new->lsm_generation_max = chunk->generation; + WT_STAT_WRITE(new, lsm_generation_max, chunk->generation); /* Aggregate statistics from each new chunk. */ __wt_stat_dsrc_aggregate_single(new, stats); @@ -115,37 +115,40 @@ __curstat_lsm_init( * into the top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - new->bloom_size = - (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8); - new->bloom_page_evict = - new->cache_eviction_clean + new->cache_eviction_dirty; - new->bloom_page_read = new->cache_read; + WT_STAT_WRITE(new, bloom_size, + (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8)); + WT_STAT_WRITE(new, bloom_page_evict, + new->cache_eviction_clean + new->cache_eviction_dirty); + WT_STAT_WRITE(new, bloom_page_read, new->cache_read); __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); } /* Set statistics that aren't aggregated directly into the cursor */ - stats->bloom_count = bloom_count; - stats->lsm_chunk_count = lsm_tree->nchunks; + WT_STAT_WRITE(stats, bloom_count, bloom_count); + WT_STAT_WRITE(stats, lsm_chunk_count, lsm_tree->nchunks); /* Include, and optionally clear, LSM-level specific information. 
*/ - stats->bloom_miss = lsm_tree->bloom_miss; + WT_STAT_WRITE(stats, bloom_miss, lsm_tree->bloom_miss); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_miss = 0; - stats->bloom_hit = lsm_tree->bloom_hit; + WT_STAT_WRITE(stats, bloom_hit, lsm_tree->bloom_hit); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_hit = 0; - stats->bloom_false_positive = lsm_tree->bloom_false_positive; + WT_STAT_WRITE( + stats, bloom_false_positive, lsm_tree->bloom_false_positive); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_false_positive = 0; - stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom; + WT_STAT_WRITE( + stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_lookup_no_bloom = 0; - stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle; + WT_STAT_WRITE( + stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle; + WT_STAT_WRITE(stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_merge_throttle = 0; diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 13e8b31916f..3bd57846862 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -271,8 +271,7 @@ err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_READ)); * Update the turtle file. */ int -__wt_turtle_update( - WT_SESSION_IMPL *session, const char *key, const char *value) +__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) { WT_FH *fh; WT_DECL_ITEM(buf); diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c index e95ccb0ade2..4276c89dbcf 100644 --- a/src/os_posix/os_map.c +++ b/src/os_posix/os_map.c @@ -48,8 +48,6 @@ __wt_mmap(WT_SESSION_IMPL *session, return (0); } -#define WT_VM_PAGESIZE 4096 - /* * __wt_mmap_preload -- * Cause a section of a memory map to be faulted in. 
@@ -59,9 +57,10 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. */ + WT_CONNECTION_IMPL *conn = S2C(session); WT_BM *bm = S2BT(session)->bm; WT_DECL_RET; - void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1)); size += WT_PTRDIFF(p, blk); /* XXX proxy for "am I doing a scan?" -- manual read-ahead */ @@ -78,9 +77,9 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) * Manual pages aren't clear on whether alignment is required for the * size, so we will be conservative. */ - size &= ~(size_t)(WT_VM_PAGESIZE - 1); + size &= ~(size_t)(conn->page_size - 1); - if (size > WT_VM_PAGESIZE && + if (size > (size_t)conn->page_size && (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0) WT_RET_MSG(session, ret, "posix_madvise will need"); #else @@ -101,8 +100,9 @@ __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. */ + WT_CONNECTION_IMPL *conn = S2C(session); WT_DECL_RET; - void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1)); size += WT_PTRDIFF(p, blk); if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0) diff --git a/src/os_posix/os_pagesize.c b/src/os_posix/os_pagesize.c new file mode 100644 index 00000000000..e7c7b4fdf15 --- /dev/null +++ b/src/os_posix/os_pagesize.c @@ -0,0 +1,19 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_get_vm_pagesize -- + * Return the default page size of a virtual memory page. 
+ */ +int +__wt_get_vm_pagesize(void) +{ + return (getpagesize()); +} diff --git a/src/os_win/os_pagesize.c b/src/os_win/os_pagesize.c new file mode 100644 index 00000000000..55cd6a694ec --- /dev/null +++ b/src/os_win/os_pagesize.c @@ -0,0 +1,23 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_get_vm_pagesize -- + * Return the default page size of a virtual memory page. + */ +int +__wt_get_vm_pagesize(void) +{ + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + + return (system_info.dwPageSize); +} diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index fd2aec45115..2b07117f9d5 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -1276,6 +1276,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, for (upd = upd_list; upd->next != NULL; upd = upd->next) ; upd->next = append; + __wt_cache_page_inmem_incr( + session, page, WT_UPDATE_MEMSIZE(append)); } /* @@ -1756,7 +1758,7 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key) * Figure out the maximum leaf page size for the reconciliation. */ static inline uint32_t -__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) { WT_BTREE *btree; WT_PAGE *page; @@ -3263,7 +3265,14 @@ supd_check_complete: memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header); bnd->cksum = __wt_cksum(buf->data, buf->size); - if (mod->rec_result == WT_PM_REC_MULTIBLOCK && + /* + * One last check: don't reuse blocks if compacting, the reason + * for compaction is to move blocks to different locations. We + * do this check after calculating the checksums, hopefully the + * next write can be skipped. 
+ */ + if (session->compact_state == WT_COMPACT_NONE && + mod->rec_result == WT_PM_REC_MULTIBLOCK && mod->mod_multi_entries > bnd_slot) { multi = &mod->mod_multi[bnd_slot]; if (multi->size == bnd->size && @@ -4465,7 +4474,7 @@ compare: /* WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); if (upd == NULL) continue; - for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { + for (n = WT_INSERT_RECNO(ins); src_recno <= n;) { /* * The application may have inserted records which left * gaps in the name space, and these gaps can be huge. @@ -4505,7 +4514,7 @@ compare: /* last->size == size && memcmp(last->data, data, size) == 0)) { ++rle; - continue; + goto next; } WT_ERR(__rec_col_var_helper(session, r, salvage, last, last_deleted, 0, rle)); @@ -4524,6 +4533,15 @@ compare: /* } last_deleted = deleted; rle = 1; + + /* + * Move to the next record. It's not a simple increment + * because if it's the maximum record, incrementing it + * wraps to 0 and this turns into an infinite loop. + */ +next: if (src_recno == UINT64_MAX) + break; + ++src_recno; } } diff --git a/src/session/session_api.c b/src/session/session_api.c index 053f69ee7f8..f0d0f26db54 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -148,7 +148,7 @@ __session_close(WT_SESSION *wt_session, const char *config) * via the registered close callback. 
*/ if (session->event_handler->handle_close != NULL && - !WT_STREQ(cursor->uri, WT_LAS_URI)) + !WT_STREQ(cursor->internal_uri, WT_LAS_URI)) WT_TRET(session->event_handler->handle_close( session->event_handler, wt_session, cursor)); WT_TRET(cursor->close(cursor)); diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 456fcd3ce03..8a5b741c0c5 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -172,12 +172,12 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) for (i = 0; i < 100; ++i) { WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); - session->compaction = false; + session->compact_state = WT_COMPACT_RUNNING; WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( session, uri, __wt_compact, NULL, cfg, 0)); WT_ERR(ret); - if (!session->compaction) + if (session->compact_state != WT_COMPACT_SUCCESS) break; WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); @@ -185,7 +185,9 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) WT_ERR(__session_compact_check_timeout(session, start_time)); } -err: __wt_scr_free(session, &t); +err: session->compact_state = WT_COMPACT_NONE; + + __wt_scr_free(session, &t); return (ret); } diff --git a/src/support/global.c b/src/support/global.c index 1e32f5b4453..2330a65a707 100644 --- a/src/support/global.c +++ b/src/support/global.c @@ -12,28 +12,6 @@ WT_PROCESS __wt_process; /* Per-process structure */ static int __wt_pthread_once_failed; /* If initialization failed */ /* - * __system_is_little_endian -- - * Check if the system is little endian. 
- */ -static int -__system_is_little_endian(void) -{ - uint64_t v; - bool little; - - v = 1; - little = *((uint8_t *)&v) != 0; - - if (little) - return (0); - - fprintf(stderr, - "This release of the WiredTiger data engine does not support " - "big-endian systems; contact WiredTiger for more information.\n"); - return (EINVAL); -} - -/* * __wt_global_once -- * Global initialization, run once. */ @@ -42,11 +20,6 @@ __wt_global_once(void) { WT_DECL_RET; - if ((ret = __system_is_little_endian()) != 0) { - __wt_pthread_once_failed = ret; - return; - } - if ((ret = __wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) { __wt_pthread_once_failed = ret; @@ -115,7 +88,7 @@ __wt_attach(WT_SESSION_IMPL *session) /* Sleep forever, the debugger will interrupt us when it attaches. */ for (;;) - __wt_sleep(100, 0); + __wt_sleep(10, 0); #else WT_UNUSED(session); #endif diff --git a/src/support/hash_city.c b/src/support/hash_city.c index 9a4a6464f40..33f4113c004 100644 --- a/src/support/hash_city.c +++ b/src/support/hash_city.c @@ -99,6 +99,12 @@ static uint32_t UNALIGNED_LOAD32(const char *p) { #define bswap_32(x) OSSwapInt32(x) #define bswap_64(x) OSSwapInt64(x) +#elif defined(__sun) + +#include <sys/byteorder.h> +#define bswap_32 BSWAP_32 +#define bswap_64 BSWAP_64 + #else #include <byteswap.h> #endif diff --git a/src/support/hex.c b/src/support/hex.c index eb9f420911a..5fb8d4bc190 100644 --- a/src/support/hex.c +++ b/src/support/hex.c @@ -8,7 +8,7 @@ #include "wt_internal.h" -static const u_char hex[] = "0123456789abcdef"; +const u_char __wt_hex[] = "0123456789abcdef"; /* * __fill_hex -- @@ -25,8 +25,8 @@ __fill_hex(const uint8_t *src, size_t src_max, --dest_max; for (; src_max > 0 && dest_max > 1; src_max -= 1, dest_max -= 2, ++src) { - *dest++ = hex[(*src & 0xf0) >> 4]; - *dest++ = hex[*src & 0x0f]; + *dest++ = __wt_hex[(*src & 0xf0) >> 4]; + *dest++ = __wt_hex[*src & 0x0f]; } *dest++ = '\0'; if (lenp != NULL) @@ -34,6 +34,17 @@ __fill_hex(const uint8_t *src, size_t 
src_max, } /* + * __wt_fill_hex -- + * In-memory conversion of raw bytes to a hexadecimal representation. + */ +void +__wt_fill_hex(const uint8_t *src, size_t src_max, + uint8_t *dest, size_t dest_max, size_t *lenp) +{ + __fill_hex(src, src_max, dest, dest_max, lenp); +} + +/* * __wt_raw_to_hex -- * Convert a chunk of data to a nul-terminated printable hex string. */ @@ -83,8 +94,8 @@ __wt_raw_to_esc_hex( *t++ = *p; } else { *t++ = '\\'; - *t++ = hex[(*p & 0xf0) >> 4]; - *t++ = hex[*p & 0x0f]; + *t++ = __wt_hex[(*p & 0xf0) >> 4]; + *t++ = __wt_hex[*p & 0x0f]; } *t++ = '\0'; to->size = WT_PTRDIFF(t, to->mem); diff --git a/src/support/huffman.c b/src/support/huffman.c index 4bda365cb10..9488dbf14fe 100644 --- a/src/support/huffman.c +++ b/src/support/huffman.c @@ -1,9 +1,31 @@ -/*- +/* * Copyright (c) 2014-2015 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * - * See the file LICENSE for redistribution information. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name MongoDB or the name WiredTiger + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include "wt_internal.h" diff --git a/src/support/rand.c b/src/support/rand.c index f5ecb12633e..3adcb801f03 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -60,6 +60,29 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state) } /* + * __wt_random_init_seed -- + * Initialize the state of a 32-bit pseudo-random number. + * Use this, instead of __wt_random_init if we are running with multiple + * threads and we want each thread to initialize its own random state based + * on a different random seed. + */ +int +__wt_random_init_seed( + WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state) +{ + struct timespec ts; + WT_RAND_STATE rnd; + + WT_RET(__wt_epoch(session, &ts)); + M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629); + M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069); + + *rnd_state = rnd; + + return (0); +} + +/* * __wt_random -- * Return a 32-bit pseudo-random number. 
*/ diff --git a/src/support/stat.c b/src/support/stat.c index 4d7cd65fd18..7a615131628 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -250,19 +250,24 @@ __wt_stat_dsrc_aggregate_single( to->block_alloc += from->block_alloc; to->block_free += from->block_free; to->block_checkpoint_size += from->block_checkpoint_size; - to->allocation_size = from->allocation_size; + if (from->allocation_size > to->allocation_size) + to->allocation_size = from->allocation_size; to->block_reuse_bytes += from->block_reuse_bytes; - to->block_magic = from->block_magic; - to->block_major = from->block_major; + if (from->block_magic > to->block_magic) + to->block_magic = from->block_magic; + if (from->block_major > to->block_major) + to->block_major = from->block_major; to->block_size += from->block_size; - to->block_minor = from->block_minor; + if (from->block_minor > to->block_minor) + to->block_minor = from->block_minor; to->btree_checkpoint_generation += from->btree_checkpoint_generation; to->btree_column_fix += from->btree_column_fix; to->btree_column_internal += from->btree_column_internal; to->btree_column_deleted += from->btree_column_deleted; to->btree_column_variable += from->btree_column_variable; to->btree_column_rle += from->btree_column_rle; - to->btree_fixed_len = from->btree_fixed_len; + if (from->btree_fixed_len > to->btree_fixed_len) + to->btree_fixed_len = from->btree_fixed_len; if (from->btree_maxintlkey > to->btree_maxintlkey) to->btree_maxintlkey = from->btree_maxintlkey; if (from->btree_maxintlpage > to->btree_maxintlpage) @@ -367,12 +372,16 @@ __wt_stat_dsrc_aggregate( to->block_free += WT_STAT_READ(from, block_free); to->block_checkpoint_size += WT_STAT_READ(from, block_checkpoint_size); - to->allocation_size = from[0]->allocation_size; + if ((v = WT_STAT_READ(from, allocation_size)) > to->allocation_size) + to->allocation_size = v; to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes); - to->block_magic = from[0]->block_magic; - 
to->block_major = from[0]->block_major; + if ((v = WT_STAT_READ(from, block_magic)) > to->block_magic) + to->block_magic = v; + if ((v = WT_STAT_READ(from, block_major)) > to->block_major) + to->block_major = v; to->block_size += WT_STAT_READ(from, block_size); - to->block_minor = from[0]->block_minor; + if ((v = WT_STAT_READ(from, block_minor)) > to->block_minor) + to->block_minor = v; to->btree_checkpoint_generation += WT_STAT_READ(from, btree_checkpoint_generation); to->btree_column_fix += WT_STAT_READ(from, btree_column_fix); @@ -382,15 +391,14 @@ __wt_stat_dsrc_aggregate( to->btree_column_variable += WT_STAT_READ(from, btree_column_variable); to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); - to->btree_fixed_len = from[0]->btree_fixed_len; - if ((v = WT_STAT_READ(from, btree_maxintlkey)) > - to->btree_maxintlkey) + if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len) + to->btree_fixed_len = v; + if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey) to->btree_maxintlkey = v; if ((v = WT_STAT_READ(from, btree_maxintlpage)) > to->btree_maxintlpage) to->btree_maxintlpage = v; - if ((v = WT_STAT_READ(from, btree_maxleafkey)) > - to->btree_maxleafkey) + if ((v = WT_STAT_READ(from, btree_maxleafkey)) > to->btree_maxleafkey) to->btree_maxleafkey = v; if ((v = WT_STAT_READ(from, btree_maxleafpage)) > to->btree_maxleafpage) diff --git a/src/txn/txn.c b/src/txn/txn.c index f835fea8f67..0a3e4a7a7db 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -216,6 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) conn = S2C(session); txn_global = &conn->txn_global; +retry: current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; @@ -287,43 +288,60 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. 
*/ - if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && - __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - if ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LT(id, last_running)) - last_running = id; - if ((id = s->snap_min) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; - } - - if (WT_TXNID_LT(last_running, oldest_id)) - oldest_id = last_running; - -#ifdef HAVE_DIAGNOSTIC + if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) { /* - * Make sure the ID doesn't move past any named snapshots. - * - * Don't include the read/assignment in the assert statement. - * Coverity complains if there are assignments only done in - * diagnostic builds, and when the read is from a volatile. + * We know we want to update. Check if we're racing. */ - id = txn_global->nsnap_oldest_id; - WT_ASSERT(session, - id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); + if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; + i < session_cnt; i++, s++) { + if ((id = s->id) != WT_TXN_NONE && + WT_TXNID_LT(id, last_running)) + last_running = id; + if ((id = s->snap_min) != WT_TXN_NONE && + WT_TXNID_LT(id, oldest_id)) + oldest_id = id; + } + + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; + +#ifdef HAVE_DIAGNOSTIC + /* + * Make sure the ID doesn't move past any named + * snapshots. + * + * Don't include the read/assignment in the assert + * statement. Coverity complains if there are + * assignments only done in diagnostic builds, and + * when the read is from a volatile. 
+ */ + id = txn_global->nsnap_oldest_id; + WT_ASSERT(session, + id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif - if (WT_TXNID_LT(txn_global->last_running, last_running)) - txn_global->last_running = last_running; - if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) - txn_global->oldest_id = oldest_id; - WT_ASSERT(session, txn_global->scan_count == -1); - txn_global->scan_count = 0; + if (WT_TXNID_LT(txn_global->last_running, last_running)) + txn_global->last_running = last_running; + if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) + txn_global->oldest_id = oldest_id; + WT_ASSERT(session, txn_global->scan_count == -1); + txn_global->scan_count = 0; + } else { + /* + * We wanted to update the oldest ID but we're racing + * another thread. Retry if this is a forced update. + */ + WT_ASSERT(session, txn_global->scan_count > 0); + (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); + if (force) { + __wt_yield(); + goto retry; + } + } } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && - current_id - oldest_id > 10000 && last_running_moved && - oldest_session != NULL) { + current_id - oldest_id > 10000 && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index c5fa52dea6a..148ed868792 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -8,6 +8,12 @@ #include "wt_internal.h" +/* Cookie passed to __txn_printlog. */ +typedef struct { + FILE *out; + uint32_t flags; +} WT_TXN_PRINTLOG_ARGS; + /* * __txn_op_log -- * Log an operation for the current transaction. 
@@ -64,7 +70,8 @@ err: __wt_buf_free(session, &key); */ static int __txn_commit_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, + uint32_t flags) { bool firstrecord; @@ -79,7 +86,7 @@ __txn_commit_printlog( firstrecord = false; - WT_RET(__wt_txn_op_printlog(session, pp, end, out)); + WT_RET(__wt_txn_op_printlog(session, pp, end, out, flags)); WT_RET(__wt_fprintf(out, "\n }")); } @@ -459,6 +466,7 @@ __txn_printlog(WT_SESSION_IMPL *session, FILE *out; WT_LOG_RECORD *logrec; WT_LSN ckpt_lsn; + WT_TXN_PRINTLOG_ARGS *args; const uint8_t *end, *p; const char *msg; uint64_t txnid; @@ -467,7 +475,8 @@ __txn_printlog(WT_SESSION_IMPL *session, bool compressed; WT_UNUSED(next_lsnp); - out = cookie; + args = cookie; + out = args->out; p = WT_LOG_SKIP_HEADER(rawrec->data); end = (const uint8_t *)rawrec->data + rawrec->size; @@ -506,7 +515,8 @@ __txn_printlog(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n")); WT_RET(__wt_fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid)); - WT_RET(__txn_commit_printlog(session, &p, end, out)); + WT_RET(__txn_commit_printlog(session, &p, end, out, + args->flags)); break; case WT_LOGREC_FILE_SYNC: @@ -537,15 +547,18 @@ __txn_printlog(WT_SESSION_IMPL *session, * Print the log in a human-readable format. 
*/ int -__wt_txn_printlog(WT_SESSION *wt_session, FILE *out) +__wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags) { WT_SESSION_IMPL *session; + WT_TXN_PRINTLOG_ARGS args; session = (WT_SESSION_IMPL *)wt_session; + args.out = out; + args.flags = flags; WT_RET(__wt_fprintf(out, "[\n")); WT_RET(__wt_log_scan( - session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out)); + session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args)); WT_RET(__wt_fprintf(out, "\n]\n")); return (0); diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 9cbda08690e..3b7187bd0de 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -226,7 +226,6 @@ main(int argc, char *argv[]) ret = func(session, argc, argv); /* Close the database. */ - err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) ret = tret; diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c index d202b09b228..3a665c1c657 100644 --- a/src/utilities/util_printlog.c +++ b/src/utilities/util_printlog.c @@ -15,10 +15,10 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - bool printable; + uint32_t flags; - printable = false; - while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF) + flags = 0; + while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) switch (ch) { case 'f': /* output file */ if (freopen(__wt_optarg, "w", stdout) == NULL) { @@ -27,8 +27,8 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) return (1); } break; - case 'p': - printable = true; + case 'x': /* hex output */ + LF_SET(WT_TXN_PRINTLOG_HEX); break; case '?': default: @@ -41,8 +41,7 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - WT_UNUSED(printable); - ret = __wt_txn_printlog(session, stdout); + ret = __wt_txn_printlog(session, stdout, flags); if (ret != 0) { fprintf(stderr, "%s: printlog failed: %s\n", @@ -61,7 +60,7 @@ usage(void) { (void)fprintf(stderr, 
"usage: %s %s " - "printlog [-p] [-f output-file]\n", + "printlog [-x] [-f output-file]\n", progname, usage_prefix); return (1); } diff --git a/test/format/ops.c b/test/format/ops.c index c705d362fe8..7e299b7d975 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -504,7 +504,7 @@ skip_insert: if (col_update(tinfo, */ if (!insert) { dir = (int)mmrand(&tinfo->rnd, 0, 1); - for (np = 0; np < mmrand(&tinfo->rnd, 1, 8); ++np) { + for (np = 0; np < mmrand(&tinfo->rnd, 1, 30); ++np) { if (notfound) break; if (nextprev(cursor, dir, ¬found)) diff --git a/test/suite/test_bulk01.py b/test/suite/test_bulk01.py index 80b420c9392..df027df0ddd 100644 --- a/test/suite/test_bulk01.py +++ b/test/suite/test_bulk01.py @@ -130,7 +130,7 @@ class test_bulk_load(wttest.WiredTigerTestCase): # Test that variable-length column-store bulk-load efficiently creates big # records. - def test_bulk_load_col_delete_big(self): + def test_bulk_load_col_big(self): if self.keyfmt != 'r' or self.valfmt == '8t': return diff --git a/test/suite/test_colgap.py b/test/suite/test_colgap.py index 4192f14c5e6..924d622a024 100644 --- a/test/suite/test_colgap.py +++ b/test/suite/test_colgap.py @@ -28,6 +28,7 @@ import wiredtiger, wttest from helper import simple_populate, key_populate, value_populate +from wtscenario import check_scenarios, multiply_scenarios, number_scenarios # test_colgap.py # Test variable-length column-store gap performance. @@ -119,5 +120,90 @@ class test_column_store_gap(wttest.WiredTigerTestCase): self.backward(cursor, list(reversed(v))) +# Basic testing of variable-length column-store with big records. 
+class test_colmax(wttest.WiredTigerTestCase): + name = 'test_colmax' + + types = [ + ('file', dict(type='file:')), + ('table', dict(type='table:')) + ] + valfmt = [ + ('integer', dict(valfmt='i')), + ('string', dict(valfmt='S')), + ] + record_number = [ + ('big', dict(recno=18446744073709551606)), + ('max', dict(recno=18446744073709551615)), + ] + bulk = [ + ('bulk', dict(bulk=1)), + ('not-bulk', dict(bulk=0)), + ] + reopen = [ + ('reopen', dict(reopen=1)), + ('not-reopen', dict(reopen=0)), + ] + single = [ + ('single', dict(single=1)), + ('not-single', dict(single=0)), + ] + + scenarios = number_scenarios(multiply_scenarios(\ + '.', types, valfmt, record_number, bulk, reopen, single)) + + # Test that variable-length column-store correctly/efficiently handles big + # records (if it's not efficient, we'll just hang). + def test_colmax_op(self): + recno = self.recno + + uri = self.type + self.name + self.session.create(uri, 'key_format=r' +',value_format=' + self.valfmt) + + # Insert a big record with/without a bulk cursor. + bulk_config = "" + if self.bulk: + bulk_config = "bulk" + cursor = self.session.open_cursor(uri, None, bulk_config) + + # Optionaly make the big record the only record in the table. + if not self.single: + for i in range(1, 723): + cursor[key_populate(cursor, i)] = value_populate(cursor, i) + + # Confirm searching past the end of the table works. + if not self.bulk: + cursor.set_key(recno) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + + # Insert the big record. + cursor[key_populate(cursor, recno)] = value_populate(cursor, recno) + + # Optionally flush to disk; re-open the cursor as necessary. + if self.bulk or self.reopen: + cursor.close() + if self.reopen == 1: + self.reopen_conn() + if self.bulk or self.reopen: + cursor = self.session.open_cursor(uri, None, None) + + # Search for the large record. 
+ cursor.set_key(recno) + self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_value(), value_populate(cursor, recno)) + + # Update it. + cursor[key_populate(cursor, recno)] = value_populate(cursor, 37) + cursor.set_key(recno) + self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_value(), value_populate(cursor, 37)) + + # Remove it. + cursor.set_key(recno) + self.assertEqual(cursor.remove(), 0) + cursor.set_key(key_populate(cursor, recno)) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_compact.py b/test/suite/test_compact01.py index c7269785115..c7269785115 100644 --- a/test/suite/test_compact.py +++ b/test/suite/test_compact01.py diff --git a/test/suite/test_compact02.py b/test/suite/test_compact02.py new file mode 100644 index 00000000000..f2d5c1fa283 --- /dev/null +++ b/test/suite/test_compact02.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_compact02.py +# Test that compact reduces the file size. +# + +import wiredtiger, wttest +from wiredtiger import stat +from wtscenario import multiply_scenarios, number_scenarios + +# Test basic compression +class test_compact02(wttest.WiredTigerTestCase): + + types = [ + ('file', dict(uri='file:test_compact02')), + ] + cacheSize = [ + ('default', dict(cacheSize='')), + ('1mb', dict(cacheSize='cache_size=1MB')), + ('10gb', dict(cacheSize='cache_size=10GB')), + ] + + # There's a balance between the pages we create and the size of the records + # being stored: compaction doesn't work on tables with many overflow items + # because we don't rewrite them. Experimentally, 8KB is as small as the test + # can go. Additionally, we can't set the maximum page size too large because + # there won't be enough pages to rewrite. Experimentally, 32KB (the default) + # is as large as the test can go. + fileConfig = [ + ('default', dict(fileConfig='')), + ('8KB', dict(fileConfig='leaf_page_max=8kb')), + ] + scenarios = \ + number_scenarios(multiply_scenarios('.', types, cacheSize, fileConfig)) + + # We want about 22K records that total about 130Mb. That is an average + # of 6196 bytes per record. Half the records should be smaller, about + # 2700 bytes (about 30Mb) and the other half should be larger, 9666 bytes + # per record (about 100Mb). + # + # Test flow is as follows. + # + # 1. Create a table with the data, alternating record size. + # 2. Checkpoint and get stats on the table to confirm the size. + # 3. 
Delete the half of the records with the larger record size. + # 4. Call compact. + # 5. Get stats on compacted table. + # + nrecords = 22000 + bigvalue = "abcdefghi" * 1074 # 9*1074 == 9666 + smallvalue = "ihgfedcba" * 303 # 9*303 == 2727 + + fullsize = nrecords / 2 * len(bigvalue) + nrecords / 2 * len(smallvalue) + + # Return the size of the file + def getSize(self): + cstat = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(size)') + sz = cstat[stat.dsrc.block_size][2] + cstat.close() + return sz + + # This test varies the cache size and so needs to set up its own connection. + # Override the standard methods. + def setUpConnectionOpen(self, dir): + return None + def setUpSessionOpen(self, conn): + return None + def ConnectionOpen(self, cacheSize): + self.home = '.' + conn_params = 'create,' + \ + cacheSize + ',error_prefix="%s: ",' % self.shortid() + \ + 'statistics=(fast)' + try: + self.conn = wiredtiger.wiredtiger_open(self.home, conn_params) + except wiredtiger.WiredTigerError as e: + print "Failed conn at '%s' with config '%s'" % (dir, conn_params) + self.session = self.conn.open_session(None) + + # Create a table, add keys with both big and small values. + def test_compact02(self): + self.ConnectionOpen(self.cacheSize) + + mb = 1024 * 1024 + params = 'key_format=i,value_format=S,' + self.fileConfig + + # 1. Create a table with the data, alternating record size. + self.session.create(self.uri, params) + c = self.session.open_cursor(self.uri, None) + for i in range(self.nrecords): + if i % 2 == 0: + c[i] = str(i) + self.bigvalue + else: + c[i] = str(i) + self.smallvalue + c.close() + + # 2. Checkpoint and get stats on the table to confirm the size. + self.session.checkpoint() + sz = self.getSize() + self.pr('After populate ' + str(sz / mb) + 'MB') + self.assertGreater(sz, self.fullsize) + + # 3. Delete the half of the records with the larger record size. 
+ c = self.session.open_cursor(self.uri, None) + count = 0 + for i in range(self.nrecords): + if i % 2 == 0: + count += 1 + c.set_key(i) + c.remove() + c.close() + self.pr('Removed total ' + str((count * 9666) / mb) + 'MB') + + # 4. Call compact. + self.session.compact(self.uri, None) + + # 5. Get stats on compacted table. + sz = self.getSize() + self.pr('After compact ' + str(sz / mb) + 'MB') + + # After compact, the file size should be less than half the full size. + self.assertLess(sz, self.fullsize / 2) + + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py index 10a3140a2fd..b424dbbc7e3 100644 --- a/test/suite/test_cursor_random.py +++ b/test/suite/test_cursor_random.py @@ -29,90 +29,93 @@ import wiredtiger, wttest from helper import complex_populate, simple_populate from helper import key_populate, value_populate -from wtscenario import check_scenarios +from wtscenario import check_scenarios, multiply_scenarios, number_scenarios # test_cursor_random.py # Cursor next_random operations class test_cursor_random(wttest.WiredTigerTestCase): - scenarios = check_scenarios([ - ('file', dict(type='file:',fmt='S')), - ('table', dict(type='table:',fmt='S')) - ]) + types = [ + ('file', dict(type='file:random')), + ('table', dict(type='table:random')) + ] + config = [ + ('sample', dict(config='next_random=true,next_random_sample_size=35')), + ('not-sample', dict(config='next_random=true')) + ] + scenarios =number_scenarios(multiply_scenarios('.', types, config)) # Check that opening a random cursor on a row-store returns not-supported - # for every method except for next and reset, and next returns not-found. - def test_cursor_random_column(self): - uri = self.type + 'random' - self.session.create(uri, 'key_format=' + self.fmt + ',value_format=S') - cursor = self.session.open_cursor(uri, None, "next_random=true") + # for methods other than next, reconfigure and reset, and next returns + # not-found. 
+ def test_cursor_random(self): + uri = self.type + self.session.create(uri, 'key_format=S,value_format=S') + cursor = self.session.open_cursor(uri, None, self.config) self.assertRaises( wiredtiger.WiredTigerError, lambda: cursor.compare(cursor)) + self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert()) self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.prev()) + self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.remove()) self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.search()) self.assertRaises( wiredtiger.WiredTigerError, lambda: cursor.search_near()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert()) self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.update()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.remove()) - cursor.reset() self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + self.assertEquals(cursor.reconfigure(), 0) + self.assertEquals(cursor.reset(), 0) cursor.close() # Check that next_random works with a single value, repeatedly. def test_cursor_random_single_record(self): - uri = self.type + 'random' - self.session.create(uri, 'key_format=' + self.fmt + ',value_format=S') + uri = self.type + self.session.create(uri, 'key_format=S,value_format=S') cursor = self.session.open_cursor(uri, None) cursor['AAA'] = 'BBB' cursor.close() - cursor = self.session.open_cursor(uri, None, "next_random=true") + cursor = self.session.open_cursor(uri, None, self.config) for i in range(1,5): - cursor.next() + self.assertEquals(cursor.next(), 0) self.assertEquals(cursor.get_key(), 'AAA') cursor.close # Check that next_random works in the presence of a larger set of values, # where the values are in an insert list. 
def test_cursor_random_multiple_insert_records(self): - uri = self.type + 'random' - if self.type == 'file:': + uri = self.type + if uri.startswith('file:'): simple_populate(self, uri, - 'allocation_size=512,leaf_page_max=512,key_format=' +\ - self.fmt, 100) + 'allocation_size=512,leaf_page_max=512,key_format=S', 100) else: complex_populate(self, uri, - 'allocation_size=512,leaf_page_max=512,key_format=' +\ - self.fmt, 100) + 'allocation_size=512,leaf_page_max=512,key_format=S', 100) # In a insert list, next_random always selects the middle key/value # pair, all we can do is confirm cursor.next works. - cursor = self.session.open_cursor(uri, None, "next_random=true") + cursor = self.session.open_cursor(uri, None, self.config) self.assertEqual(cursor.next(), 0) # Check that next_random works in the presence of a larger set of values, # where the values are in a disk format page. def cursor_random_multiple_page_records(self, reopen): - uri = self.type + 'random' - if self.type == 'file:': + uri = self.type + if uri.startswith('file:'): simple_populate(self, uri, - 'allocation_size=512,leaf_page_max=512,key_format=' +\ - self.fmt, 10000) + 'allocation_size=512,leaf_page_max=512,key_format=S', 10000) else: complex_populate(self, uri, - 'allocation_size=512,leaf_page_max=512,key_format=' +\ - self.fmt, 10000) + 'allocation_size=512,leaf_page_max=512,key_format=S', 10000) # Optionally close the connection so everything is forced to disk, # insert lists are an entirely different path in the code. if reopen: self.reopen_conn() - cursor = self.session.open_cursor(uri, None, "next_random=true") + cursor = self.session.open_cursor(uri, None, self.config) last = '' match = 0 for i in range(1,10): - cursor.next() + self.assertEqual(cursor.next(), 0) current = cursor.get_key() if current == last: match += 1 @@ -128,23 +131,32 @@ class test_cursor_random(wttest.WiredTigerTestCase): # Check that opening a random cursor on column-store returns not-supported. 
class test_cursor_random_column(wttest.WiredTigerTestCase): scenarios = check_scenarios([ - ('file', dict(uri='file:random',fmt='r')), - ('table', dict(uri='table:random',fmt='r')), + ('file', dict(uri='file:random')), + ('table', dict(uri='table:random')) ]) def test_cursor_random_column(self): - self.session.create( - self.uri, 'key_format=' + self.fmt + ',value_format=S') - cursor = self.session.open_cursor(self.uri, None, "next_random=true") - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.next()) - cursor.close() + self.session.create(self.uri, 'key_format=r,value_format=S') + msg = '/Operation not supported/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.session.open_cursor(self.uri, None, "next_random=true"), msg) # Check next_random works in the presence a set of updates, some or all of # which are invisible to the cursor. class test_cursor_random_invisible(wttest.WiredTigerTestCase): + types = [ + ('file', dict(type='file:random')), + ('table', dict(type='table:random')) + ] + config = [ + ('sample', dict(config='next_random=true,next_random_sample_size=35')), + ('not-sample', dict(config='next_random=true')) + ] + scenarios =number_scenarios(multiply_scenarios('.', types, config)) + def test_cursor_random_invisible_all(self): - uri = 'file:random' + uri = self.type self.session.create(uri, 'key_format=S,value_format=S') cursor = self.session.open_cursor(uri, None) @@ -156,11 +168,11 @@ class test_cursor_random_invisible(wttest.WiredTigerTestCase): # Open another session, the updates won't yet be visible, we shouldn't # find anything at all. 
s = self.conn.open_session() - cursor = s.open_cursor(uri, None, "next_random=true") + cursor = s.open_cursor(uri, None, self.config) self.assertEqual(cursor.next(), wiredtiger.WT_NOTFOUND) def test_cursor_random_invisible_after(self): - uri = 'file:random' + uri = self.type self.session.create(uri, 'key_format=S,value_format=S') cursor = self.session.open_cursor(uri, None) @@ -175,12 +187,12 @@ class test_cursor_random_invisible(wttest.WiredTigerTestCase): # Open another session, the updates won't yet be visible, we should # return the only possible record. s = self.conn.open_session() - cursor = s.open_cursor(uri, None, "next_random=true") - cursor.next() + cursor = s.open_cursor(uri, None, self.config) + self.assertEquals(cursor.next(), 0) self.assertEqual(cursor.get_key(), key_populate(cursor, 1)) def test_cursor_random_invisible_before(self): - uri = 'file:random' + uri = self.type self.session.create(uri, 'key_format=S,value_format=S') cursor = self.session.open_cursor(uri, None) @@ -195,8 +207,8 @@ class test_cursor_random_invisible(wttest.WiredTigerTestCase): # Open another session, the updates won't yet be visible, we should # return the only possible record. s = self.conn.open_session() - cursor = s.open_cursor(uri, None, "next_random=true") - cursor.next() + cursor = s.open_cursor(uri, None, self.config) + self.assertEquals(cursor.next(), 0) self.assertEqual(cursor.get_key(), key_populate(cursor, 99)) diff --git a/test/suite/test_cursor_random02.py b/test/suite/test_cursor_random02.py new file mode 100644 index 00000000000..7c9e0e38cb9 --- /dev/null +++ b/test/suite/test_cursor_random02.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from helper import complex_populate, simple_populate +from helper import key_populate, value_populate +from wtscenario import check_scenarios, multiply_scenarios, number_scenarios + +# test_cursor_random02.py +# Cursor next_random operations +class test_cursor_random02(wttest.WiredTigerTestCase): + type = 'table:random' + config = [ + ('not-sample', dict(config='next_random=true')) + ] + records = [ + ('1', dict(records=1)), + ('250', dict(records=250)), + ('500', dict(records=500)), + ('5000', dict(records=5000)), + ('10000', dict(records=10000)), + ('50000', dict(records=50000)), + ] + scenarios = number_scenarios(multiply_scenarios('.', config, records)) + + # Check that next_random works in the presence of a larger set of values, + # where the values are in an insert list. 
+ def test_cursor_random_reasonable_distribution(self): + uri = self.type + num_entries = self.records + + # Set the leaf-page-max value, otherwise the page might split. + simple_populate(self, uri, + 'leaf_page_max=100MB,key_format=S', num_entries) + # Setup an array to track which keys are seen + visitedKeys = [0] * (num_entries + 1) + + cursor = self.session.open_cursor(uri, None, 'next_random=true') + for i in range(0, num_entries): + self.assertEqual(cursor.next(), 0) + current = cursor.get_key() + current = int(current) + visitedKeys[current] = visitedKeys[current] + 1 + + differentKeys = sum(x > 0 for x in visitedKeys) + + #print visitedKeys + #print differentKeys + ''' + self.tty('differentKeys: ' + str(differentKeys) + ' of ' + \ + str(num_entries) + ', ' + \ + str((int)((differentKeys * 100) / num_entries)) + '%') + ''' + + self.assertGreater(differentKeys, num_entries / 4, + 'next_random random distribution not adequate') + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py index 790f651fd2f..ac81e0729e5 100644 --- a/test/suite/test_jsondump02.py +++ b/test/suite/test_jsondump02.py @@ -209,7 +209,7 @@ class test_jsondump02(wttest.WiredTigerTestCase): self.check_json(self.table_uri3, ( ('"key0" : 1', '"value0" : "\\u0001\\u0002\\u0003"'), ('"key0" : 2', - '"value0" : "\\u0077\\u0088\\u0099\\u0000\\u00FF\\u00FE"'))) + '"value0" : "\\u0077\\u0088\\u0099\\u0000\\u00ff\\u00fe"'))) self.check_json(self.table_uri4, ( ('"ikey" : 1,\n"Skey" : "key1"', '"S1" : "val1",\n"i2" : 1,\n"S3" : "val1",\n"i4" : 1'), diff --git a/test/suite/test_txn08.py b/test/suite/test_txn08.py index d35a0c70b3b..8ee48104231 100644 --- a/test/suite/test_txn08.py +++ b/test/suite/test_txn08.py @@ -82,6 +82,11 @@ class test_txn08(wttest.WiredTigerTestCase, suite_subprocess): self.runWt(['printlog'], outfilename='printlog.out') self.check_file_contains('printlog.out', '\\u0001\\u0002abcd\\u0003\\u0004') + 
self.runWt(['printlog', '-x'], outfilename='printlog-hex.out') + self.check_file_contains('printlog-hex.out', + '\\u0001\\u0002abcd\\u0003\\u0004') + self.check_file_contains('printlog-hex.out', + '0102616263640304') if __name__ == '__main__': wttest.run() |