summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bench/wtperf/config.c125
-rw-r--r--bench/wtperf/runners/btree-split-stress.wtperf10
-rw-r--r--bench/wtperf/wtperf.c78
-rw-r--r--bench/wtperf/wtperf.h18
-rw-r--r--bench/wtperf/wtperf_opt.i1
-rw-r--r--build_posix/configure.ac.in2
-rw-r--r--build_win/filelist.win1
-rw-r--r--dist/api_data.py46
-rw-r--r--dist/filelist1
-rw-r--r--dist/flags.py1
-rw-r--r--dist/log.py97
-rwxr-xr-xdist/s_copyright14
-rwxr-xr-xdist/s_funcs2
-rwxr-xr-xdist/s_longlines7
-rw-r--r--dist/s_string.ok16
-rwxr-xr-xdist/s_win1
-rw-r--r--dist/stat.py16
-rw-r--r--dist/stat_data.py273
-rw-r--r--src/block/block_compact.c92
-rw-r--r--src/block/block_mgr.c16
-rw-r--r--src/block/block_open.c38
-rw-r--r--src/btree/bt_compact.c48
-rw-r--r--src/btree/bt_curnext.c113
-rw-r--r--src/btree/bt_curprev.c6
-rw-r--r--src/btree/bt_cursor.c108
-rw-r--r--src/btree/bt_debug.c20
-rw-r--r--src/btree/bt_handle.c4
-rw-r--r--src/btree/bt_huffman.c10
-rw-r--r--src/btree/bt_misc.c2
-rw-r--r--src/btree/bt_page.c8
-rw-r--r--src/btree/bt_read.c2
-rw-r--r--src/btree/bt_slvg.c2
-rw-r--r--src/btree/bt_split.c377
-rw-r--r--src/btree/bt_stat.c8
-rw-r--r--src/btree/bt_sync.c4
-rw-r--r--src/btree/bt_walk.c211
-rw-r--r--src/btree/col_srch.c117
-rw-r--r--src/btree/row_srch.c293
-rw-r--r--src/cache/cache_las.c9
-rw-r--r--src/config/config_def.c8
-rw-r--r--src/conn/conn_api.c3
-rw-r--r--src/conn/conn_dhandle.c4
-rw-r--r--src/cursor/cur_file.c14
-rw-r--r--src/cursor/cur_json.c13
-rw-r--r--src/cursor/cur_stat.c7
-rw-r--r--src/cursor/cur_std.c8
-rw-r--r--src/cursor/cur_table.c7
-rw-r--r--src/docs/command-line.dox41
-rw-r--r--src/docs/cursor-random.dox23
-rw-r--r--src/docs/license.dox21
-rw-r--r--src/docs/wtperf.dox2
-rw-r--r--src/evict/evict_file.c8
-rw-r--r--src/evict/evict_lru.c9
-rw-r--r--src/include/block.h7
-rw-r--r--src/include/btmem.h4
-rw-r--r--src/include/btree.i12
-rw-r--r--src/include/column.i22
-rw-r--r--src/include/connection.h1
-rw-r--r--src/include/cursor.h21
-rw-r--r--src/include/extern.h34
-rw-r--r--src/include/flags.h5
-rw-r--r--src/include/gcc.h52
-rw-r--r--src/include/log.h5
-rw-r--r--src/include/misc.h3
-rw-r--r--src/include/session.h7
-rw-r--r--src/include/stat.h4
-rw-r--r--src/include/wiredtiger.in40
-rw-r--r--src/log/log_auto.c96
-rw-r--r--src/lsm/lsm_stat.c31
-rw-r--r--src/meta/meta_turtle.c3
-rw-r--r--src/os_posix/os_map.c12
-rw-r--r--src/os_posix/os_pagesize.c19
-rw-r--r--src/os_win/os_pagesize.c23
-rw-r--r--src/reconcile/rec_write.c26
-rw-r--r--src/session/session_api.c2
-rw-r--r--src/session/session_compact.c8
-rw-r--r--src/support/global.c29
-rw-r--r--src/support/hash_city.c6
-rw-r--r--src/support/hex.c21
-rw-r--r--src/support/huffman.c26
-rw-r--r--src/support/rand.c23
-rw-r--r--src/support/stat.c36
-rw-r--r--src/txn/txn.c82
-rw-r--r--src/txn/txn_log.c25
-rw-r--r--src/utilities/util_main.c1
-rw-r--r--src/utilities/util_printlog.c15
-rw-r--r--test/format/ops.c2
-rw-r--r--test/suite/test_bulk01.py2
-rw-r--r--test/suite/test_colgap.py86
-rw-r--r--test/suite/test_compact01.py (renamed from test/suite/test_compact.py)0
-rw-r--r--test/suite/test_compact02.py152
-rw-r--r--test/suite/test_cursor_random.py106
-rw-r--r--test/suite/test_cursor_random02.py84
-rw-r--r--test/suite/test_jsondump02.py2
-rw-r--r--test/suite/test_txn08.py5
95 files changed, 2548 insertions, 957 deletions
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c
index 1238c25502c..808e85eedae 100644
--- a/bench/wtperf/config.c
+++ b/bench/wtperf/config.c
@@ -54,6 +54,7 @@ static void config_opt_usage(void);
int
config_assign(CONFIG *dest, const CONFIG *src)
{
+ CONFIG_QUEUE_ENTRY *conf_line, *tmp_line;
size_t i, len;
char *newstr, **pstr;
@@ -96,6 +97,18 @@ config_assign(CONFIG *dest, const CONFIG *src)
}
TAILQ_INIT(&dest->stone_head);
+ TAILQ_INIT(&dest->config_head);
+
+ /* Clone the config string information into the new cfg object */
+ TAILQ_FOREACH(conf_line, &src->config_head, c) {
+ len = strlen(conf_line->string);
+ if ((tmp_line = calloc(sizeof(CONFIG_QUEUE_ENTRY), 1)) == NULL)
+ return (enomem(src));
+ if ((tmp_line->string = calloc(len + 1, 1)) == NULL)
+ return (enomem(src));
+ strncpy(tmp_line->string, conf_line->string, len);
+ TAILQ_INSERT_TAIL(&dest->config_head, tmp_line, c);
+ }
return (0);
}
@@ -106,9 +119,17 @@ config_assign(CONFIG *dest, const CONFIG *src)
void
config_free(CONFIG *cfg)
{
+ CONFIG_QUEUE_ENTRY *config_line;
size_t i;
char **pstr;
+ while (!TAILQ_EMPTY(&cfg->config_head)) {
+ config_line = TAILQ_FIRST(&cfg->config_head);
+ TAILQ_REMOVE(&cfg->config_head, config_line, c);
+ free(config_line->string);
+ free(config_line);
+ }
+
for (i = 0; i < sizeof(config_opts) / sizeof(config_opts[0]); i++)
if (config_opts[i].type == STRING_TYPE ||
config_opts[i].type == CONFIG_STRING_TYPE) {
@@ -569,16 +590,34 @@ err: if (fd != -1)
int
config_opt_line(CONFIG *cfg, const char *optstr)
{
+ CONFIG_QUEUE_ENTRY *config_line;
WT_CONFIG_ITEM k, v;
WT_CONFIG_PARSER *scan;
+ size_t len;
int ret, t_ret;
+ char *string_copy;
+ len = strlen(optstr);
if ((ret = wiredtiger_config_parser_open(
- NULL, optstr, strlen(optstr), &scan)) != 0) {
+ NULL, optstr, len, &scan)) != 0) {
lprintf(cfg, ret, 0, "Error in config_scan_begin");
return (ret);
}
+ /*
+ * Append the current line to our copy of the config. The config is
+ * stored in the order it is processed, so added options will be after
+ * any parsed from the original config. We allocate len + 1 to allow for
+ * a null byte to be added.
+ */
+ if ((string_copy = calloc(len + 1, 1)) == NULL)
+ return (enomem(cfg));
+
+ strncpy(string_copy, optstr, len);
+ config_line = calloc(sizeof(CONFIG_QUEUE_ENTRY), 1);
+ config_line->string = string_copy;
+ TAILQ_INSERT_TAIL(&cfg->config_head, config_line, c);
+
while (ret == 0) {
if ((ret = scan->next(scan, &k, &v)) != 0) {
/* Any parse error has already been reported. */
@@ -653,6 +692,90 @@ config_sanity(CONFIG *cfg)
}
/*
+ * config_consolidate --
+ *	Consolidate repeated configuration settings so that each key only
+ *	appears once in the configuration output file. When a later queue
+ *	entry starts with the same "key=" prefix, the earlier entry is
+ *	unlinked and freed, so the most recent setting is the one kept.
+ *	Entries that contain no '=' have no key to compare and are left in
+ *	place.
+ */
+void
+config_consolidate(CONFIG *cfg)
+{
+	CONFIG_QUEUE_ENTRY *conf_line, *next_line, *test_line;
+	size_t key_len;
+	char *string_key;
+
+	/*
+	 * This loop iterates over the config queue and for each entry checks
+	 * if an entry later in the queue has the same key. If a match is found
+	 * then the current queue entry is removed and we continue.
+	 */
+	for (conf_line = TAILQ_FIRST(&cfg->config_head);
+	    conf_line != NULL; conf_line = next_line) {
+		/* Save the successor first, conf_line may be freed below. */
+		next_line = TAILQ_NEXT(conf_line, c);
+
+		/*
+		 * Without an '=' there is no key: skip the entry rather than
+		 * compute a length from a NULL pointer.
+		 */
+		if ((string_key = strchr(conf_line->string, '=')) == NULL)
+			continue;
+
+		/*
+		 * The + 1 here forces the '=' sign to be matched ensuring we
+		 * don't match keys that have a common prefix such as
+		 * "table_count" and "table_count_idle" as being the same key.
+		 */
+		key_len = (size_t)(string_key - conf_line->string) + 1;
+
+		for (test_line = next_line;
+		    test_line != NULL; test_line = TAILQ_NEXT(test_line, c)) {
+			if (strncmp(conf_line->string,
+			    test_line->string, key_len) != 0)
+				continue;
+			TAILQ_REMOVE(&cfg->config_head, conf_line, c);
+			free(conf_line->string);
+			free(conf_line);
+			break;
+		}
+	}
+}
+
+/*
+ * config_to_file --
+ *	Write the final config used in this execution to a file named
+ *	CONFIG.wtperf in the home directory. Repeated keys are consolidated
+ *	first, then each remaining queue entry is written on its own line.
+ *	Failures are reported through enomem/lprintf; nothing is returned.
+ */
+void
+config_to_file(CONFIG *cfg)
+{
+	CONFIG_QUEUE_ENTRY *config_line;
+	FILE *fp;
+	size_t req_len;
+	char *path;
+
+	fp = NULL;
+
+	/* Backup the config */
+	req_len = strlen(cfg->home) + 100;
+	if ((path = calloc(req_len, 1)) == NULL) {
+		(void)enomem(cfg);
+		goto err;
+	}
+
+	/* The bound passed to snprintf must be the size actually allocated. */
+	snprintf(path, req_len, "%s/CONFIG.wtperf", cfg->home);
+	if ((fp = fopen(path, "w")) == NULL) {
+		lprintf(cfg, errno, 0, "%s", path);
+		goto err;
+	}
+
+	/* Print the config dump */
+	fprintf(fp, "# Warning. This config includes "
+	    "unwritten, implicit configuration defaults.\n"
+	    "# Changes to those values may cause differences in behavior.\n");
+	config_consolidate(cfg);
+	TAILQ_FOREACH(config_line, &cfg->config_head, c)
+		fprintf(fp, "%s\n", config_line->string);
+
+	/*
+	 * Closing a write stream flushes it, so a failure here means the
+	 * file may be incomplete: report it. The path is still needed for
+	 * the message, free it last.
+	 */
+err:	if (fp != NULL && fclose(fp) != 0)
+		lprintf(cfg, errno, 0, "%s", path);
+	free(path);
+}
+
+/*
* config_print --
* Print out the configuration in verbose mode.
*/
diff --git a/bench/wtperf/runners/btree-split-stress.wtperf b/bench/wtperf/runners/btree-split-stress.wtperf
new file mode 100644
index 00000000000..deb8c70d12f
--- /dev/null
+++ b/bench/wtperf/runners/btree-split-stress.wtperf
@@ -0,0 +1,10 @@
+conn_config="cache_size=2GB,statistics=[fast,clear],statistics_log=(wait=10),eviction=(threads_max=4,threads_min=4)"
+table_config="type=file,leaf_page_max=8k,internal_page_max=8k,memory_page_max=2MB,split_deepen_min_child=250"
+icount=200000
+report_interval=5
+run_time=300
+reopen_connection=false
+populate_threads=2
+value_sz=256
+read_range=100
+threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1))
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 955f605c0b3..5386096d9b7 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -60,6 +60,7 @@ static const CONFIG default_cfg = {
0, /* total seconds running */
0, /* has truncate */
{NULL, NULL}, /* the truncate queue */
+ {NULL, NULL}, /* the config queue */
#define OPT_DEFINE_DEFAULT
#include "wtperf_opt.i"
@@ -371,6 +372,53 @@ err: cfg->error = cfg->stop = 1;
return (NULL);
}
+/*
+ * do_range_reads --
+ *	If configured to execute a sequence of next operations after each
+ *	search, do them, ensuring the keys we see are always in order.
+ *
+ *	Returns 0 when read_range is zero, when the scan completes, or when
+ *	the cursor's next method returns non-zero (the scan simply stops, the
+ *	error is not passed back); the get_key error if a key cannot be
+ *	retrieved; EINVAL if a key sorts before its predecessor.
+ */
+static int
+do_range_reads(CONFIG *cfg, WT_CURSOR *cursor)
+{
+	size_t range;
+	uint64_t next_val, prev_val;
+	char *range_key_buf;
+	int ret;
+
+	if (cfg->read_range == 0)
+		return (0);
+
+	range_key_buf = NULL;
+	next_val = 0;
+
+	/* Save where the first key is for comparisons. */
+	if ((ret = cursor->get_key(cursor, &range_key_buf)) != 0) {
+		lprintf(cfg, ret, 0, "get_key in range read.");
+		return (ret);
+	}
+	extract_key(range_key_buf, &next_val);
+
+	for (range = 0; range < cfg->read_range; ++range) {
+		prev_val = next_val;
+
+		/* We are done if we reach the end. */
+		if (cursor->next(cursor) != 0)
+			break;
+
+		/*
+		 * Retrieve and decode the key. Without a key there is
+		 * nothing valid to compare, so don't parse a stale buffer.
+		 */
+		if ((ret = cursor->get_key(cursor, &range_key_buf)) != 0) {
+			lprintf(cfg, ret, 0, "get_key in range read.");
+			return (ret);
+		}
+		extract_key(range_key_buf, &next_val);
+		if (next_val < prev_val) {
+			lprintf(cfg, EINVAL, 0,
+			    "Out of order keys %" PRIu64
+			    " came before %" PRIu64,
+			    prev_val, next_val);
+			return (EINVAL);
+		}
+	}
+	return (0);
+}
+
static void *
worker(void *arg)
{
@@ -381,8 +429,8 @@ worker(void *arg)
WT_CONNECTION *conn;
WT_CURSOR **cursors, *cursor, *tmp_cursor;
WT_SESSION *session;
- int64_t ops, ops_per_txn, throttle_ops;
size_t i;
+ int64_t ops, ops_per_txn, throttle_ops;
uint64_t next_val, usecs;
uint8_t *op, *op_end;
int measure_latency, ret, truncated;
@@ -533,7 +581,14 @@ worker(void *arg)
"get_value in read.");
goto err;
}
+ /*
+ * If we want to read a range, then call next
+ * for several operations, confirming that the
+ * next key is in the correct order.
+ */
+ ret = do_range_reads(cfg, cursor);
}
+
if (ret == 0 || ret == WT_NOTFOUND)
break;
goto op_err;
@@ -2103,6 +2158,8 @@ main(int argc, char *argv[])
if (config_assign(cfg, &default_cfg))
goto err;
+ TAILQ_INIT(&cfg->config_head);
+
/* Do a basic validation of options, and home is needed before open. */
while ((ch = __wt_getopt("wtperf", argc, argv, opts)) != EOF)
switch (ch) {
@@ -2308,6 +2365,9 @@ main(int argc, char *argv[])
if ((ret = config_sanity(cfg)) != 0)
goto err;
+ /* Write a copy of the config. */
+ config_to_file(cfg);
+
/* Display the configuration. */
if (cfg->verbose > 1)
config_print(cfg);
@@ -2333,7 +2393,7 @@ start_threads(CONFIG *cfg,
WORKLOAD *workp, CONFIG_THREAD *base, u_int num, void *(*func)(void *))
{
CONFIG_THREAD *thread;
- u_int i, j;
+ u_int i;
int ret;
/* Initialize the threads. */
@@ -2342,15 +2402,13 @@ start_threads(CONFIG *cfg,
thread->workload = workp;
/*
- * We don't want the threads executing in lock-step, move each
- * new RNG state further along in the sequence.
+ * We don't want the threads executing in lock-step, seed each
+ * one differently.
*/
- if (i == 0)
- __wt_random_init(&thread->rnd);
- else
- thread->rnd = (thread - 1)->rnd;
- for (j = 0; j < 1000; ++j)
- (void)__wt_random(&thread->rnd);
+ if ((ret = __wt_random_init_seed(NULL, &thread->rnd)) != 0) {
+ lprintf(cfg, ret, 0, "Error initializing RNG");
+ return (ret);
+ }
/*
* Every thread gets a key/data buffer because we don't bother
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index b26e978c13b..361b135ced7 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -127,6 +127,12 @@ struct __truncate_queue_entry {
};
typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY;
+/*
+ * One saved line of configuration text. Entries are linked on the
+ * config_head queue of a CONFIG in the order the lines were processed.
+ */
+struct __config_queue_entry {
+/* Heap-allocated, NUL-terminated copy of the line; freed with the entry. */
+ char *string;
+/* Queue linkage; "c" is the field name handed to the TAILQ_* macros. */
+ TAILQ_ENTRY(__config_queue_entry) c;
+};
+typedef struct __config_queue_entry CONFIG_QUEUE_ENTRY;
+
#define LOG_PARTIAL_CONFIG ",log=(enabled=false)"
/*
* NOTE: If you add any fields to this structure here, you must also add
@@ -181,6 +187,9 @@ struct __config { /* Configuration structure */
/* Queue head for use with the Truncate Logic */
TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
+ /* Queue head to save a copy of the config to be output */
+ TAILQ_HEAD(__config_qh, __config_queue_entry) config_head;
+
/* Fields changeable on command line are listed in wtperf_opt.i */
#define OPT_DECLARE_STRUCT
#include "wtperf_opt.i"
@@ -189,6 +198,7 @@ struct __config { /* Configuration structure */
#define ELEMENTS(a) (sizeof(a) / sizeof(a[0]))
+#define READ_RANGE_OPS 10
#define THROTTLE_OPS 100
#define THOUSAND (1000ULL)
@@ -271,6 +281,8 @@ void config_free(CONFIG *);
int config_opt_file(CONFIG *, const char *);
int config_opt_line(CONFIG *, const char *);
int config_opt_str(CONFIG *, const char *, const char *);
+void config_to_file(CONFIG *);
+void config_consolidate(CONFIG *);
void config_print(CONFIG *);
int config_sanity(CONFIG *);
void latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
@@ -305,4 +317,10 @@ generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno)
sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno);
}
+/*
+ * extract_key --
+ *	Parse the leading decimal number in key_buf into *keynop. If the
+ *	buffer does not start with a number, *keynop is set to 0 so callers
+ *	never read an unset value.
+ */
+static inline void
+extract_key(char *key_buf, uint64_t *keynop)
+{
+	/* A failed conversion leaves the target untouched: give it a value. */
+	if (sscanf(key_buf, "%" SCNu64, keynop) != 1)
+		*keynop = 0;
+}
+
#endif
diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i
index a9d3c033b74..3c122e4d186 100644
--- a/bench/wtperf/wtperf_opt.i
+++ b/bench/wtperf/wtperf_opt.i
@@ -140,6 +140,7 @@ DEF_OPT_AS_UINT32(random_range, 0,
"if non zero choose a value from within this range as the key for "
"insert operations")
DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value")
+DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search")
DEF_OPT_AS_BOOL(reopen_connection, 1,
"close and reopen the connection between populate and workload phases")
DEF_OPT_AS_UINT32(report_interval, 2,
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index de2f8963629..5949fb0509c 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -103,7 +103,7 @@ esac
# Linux requires buffers aligned to 4KB boundaries for O_DIRECT to work.
BUFFER_ALIGNMENT=0
-if test "$ac_cv_func_posix_memalign" = "yes" ; then
+if test "$ax_cv_func_posix_memalign_works" = "yes" ; then
case "$host_os" in
linux*) BUFFER_ALIGNMENT=4096 ;;
esac
diff --git a/build_win/filelist.win b/build_win/filelist.win
index af6ddf98da9..b845c45823e 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -121,6 +121,7 @@ src/os_win/os_map.c
src/os_win/os_mtx_cond.c
src/os_win/os_once.c
src/os_win/os_open.c
+src/os_win/os_pagesize.c
src/os_win/os_path.c
src/os_win/os_priv.c
src/os_win/os_remove.c
diff --git a/dist/api_data.py b/dist/api_data.py
index f58a48b4a0b..ff6d3f3ccb5 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -814,21 +814,19 @@ methods = {
'WT_SESSION.open_cursor' : Method(cursor_runtime_config + [
Config('bulk', 'false', r'''
- configure the cursor for bulk-loading, a fast, initial load
- path (see @ref tune_bulk_load for more information). Bulk-load
- may only be used for newly created objects and cursors
- configured for bulk-load only support the WT_CURSOR::insert
- and WT_CURSOR::close methods. When bulk-loading row-store
- objects, keys must be loaded in sorted order. The value is
- usually a true/false flag; when bulk-loading fixed-length
- column store objects, the special value \c bitmap allows
- chunks of a memory resident bitmap to be loaded directly into
- a file by passing a \c WT_ITEM to WT_CURSOR::set_value where
- the \c size field indicates the number of records in the
- bitmap (as specified by the object's \c value_format
- configuration). Bulk-loaded bitmap values must end on a byte
- boundary relative to the bit count (except for the last set
- of values loaded)'''),
+ configure the cursor for bulk-loading, a fast, initial load path
+ (see @ref tune_bulk_load for more information). Bulk-load may
+ only be used for newly created objects and applications should
+ use the WT_CURSOR::insert method to insert rows. When
+ bulk-loading, rows must be loaded in sorted order. The value
+ is usually a true/false flag; when bulk-loading fixed-length
+ column store objects, the special value \c bitmap allows chunks
+ of a memory resident bitmap to be loaded directly into a file
+ by passing a \c WT_ITEM to WT_CURSOR::set_value where the \c
+ size field indicates the number of records in the bitmap (as
+ specified by the object's \c value_format configuration).
+ Bulk-loaded bitmap values must end on a byte boundary relative
+ to the bit count (except for the last set of values loaded)'''),
Config('checkpoint', '', r'''
the name of a checkpoint to open (the reserved name
"WiredTigerCheckpoint" opens the most recent internal
@@ -843,12 +841,20 @@ methods = {
with the @ref util_dump and @ref util_load commands''',
choices=['hex', 'json', 'print']),
Config('next_random', 'false', r'''
- configure the cursor to return a pseudo-random record from
- the object; valid only for row-store cursors. Cursors
- configured with \c next_random=true only support the
- WT_CURSOR::next and WT_CURSOR::close methods. See @ref
- cursor_random for details''',
+ configure the cursor to return a pseudo-random record from the
+ object when the WT_CURSOR::next method is called; valid only for
+ row-store cursors. See @ref cursor_random for details''',
type='boolean'),
+ Config('next_random_sample_size', '0', r'''
+ cursors configured by \c next_random to return pseudo-random
+ records from the object randomly select from the entire object,
+ by default. Setting \c next_random_sample_size to a non-zero
+ value sets the number of samples the application expects to take
+ using the \c next_random cursor. A cursor configured with both
+ \c next_random and \c next_random_sample_size attempts to divide
+ the object into \c next_random_sample_size equal-sized pieces,
+ and each retrieval returns a record from one of those pieces. See
+ @ref cursor_random for details'''),
Config('raw', 'false', r'''
ignore the encodings for the key and value, manage data as if
the formats were \c "u". See @ref cursor_raw for details''',
diff --git a/dist/filelist b/dist/filelist
index 52af87c2a68..dde090e5a85 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -119,6 +119,7 @@ src/os_posix/os_mtx_cond.c
src/os_posix/os_mtx_rw.c
src/os_posix/os_once.c
src/os_posix/os_open.c
+src/os_posix/os_pagesize.c
src/os_posix/os_path.c
src/os_posix/os_priv.c
src/os_posix/os_remove.c
diff --git a/dist/flags.py b/dist/flags.py
index 1965dfb7dbe..7d237dd39a4 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -37,6 +37,7 @@ flags = {
'READ_NO_WAIT',
'READ_PREV',
'READ_SKIP_INTL',
+ 'READ_SKIP_LEAF',
'READ_TRUNCATE',
'READ_WONT_NEED',
],
diff --git a/dist/log.py b/dist/log.py
index feeb053db3e..6d35bf2e718 100644
--- a/dist/log.py
+++ b/dist/log.py
@@ -8,14 +8,15 @@ import log_data
tmp_file = '__tmp'
# Map log record types to:
-# (C type, pack type, printf format, printf arg(s), printf setup)
+# (C type, pack type, printf format, printf arg(s), list of setup functions)
field_types = {
- 'string' : ('const char *', 'S', '%s', 'arg', ''),
+ 'string' : ('const char *', 'S', '%s', 'arg', [ '' ]),
'item' : ('WT_ITEM *', 'u', '%s', 'escaped',
- 'WT_ERR(__logrec_jsonify_str(session, &escaped, &arg));'),
- 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', ''),
- 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', ''),
- 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', ''),
+ [ 'WT_ERR(__logrec_make_json_str(session, &escaped, &arg));',
+ 'WT_ERR(__logrec_make_hex_str(session, &escaped, &arg));']),
+ 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', [ '' ]),
+ 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', [ '' ]),
+ 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', [ '' ]),
}
def cintype(f):
@@ -38,15 +39,13 @@ def clocaltype(f):
return type
def escape_decl(fields):
- for f in fields:
- if 'escaped' in field_types[f[0]][4]:
- return '\n\tchar *escaped;'
- return ''
+ return '\n\tchar *escaped;' if has_escape(fields) else ''
def has_escape(fields):
for f in fields:
- if 'escaped' in field_types[f[0]][4]:
- return True
+ for setup in field_types[f[0]][4]:
+ if 'escaped' in setup:
+ return True
return False
def pack_fmt(fields):
@@ -65,10 +64,38 @@ def printf_arg(f):
arg = field_types[f[0]][3].replace('arg', f[1])
return ' ' + arg
-def printf_setup(f):
- stmt = field_types[f[0]][4].replace('arg', f[1])
- return '' if stmt == '' else stmt + '\n\t'
-
+# Return the i-th setup statement listed for field f's type, with 'arg'
+# replaced by the field's name and nl_indent appended; return '' when that
+# entry is the empty string (no setup needed).
+def printf_setup(f, i, nl_indent):
+ stmt = field_types[f[0]][4][i].replace('arg', f[1])
+ return '' if stmt == '' else stmt + nl_indent
+
+# Return how many setup statements are listed for field f's type.
+def n_setup(f):
+ return len(field_types[f[0]][4])
+
+# Create a printf line, with an optional setup function.
+# f is the field (f[0] is its type name, f[1] its name), optype is the
+# operation the field belongs to and i is the field's index in optype.fields.
+# ishex is the index of the setup statement to use; a value greater than
+# zero indicates that the field name in the output is modified
+# (to add "-hex"), and that the setup and printf are conditional
+# in the generated code.
+def printf_line(f, optype, i, ishex):
+ ifbegin = ''
+ ifend = ''
+ nl_indent = '\n\t'
+ name = f[1]
+ # Every field but the last is followed by a comma and newline in the output.
+ postcomma = '' if i + 1 == len(optype.fields) else ',\\n'
+ precomma = ''
+ # The hex variant is wrapped in an if on WT_TXN_PRINTLOG_HEX, so its body
+ # is indented one extra tab.
+ if ishex > 0:
+ name += '-hex'
+ ifend = nl_indent + '}'
+ nl_indent += '\t'
+ ifbegin = 'if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {' + nl_indent
+ # For the last field the separator is emitted before the hex entry
+ # instead of after it.
+ if postcomma == '':
+ precomma = ',\\n'
+ # Setup statement (if any) followed by the __wt_fprintf call, wrapped in
+ # WT_ERR when the operation has an escaped field and WT_RET otherwise.
+ body = '%s%s(__wt_fprintf(out,' % (
+ printf_setup(f, ishex, nl_indent),
+ 'WT_ERR' if has_escape(optype.fields) else 'WT_RET') + \
+ '%s "%s \\"%s\\": \\"%s\\"%s",%s));' % (
+ nl_indent, precomma, name, printf_fmt(f), postcomma,
+ printf_arg(f))
+ return ifbegin + body + ifend
#####################################################################
# Update log.h with #defines for types
@@ -176,7 +203,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src,
}
static int
-__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
{
\tsize_t needed;
@@ -185,6 +212,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
\t(void)__logrec_json_unpack_str(*destp, needed, item->data, item->size);
\treturn (0);
}
+
+static int
+__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+{
+\tsize_t needed;
+
+\tneeded = item->size * 2 + 1;
+\tWT_RET(__wt_realloc(session, NULL, needed, destp));
+\t__wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL);
+\treturn (0);
+}
''')
# Emit code to read, write and print log operations (within a log record)
@@ -255,11 +293,12 @@ __wt_logop_%(name)s_unpack(
tfile.write('''
int
__wt_logop_%(name)s_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
%(arg_ret)s\t%(arg_decls)s
-\t%(arg_init)sWT_RET(__wt_logop_%(name)s_unpack(
+\t%(arg_unused)s%(arg_init)sWT_RET(__wt_logop_%(name)s_unpack(
\t session, pp, end%(arg_addrs)s));
\tWT_RET(__wt_fprintf(out, " \\"optype\\": \\"%(name)s\\",\\n"));
@@ -272,27 +311,22 @@ __wt_logop_%(name)s_print(
'arg_decls' : ('\n\t'.join('%s%s%s;' %
(clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1])
for f in optype.fields)) + escape_decl(optype.fields),
+ 'arg_unused' : ('' if has_escape(optype.fields)
+ else 'WT_UNUSED(flags);\n\t'),
'arg_init' : ('escaped = NULL;\n\t' if has_escape(optype.fields) else ''),
'arg_fini' : ('\nerr:\t__wt_free(session, escaped);\n\treturn (ret);'
if has_escape(optype.fields) else '\treturn (0);'),
'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields),
- 'print_args' : '\n\t'.join(
- '%s%s(__wt_fprintf(out,\n\t " \\"%s\\": \\"%s\\",\\n",%s));' %
- (printf_setup(f),
- 'WT_ERR' if has_escape(optype.fields) else 'WT_RET',
- f[1], printf_fmt(f), printf_arg(f))
- for f in optype.fields[:-1]) + str(
- '\n\t%s%s(__wt_fprintf(out,\n\t " \\"%s\\": \\"%s\\"",%s));' %
- (printf_setup(last_field),
- 'WT_ERR' if has_escape(optype.fields) else 'WT_RET',
- last_field[1], printf_fmt(last_field), printf_arg(last_field))),
+ 'print_args' : '\n\t'.join(printf_line(f, optype, i, s)
+ for i,f in enumerate(optype.fields) for s in range(0, n_setup(f)))
})
# Emit the printlog entry point
tfile.write('''
int
__wt_txn_op_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
\tuint32_t optype, opsize;
@@ -308,7 +342,8 @@ for optype in log_data.optypes:
tfile.write('''
\tcase %(macro)s:
-\t\tWT_RET(%(print_func)s(session, pp, end, out));
+\t\tWT_RET(%(print_func)s(session, pp, end, out,
+\t\t flags));
\t\tbreak;
''' % {
'macro' : optype.macro_name(),
diff --git a/dist/s_copyright b/dist/s_copyright
index 020be6ae33d..0816274a367 100755
--- a/dist/s_copyright
+++ b/dist/s_copyright
@@ -6,6 +6,7 @@ c1=__wt.copyright.1
c2=__wt.copyright.2
c3=__wt.copyright.3
c4=__wt.copyright.4
+c5=__wt.copyright.5
check()
{
@@ -34,6 +35,9 @@ check()
if `sed -e 1,3p -e 4q -e d $1 | diff - dist/$c4 > /dev/null` ; then
return;
fi
+ if `sed -e 2,7p -e 8q -e d $1 | diff - dist/$c5 > /dev/null` ; then
+ return;
+ fi
echo "$1: copyright information is incorrect"
exit 1
@@ -81,6 +85,16 @@ cat > $c4 <<ENDOFTEXT
# This is free and unencumbered software released into the public domain.
ENDOFTEXT
+cat > $c5 <<ENDOFTEXT
+ * Copyright (c) 2014-$year MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ENDOFTEXT
+
# Search for files, skipping some well-known 3rd party directories.
(cd .. && find [a-z]* -name '*.[chi]' \
-o -name '*.cxx' \
diff --git a/dist/s_funcs b/dist/s_funcs
index 3769ccc4aa7..5fee03b5615 100755
--- a/dist/s_funcs
+++ b/dist/s_funcs
@@ -6,7 +6,7 @@ trap 'rm -f $t; exit 0' 0 1 2 3 13 15
# List of files to search.
l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
-l="$l `echo ../src/*/*.i ../src/utilities/*.c`"
+l="$l `echo ../src/*/*.i ../src/utilities/*.c ../bench/wtperf/*.c`"
(
# Copy out the functions we don't use, but it's OK.
diff --git a/dist/s_longlines b/dist/s_longlines
index 15ca5603385..decedb58f44 100755
--- a/dist/s_longlines
+++ b/dist/s_longlines
@@ -8,10 +8,11 @@ l=`(cd .. &&
find bench/wtperf examples ext src test -name '*.[chisy]' &&
find dist -name '*.py' &&
find src -name '*.in') |
- sed -e '/include\/extern\.h/d'\
- -e '/support\/stat\.c/d'`
+ sed -e '/dist\/stat_data\.py/d' \
+ -e '/support\/stat\.c/d' \
+ -e '/include\/extern\.h/d'`
for f in $l ; do
expand -t8 < ../$f | awk -- \
- "{if(length(\$0) > 80) printf(\"%s:%d\\n\", \"$f\", NR)}"
+ "{if(length(\$0) > 80) printf(\"%s:%d\\n\", \"$f\", NR)}"
done
diff --git a/dist/s_string.ok b/dist/s_string.ok
index b408888970b..27583402259 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -144,6 +144,7 @@ INIT
INITIALIZER
INMEM
INTL
+ISA
ITEMs
Inline
Intra
@@ -180,6 +181,7 @@ LevelDB
Levyx
Llqr
Llqrt
+LoadLoad
LockFile
Lookaside
Lookup
@@ -187,6 +189,7 @@ MALLOC
MEM
MEMALIGN
MERCHANTABILITY
+MONGODB
MSVC
MULTIBLOCK
MUTEX
@@ -283,10 +286,13 @@ Spinlock
Spinlocks
Split's
Stoica
+StoreLoad
+StoreStore
TAILQ
TCMalloc
TODO
TORTIOUS
+TSO
TXN
TXNC
Timespec
@@ -301,6 +307,7 @@ ULINE
URI
URIs
UTF
+UltraSparc
Unbuffered
UnixLib
Unmap
@@ -371,6 +378,7 @@ automake
bInheritHandle
basecfg
basho
+bcr
bdb
beginthreadex
bigram
@@ -412,6 +420,7 @@ bzip
calloc
cas
catfmt
+ccr
cd
centric
cfg
@@ -423,6 +432,7 @@ checkpointer
checkpointing
checksum
checksums
+children's
chk
chongo
cip
@@ -711,6 +721,7 @@ lookaside
lookup
lookups
lossy
+lr
lrt
lru
lseek
@@ -719,6 +730,7 @@ lsn
lsnappy
lt
lu
+lwsync
lz
lzo
madvise
@@ -726,6 +738,8 @@ majorp
malloc
marshall
marshalled
+mbll
+mbss
mem
memalign
membar
@@ -802,6 +816,7 @@ os
ovfl
ownp
packv
+pagesize
parens
pareto
parserp
@@ -1022,6 +1037,7 @@ variable's
vectorized
versa
vfprintf
+vm
vpack
vprintf
vrfy
diff --git a/dist/s_win b/dist/s_win
index cdfc71a8a1e..1eb4702d517 100755
--- a/dist/s_win
+++ b/dist/s_win
@@ -62,6 +62,7 @@ win_filelist()
-e 's;os_posix/os_mtx_cond.c;os_win/os_mtx_cond.c;' \
-e 's;os_posix/os_once.c;os_win/os_once.c;' \
-e 's;os_posix/os_open.c;os_win/os_open.c;' \
+ -e 's;os_posix/os_pagesize.c;os_win/os_pagesize.c;' \
-e 's;os_posix/os_path.c;os_win/os_path.c;' \
-e 's;os_posix/os_priv.c;os_win/os_priv.c;' \
-e 's;os_posix/os_remove.c;os_win/os_remove.c;' \
diff --git a/dist/stat.py b/dist/stat.py
index d62fda3fcb9..6dcfccfeab5 100644
--- a/dist/stat.py
+++ b/dist/stat.py
@@ -171,9 +171,7 @@ __wt_stat_''' + name + '''_aggregate_single(
{
''')
for l in sorted(list):
- if 'no_aggregate' in l.flags:
- o = '\tto->' + l.name + ' = from->' + l.name + ';\n'
- elif 'max_aggregate' in l.flags:
+ if 'max_aggregate' in l.flags:
o = '\tif (from->' + l.name + ' > to->' + l.name + ')\n' +\
'\t\tto->' + l.name + ' = from->' + l.name + ';\n'
else:
@@ -197,12 +195,12 @@ __wt_stat_''' + name + '''_aggregate(
f.write('\tint64_t v;\n\n')
break;
for l in sorted(list):
- if 'no_aggregate' in l.flags:
- o = '\tto->' + l.name + ' = from[0]->' + l.name + ';\n'
- elif 'max_aggregate' in l.flags:
- o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) >\n' +\
- '\t to->' + l.name + ')\n' +\
- '\t\tto->' + l.name + ' = v;\n'
+ if 'max_aggregate' in l.flags:
+ o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) > ' +\
+ 'to->' + l.name + ')\n'
+ if len(o) > 72: # Account for the leading tab.
+ o = o.replace(' > ', ' >\n\t ')
+ o +='\t\tto->' + l.name + ' = v;\n'
else:
o = '\tto->' + l.name + ' += WT_STAT_READ(from, ' + l.name + ');\n'
if len(o) > 72: # Account for the leading tab.
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 3a23071a3f2..41a93961079 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -8,20 +8,13 @@
# NOTE: All statistics descriptions must have a prefix string followed by ':'.
#
# Data-source statistics are normally aggregated across the set of underlying
-# objects. Additional optionaly configuration flags are available:
-# no_aggregate Ignore the value when aggregating statistics
+# objects. Additional optional configuration flags are available:
# max_aggregate Take the maximum value when aggregating statistics
-#
-# Optional configuration flags:
# no_clear Value not cleared when statistics cleared
# no_scale Don't scale value per second in the logging tool script
#
-# The no_clear flag is a little complicated: it means we don't clear the values
-# when resetting statistics after each run (necessary when the WiredTiger engine
-# is updating values that persist over multiple runs, for example the count of
-# cursors), but it also causes the underlying display routines to not treat the
-# change between displays as relative to the number of seconds, that is, it's an
-# absolute value. The no_clear flag should be set in either case.
+# The no_clear and no_scale flags are normally always set together (values that
+# are maintained over time are normally not scaled per second).
from operator import attrgetter
import sys
@@ -129,13 +122,11 @@ connection_stats = [
# Async API statistics
##########################################
AsyncStat('async_alloc_race', 'number of allocation state races'),
- AsyncStat('async_alloc_view',
- 'number of operation slots viewed for allocation'),
+ AsyncStat('async_alloc_view', 'number of operation slots viewed for allocation'),
AsyncStat('async_cur_queue', 'current work queue length'),
AsyncStat('async_flush', 'number of flush calls'),
AsyncStat('async_full', 'number of times operation allocation failed'),
- AsyncStat('async_max_queue',
- 'maximum work queue length', 'no_clear,no_scale'),
+ AsyncStat('async_max_queue', 'maximum work queue length', 'no_clear,no_scale'),
AsyncStat('async_nowork', 'number of times worker found no work'),
AsyncStat('async_op_alloc', 'total allocations'),
AsyncStat('async_op_compact', 'total compact calls'),
@@ -158,89 +149,59 @@ connection_stats = [
##########################################
# Cache and eviction statistics
##########################################
- CacheStat('cache_bytes_dirty',
- 'tracked dirty bytes in the cache', 'no_clear,no_scale'),
- CacheStat('cache_bytes_internal',
- 'tracked bytes belonging to internal pages in the cache',
- 'no_clear,no_scale'),
- CacheStat('cache_bytes_inuse',
- 'bytes currently in the cache', 'no_clear,no_scale'),
- CacheStat('cache_bytes_leaf',
- 'tracked bytes belonging to leaf pages in the cache',
- 'no_clear,no_scale'),
- CacheStat('cache_bytes_max',
- 'maximum bytes configured', 'no_clear,no_scale'),
- CacheStat('cache_bytes_overflow',
- 'tracked bytes belonging to overflow pages in the cache',
- 'no_clear,no_scale'),
+ CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_bytes_max', 'maximum bytes configured', 'no_clear,no_scale'),
+ CacheStat('cache_bytes_overflow', 'tracked bytes belonging to overflow pages in the cache', 'no_clear,no_scale'),
CacheStat('cache_bytes_read', 'bytes read into cache'),
CacheStat('cache_bytes_write', 'bytes written from cache'),
CacheStat('cache_eviction_app', 'pages evicted by application threads'),
CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
CacheStat('cache_eviction_clean', 'unmodified pages evicted'),
- CacheStat('cache_eviction_deepen',
- 'page split during eviction deepened the tree'),
+ CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'),
CacheStat('cache_eviction_dirty', 'modified pages evicted'),
- CacheStat('cache_eviction_fail',
- 'pages selected for eviction unable to be evicted'),
- CacheStat('cache_eviction_force',
- 'pages evicted because they exceeded the in-memory maximum'),
- CacheStat('cache_eviction_force_delete',
- 'pages evicted because they had chains of deleted items'),
- CacheStat('cache_eviction_force_fail',
- 'failed eviction of pages that exceeded the in-memory maximum'),
+ CacheStat('cache_eviction_fail', 'pages selected for eviction unable to be evicted'),
+ CacheStat('cache_eviction_force', 'pages evicted because they exceeded the in-memory maximum'),
+ CacheStat('cache_eviction_force_delete', 'pages evicted because they had chains of deleted items'),
+ CacheStat('cache_eviction_force_fail', 'failed eviction of pages that exceeded the in-memory maximum'),
CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'),
CacheStat('cache_eviction_internal', 'internal pages evicted'),
- CacheStat('cache_eviction_maximum_page_size',
- 'maximum page size at eviction', 'no_clear,no_scale'),
- CacheStat('cache_eviction_queue_empty',
- 'eviction server candidate queue empty when topping up'),
- CacheStat('cache_eviction_queue_not_empty',
- 'eviction server candidate queue not empty when topping up'),
- CacheStat('cache_eviction_server_evicting',
- 'eviction server evicting pages'),
- CacheStat('cache_eviction_server_not_evicting',
- 'eviction server populating queue, but not evicting pages'),
- CacheStat('cache_eviction_slow',
- 'eviction server unable to reach eviction goal'),
- CacheStat('cache_eviction_split_internal',
- 'internal pages split during eviction'),
+ CacheStat('cache_eviction_maximum_page_size', 'maximum page size at eviction', 'no_clear,no_scale'),
+ CacheStat('cache_eviction_queue_empty', 'eviction server candidate queue empty when topping up'),
+ CacheStat('cache_eviction_queue_not_empty', 'eviction server candidate queue not empty when topping up'),
+ CacheStat('cache_eviction_server_evicting', 'eviction server evicting pages'),
+ CacheStat('cache_eviction_server_not_evicting', 'eviction server populating queue, but not evicting pages'),
+ CacheStat('cache_eviction_slow', 'eviction server unable to reach eviction goal'),
+ CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'),
CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_eviction_walk', 'pages walked for eviction'),
- CacheStat('cache_eviction_worker_evicting',
- 'eviction worker thread evicting pages'),
+ CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
- CacheStat('cache_inmem_splittable',
- 'in-memory page passed criteria to be split'),
+ CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'),
CacheStat('cache_lookaside_insert', 'lookaside table insert calls'),
CacheStat('cache_lookaside_remove', 'lookaside table remove calls'),
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
- CacheStat('cache_pages_dirty',
- 'tracked dirty pages in the cache', 'no_clear,no_scale'),
- CacheStat('cache_pages_inuse',
- 'pages currently held in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'),
CacheStat('cache_read', 'pages read into cache'),
- CacheStat('cache_read_lookaside',
- 'pages read into cache requiring lookaside entries'),
+ CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'),
CacheStat('cache_write', 'pages written from cache'),
- CacheStat('cache_write_lookaside',
- 'page written requiring lookaside records'),
- CacheStat('cache_write_restore',
- 'pages written requiring in-memory restoration'),
+ CacheStat('cache_write_lookaside', 'page written requiring lookaside records'),
+ CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'),
##########################################
# Dhandle statistics
##########################################
- DhandleStat('dh_conn_handle_count',
- 'connection data handles currently active', 'no_clear,no_scale'),
+ DhandleStat('dh_conn_handle_count', 'connection data handles currently active', 'no_clear,no_scale'),
+ DhandleStat('dh_session_handles', 'session dhandles swept'),
+ DhandleStat('dh_session_sweeps', 'session sweep attempts'),
DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'),
- DhandleStat('dh_sweep_remove',
- 'connection sweep dhandles removed from hash list'),
DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'),
+ DhandleStat('dh_sweep_remove', 'connection sweep dhandles removed from hash list'),
DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'),
DhandleStat('dh_sweeps', 'connection sweeps'),
- DhandleStat('dh_session_handles', 'session dhandles swept'),
- DhandleStat('dh_session_sweeps', 'session sweep attempts'),
##########################################
# Logging statistics
@@ -257,10 +218,8 @@ connection_stats = [
LogStat('log_flush', 'log flush operations'),
LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'),
LogStat('log_prealloc_files', 'pre-allocated log files prepared'),
- LogStat('log_prealloc_max',
- 'number of pre-allocated log files to create', 'no_clear,no_scale'),
- LogStat('log_prealloc_missed',
- 'pre-allocated log files not ready and missed'),
+ LogStat('log_prealloc_max', 'number of pre-allocated log files to create', 'no_clear,no_scale'),
+ LogStat('log_prealloc_missed', 'pre-allocated log files not ready and missed'),
LogStat('log_prealloc_used', 'pre-allocated log files used'),
LogStat('log_release_write_lsn', 'log release advances write LSN'),
LogStat('log_scan_records', 'records processed by log scan'),
@@ -283,46 +242,32 @@ connection_stats = [
##########################################
# Reconciliation statistics
##########################################
- RecStat('rec_pages', 'page reconciliation calls'),
RecStat('rec_page_delete', 'pages deleted'),
RecStat('rec_page_delete_fast', 'fast-path pages deleted'),
+ RecStat('rec_pages', 'page reconciliation calls'),
RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
- RecStat('rec_split_stashed_bytes',
- 'split bytes currently awaiting free', 'no_clear,no_scale'),
- RecStat('rec_split_stashed_objects',
- 'split objects currently awaiting free', 'no_clear,no_scale'),
+ RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale'),
+ RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'),
##########################################
# Transaction statistics
##########################################
TxnStat('txn_begin', 'transaction begins'),
TxnStat('txn_checkpoint', 'transaction checkpoints'),
- TxnStat('txn_checkpoint_generation',
- 'transaction checkpoint generation', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_running',
- 'transaction checkpoint currently running', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_time_max',
- 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_time_min',
- 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_time_recent',
- 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_time_total',
- 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_time_min', 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_time_recent', 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_commit', 'transactions committed'),
- TxnStat('txn_fail_cache',
- 'transaction failures due to cache overflow'),
- TxnStat('txn_pinned_checkpoint_range',
- 'transaction range of IDs currently pinned by a checkpoint',
- 'no_clear,no_scale'),
- TxnStat('txn_pinned_range',
- 'transaction range of IDs currently pinned', 'no_clear,no_scale'),
- TxnStat('txn_pinned_snapshot_range',
- 'transaction range of IDs currently pinned by named snapshots',
- 'no_clear,no_scale'),
+ TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'),
+ TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'),
+ TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'),
+ TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'),
+ TxnStat('txn_rollback', 'transactions rolled back'),
TxnStat('txn_snapshots_created', 'number of named snapshots created'),
TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'),
- TxnStat('txn_rollback', 'transactions rolled back'),
TxnStat('txn_sync', 'transaction sync calls'),
##########################################
@@ -331,23 +276,18 @@ connection_stats = [
LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'),
LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'),
LSMStat('lsm_rows_merged', 'rows merged in an LSM tree'),
- LSMStat('lsm_work_queue_app',
- 'application work units currently queued', 'no_clear,no_scale'),
- LSMStat('lsm_work_queue_manager',
- 'merge work units currently queued', 'no_clear,no_scale'),
+ LSMStat('lsm_work_queue_app', 'application work units currently queued', 'no_clear,no_scale'),
+ LSMStat('lsm_work_queue_manager', 'merge work units currently queued', 'no_clear,no_scale'),
LSMStat('lsm_work_queue_max', 'tree queue hit maximum'),
- LSMStat('lsm_work_queue_switch',
- 'switch work units currently queued', 'no_clear,no_scale'),
+ LSMStat('lsm_work_queue_switch', 'switch work units currently queued', 'no_clear,no_scale'),
LSMStat('lsm_work_units_created', 'tree maintenance operations scheduled'),
- LSMStat('lsm_work_units_discarded',
- 'tree maintenance operations discarded'),
+ LSMStat('lsm_work_units_discarded', 'tree maintenance operations discarded'),
LSMStat('lsm_work_units_done', 'tree maintenance operations executed'),
##########################################
# Session operations
##########################################
- SessionStat('session_cursor_open',
- 'open cursor count', 'no_clear,no_scale'),
+ SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'),
SessionStat('session_open', 'open session count', 'no_clear,no_scale'),
##########################################
@@ -385,8 +325,7 @@ dsrc_stats = [
# Session operations
##########################################
SessionStat('session_compact', 'object compaction'),
- SessionStat('session_cursor_open',
- 'open cursor count', 'no_clear,no_scale'),
+ SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'),
##########################################
# Cursor operations
@@ -394,8 +333,7 @@ dsrc_stats = [
CursorStat('cursor_create', 'create calls'),
CursorStat('cursor_insert', 'insert calls'),
CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'),
- CursorStat('cursor_insert_bytes',
- 'cursor-insert key and value bytes inserted'),
+ CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted'),
CursorStat('cursor_next', 'next calls'),
CursorStat('cursor_prev', 'prev calls'),
CursorStat('cursor_remove', 'remove calls'),
@@ -411,33 +349,21 @@ dsrc_stats = [
##########################################
# Btree statistics
##########################################
- BtreeStat('btree_checkpoint_generation',
- 'btree checkpoint generation', 'no_clear,no_scale'),
- BtreeStat('btree_column_deleted',
- 'column-store variable-size deleted values', 'no_scale'),
- BtreeStat('btree_column_fix',
- 'column-store fixed-size leaf pages', 'no_scale'),
- BtreeStat('btree_column_internal',
- 'column-store internal pages', 'no_scale'),
- BtreeStat('btree_column_rle',
- 'column-store variable-size RLE encoded values', 'no_scale'),
- BtreeStat('btree_column_variable',
- 'column-store variable-size leaf pages', 'no_scale'),
+ BtreeStat('btree_checkpoint_generation', 'btree checkpoint generation', 'no_clear,no_scale'),
+ BtreeStat('btree_column_deleted', 'column-store variable-size deleted values', 'no_scale'),
+ BtreeStat('btree_column_fix', 'column-store fixed-size leaf pages', 'no_scale'),
+ BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale'),
+ BtreeStat('btree_column_rle', 'column-store variable-size RLE encoded values', 'no_scale'),
+ BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale'),
BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'),
BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale'),
- BtreeStat('btree_fixed_len', 'fixed-record size', 'no_aggregate,no_scale'),
- BtreeStat('btree_maximum_depth',
- 'maximum tree depth', 'max_aggregate,no_scale'),
- BtreeStat('btree_maxintlkey',
- 'maximum internal page key size', 'max_aggregate,no_scale'),
- BtreeStat('btree_maxintlpage',
- 'maximum internal page size', 'max_aggregate,no_scale'),
- BtreeStat('btree_maxleafkey',
- 'maximum leaf page key size', 'max_aggregate,no_scale'),
- BtreeStat('btree_maxleafpage',
- 'maximum leaf page size', 'max_aggregate,no_scale'),
- BtreeStat('btree_maxleafvalue',
- 'maximum leaf page value size', 'max_aggregate,no_scale'),
+ BtreeStat('btree_fixed_len', 'fixed-record size', 'max_aggregate,no_scale'),
+ BtreeStat('btree_maximum_depth', 'maximum tree depth', 'max_aggregate,no_scale'),
+ BtreeStat('btree_maxintlkey', 'maximum internal page key size', 'max_aggregate,no_scale'),
+ BtreeStat('btree_maxintlpage', 'maximum internal page size', 'max_aggregate,no_scale'),
+ BtreeStat('btree_maxleafkey', 'maximum leaf page key size', 'max_aggregate,no_scale'),
+ BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'max_aggregate,no_scale'),
+ BtreeStat('btree_maxleafvalue', 'maximum leaf page value size', 'max_aggregate,no_scale'),
BtreeStat('btree_overflow', 'overflow pages', 'no_scale'),
BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale'),
BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale'),
@@ -454,26 +380,21 @@ dsrc_stats = [
LSMStat('bloom_size', 'total size of bloom filters', 'no_scale'),
LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'),
LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'),
- LSMStat('lsm_generation_max',
- 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'),
- LSMStat('lsm_lookup_no_bloom',
- 'queries that could have benefited ' +
- 'from a Bloom filter that did not exist'),
+ LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'),
+ LSMStat('lsm_lookup_no_bloom', 'queries that could have benefited from a Bloom filter that did not exist'),
LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'),
##########################################
# Block manager statistics
##########################################
- BlockStat('allocation_size',
- 'file allocation unit size', 'no_aggregate,no_scale'),
+ BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale'),
BlockStat('block_alloc', 'blocks allocated'),
BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'),
BlockStat('block_extension', 'allocations requiring file extension'),
BlockStat('block_free', 'blocks freed'),
- BlockStat('block_magic', 'file magic number', 'no_aggregate,no_scale'),
- BlockStat('block_major',
- 'file major version number', 'no_aggregate,no_scale'),
- BlockStat('block_minor', 'minor version number', 'no_aggregate,no_scale'),
+ BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'),
+ BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'),
+ BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'),
BlockStat('block_reuse_bytes', 'file bytes available for reuse'),
BlockStat('block_size', 'file size in bytes', 'no_scale'),
@@ -484,44 +405,33 @@ dsrc_stats = [
CacheStat('cache_bytes_write', 'bytes written from cache'),
CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
CacheStat('cache_eviction_clean', 'unmodified pages evicted'),
- CacheStat('cache_eviction_deepen',
- 'page split during eviction deepened the tree'),
+ CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'),
CacheStat('cache_eviction_dirty', 'modified pages evicted'),
- CacheStat('cache_eviction_fail',
- 'data source pages selected for eviction unable to be evicted'),
+ CacheStat('cache_eviction_fail', 'data source pages selected for eviction unable to be evicted'),
CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'),
CacheStat('cache_eviction_internal', 'internal pages evicted'),
- CacheStat('cache_eviction_split_internal',
- 'internal pages split during eviction'),
+ CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'),
CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
- CacheStat('cache_inmem_splittable',
- 'in-memory page passed criteria to be split'),
- CacheStat('cache_overflow_value',
- 'overflow values cached in memory', 'no_scale'),
+ CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'),
+ CacheStat('cache_overflow_value', 'overflow values cached in memory', 'no_scale'),
CacheStat('cache_read', 'pages read into cache'),
- CacheStat('cache_read_lookaside',
- 'pages read into cache requiring lookaside entries'),
+ CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'),
CacheStat('cache_read_overflow', 'overflow pages read into cache'),
CacheStat('cache_write', 'pages written from cache'),
- CacheStat('cache_write_lookaside',
- 'page written requiring lookaside records'),
- CacheStat('cache_write_restore',
- 'pages written requiring in-memory restoration'),
+ CacheStat('cache_write_lookaside', 'page written requiring lookaside records'),
+ CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'),
##########################################
# Compression statistics
##########################################
- CompressStat('compress_raw_fail',
- 'raw compression call failed, no additional data available'),
- CompressStat('compress_raw_fail_temporary',
- 'raw compression call failed, additional data available'),
+ CompressStat('compress_raw_fail', 'raw compression call failed, no additional data available'),
+ CompressStat('compress_raw_fail_temporary', 'raw compression call failed, additional data available'),
CompressStat('compress_raw_ok', 'raw compression call succeeded'),
CompressStat('compress_read', 'compressed pages read'),
CompressStat('compress_write', 'compressed pages written'),
CompressStat('compress_write_fail', 'page written failed to compress'),
- CompressStat('compress_write_too_small',
- 'page written was too small to compress'),
+ CompressStat('compress_write_too_small', 'page written was too small to compress'),
##########################################
# Reconciliation statistics
@@ -529,8 +439,7 @@ dsrc_stats = [
RecStat('rec_dictionary', 'dictionary matches'),
RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
- RecStat('rec_multiblock_max',
- 'maximum blocks required for a page', 'max_aggregate,no_scale'),
+ RecStat('rec_multiblock_max', 'maximum blocks required for a page', 'max_aggregate,no_scale'),
RecStat('rec_overflow_key_internal', 'internal-page overflow keys'),
RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'),
RecStat('rec_overflow_value', 'overflow values written'),
@@ -539,10 +448,8 @@ dsrc_stats = [
RecStat('rec_page_match', 'page checksum matches'),
RecStat('rec_pages', 'page reconciliation calls'),
RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
- RecStat('rec_prefix_compression',
- 'leaf page key bytes discarded using prefix compression'),
- RecStat('rec_suffix_compression',
- 'internal page key bytes discarded using suffix compression'),
+ RecStat('rec_prefix_compression', 'leaf page key bytes discarded using prefix compression'),
+ RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression'),
##########################################
# Transaction statistics
diff --git a/src/block/block_compact.c b/src/block/block_compact.c
index d45d0a96da7..cd304b848d4 100644
--- a/src/block/block_compact.c
+++ b/src/block/block_compact.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool);
/*
* __wt_block_compact_start --
@@ -22,8 +22,6 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
/* Switch to first-fit allocation. */
__wt_block_configure_first_fit(block, true);
- block->compact_pct_tenths = 0;
-
return (0);
}
@@ -34,14 +32,21 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
int
__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
+ WT_DECL_RET;
+
WT_UNUSED(session);
/* Restore the original allocation plan. */
__wt_block_configure_first_fit(block, false);
- block->compact_pct_tenths = 0;
+ /* Dump the results of the compaction pass. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __block_dump_avail(session, block, false);
+ __wt_spin_unlock(session, &block->live_lock);
+ }
- return (0);
+ return (ret);
}
/*
@@ -70,12 +75,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
if (fh->size <= WT_MEGABYTE)
return (0);
+ /*
+ * Reset the compaction state information. This is done here, not in the
+ * compaction "start" routine, because this function is called first to
+ * determine if compaction is useful.
+ */
+ block->compact_pct_tenths = 0;
+ block->compact_pages_reviewed = 0;
+ block->compact_pages_skipped = 0;
+ block->compact_pages_written = 0;
+
__wt_spin_lock(session, &block->live_lock);
+ /* Dump the current state of the file. */
if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
- WT_ERR(__block_dump_avail(session, block));
+ WT_ERR(__block_dump_avail(session, block, true));
- /* Sum the available bytes in the first 80% and 90% of the file. */
+ /* Sum the available bytes in the initial 80% and 90% of the file. */
avail_eighty = avail_ninety = 0;
ninety = fh->size - fh->size / 10;
eighty = fh->size - ((fh->size / 10) * 2);
@@ -88,23 +104,6 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
avail_eighty += ext->size;
}
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
- "80%% of the file",
- block->name,
- (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty));
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
- "90%% of the file",
- block->name,
- (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety));
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first "
- "90%% of the file to perform compaction, compaction %s",
- block->name,
- (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
- *skipp ? "skipped" : "proceeding"));
-
/*
* Skip files where we can't recover at least 1MB.
*
@@ -127,6 +126,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
block->compact_pct_tenths = 1;
}
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "80%% of the file",
+ block->name,
+ (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty));
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "90%% of the file",
+ block->name,
+ (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety));
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first "
+ "90%% of the file to perform compaction, compaction %s",
+ block->name,
+ (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+ *skipp ? "skipped" : "proceeding"));
+
err: __wt_spin_unlock(session, &block->live_lock);
return (ret);
@@ -177,6 +193,14 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
}
__wt_spin_unlock(session, &block->live_lock);
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ ++block->compact_pages_reviewed;
+ if (*skipp)
+ ++block->compact_pages_skipped;
+ else
+ ++block->compact_pages_written;
+ }
+
return (ret);
}
@@ -185,7 +209,7 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
* Dump out the avail list so we can see what compaction will look like.
*/
static int
-__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start)
{
WT_EXTLIST *el;
WT_EXT *ext;
@@ -196,6 +220,20 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
size = block->fh->size;
WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "============ %s",
+ start ? "testing for compaction" : "ending compaction pass"));
+
+ if (!start) {
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages reviewed: %" PRIuMAX,
+ block->compact_pages_reviewed));
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages skipped: %" PRIuMAX, block->compact_pages_skipped));
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages written: %" PRIuMAX, block->compact_pages_written));
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
"file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX
"%% space available %" PRIuMAX "MB (%" PRIuMAX ")",
(uintmax_t)size / WT_MEGABYTE, (uintmax_t)size,
@@ -219,6 +257,10 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
#ifdef __VERBOSE_OUTPUT_PERCENTILE
+ /*
+ * The verbose output always displays 10% buckets; running this code
+ * also displays 1% buckets.
+ */
for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
v = percentile[i] * 512;
WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 7260cab75d9..f9f66e05d7f 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -221,6 +221,18 @@ __bm_free(WT_BM *bm,
}
/*
+ * __bm_is_mapped --
+ * Return whether the file is mapped into memory.
+ */
+static bool
+__bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_UNUSED(session);
+
+ return (bm->map == NULL ? false : true);
+}
+
+/*
* __bm_stat --
* Block-manager statistics.
*/
@@ -357,6 +369,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
(int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->free = (int (*)(WT_BM *,
WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
+ bm->is_mapped = __bm_is_mapped;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = (int (*)
@@ -367,6 +380,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->salvage_valid = (int (*)(WT_BM *,
WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly;
+ bm->size = __wt_block_manager_size;
bm->stat = __bm_stat;
bm->sync =
(int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly;
@@ -391,12 +405,14 @@ __bm_method_set(WT_BM *bm, bool readonly)
bm->compact_skip = __bm_compact_skip;
bm->compact_start = __bm_compact_start;
bm->free = __bm_free;
+ bm->is_mapped = __bm_is_mapped;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = __bm_salvage_end;
bm->salvage_next = __bm_salvage_next;
bm->salvage_start = __bm_salvage_start;
bm->salvage_valid = __bm_salvage_valid;
+ bm->size = __wt_block_manager_size;
bm->stat = __bm_stat;
bm->sync = __bm_sync;
bm->verify_addr = __bm_verify_addr;
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 7cf12d36066..ff70b765d1f 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -405,27 +405,37 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
* Reading from the live system's structure normally requires locking,
* but it's an 8B statistics read, there's no need.
*/
- stats->allocation_size = block->allocsize;
- stats->block_checkpoint_size = (int64_t)block->live.ckpt_size;
- stats->block_magic = WT_BLOCK_MAGIC;
- stats->block_major = WT_BLOCK_MAJOR_VERSION;
- stats->block_minor = WT_BLOCK_MINOR_VERSION;
- stats->block_reuse_bytes = (int64_t)block->live.avail.bytes;
- stats->block_size = block->fh->size;
+ WT_STAT_WRITE(stats, allocation_size, block->allocsize);
+ WT_STAT_WRITE(
+ stats, block_checkpoint_size, (int64_t)block->live.ckpt_size);
+ WT_STAT_WRITE(stats, block_magic, WT_BLOCK_MAGIC);
+ WT_STAT_WRITE(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+ WT_STAT_WRITE(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+ WT_STAT_WRITE(
+ stats, block_reuse_bytes, (int64_t)block->live.avail.bytes);
+ WT_STAT_WRITE(stats, block_size, block->fh->size);
}
/*
* __wt_block_manager_size --
- * Set the size statistic for a file.
+ * Return the size of a live block handle.
*/
int
-__wt_block_manager_size(
- WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats)
+__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep)
{
- wt_off_t filesize;
-
- WT_RET(__wt_filesize_name(session, filename, false, &filesize));
- stats->block_size = filesize;
+ WT_UNUSED(session);
+ *sizep = bm->block->fh == NULL ? 0 : bm->block->fh->size;
return (0);
}
+
+/*
+ * __wt_block_manager_named_size --
+ * Return the size of a named file.
+ */
+int
+__wt_block_manager_named_size(
+ WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
+{
+ return (__wt_filesize_name(session, name, false, sizep));
+}
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index b2c9e4b67f8..8935d39b696 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -17,9 +17,11 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
WT_BM *bm;
WT_DECL_RET;
+ WT_MULTI *multi;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
size_t addr_size;
+ uint32_t i;
const uint8_t *addr;
*skipp = true; /* Default skip. */
@@ -41,29 +43,46 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/*
* If the page is clean, test the original addresses.
- * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * If the page is a replacement, test the replacement addresses.
* Ignore empty pages, they get merged into the parent.
*/
if (mod == NULL || mod->rec_result == 0) {
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL)
return (0);
- WT_RET(
+ return (
bm->compact_page_skip(bm, session, addr, addr_size, skipp));
- } else if (mod->rec_result == WT_PM_REC_REPLACE) {
- /*
- * The page's modification information can change underfoot if
- * the page is being reconciled, serialize with reconciliation.
- */
+ }
+
+ /*
+ * The page's modification information can change underfoot if the page
+ * is being reconciled, serialize with reconciliation.
+ */
+ if (mod->rec_result == WT_PM_REC_REPLACE ||
+ mod->rec_result == WT_PM_REC_MULTIBLOCK)
WT_RET(__wt_fair_lock(session, &page->page_lock));
+ if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ if (multi->disk_image != NULL)
+ continue;
+ if ((ret = bm->compact_page_skip(bm, session,
+ multi->addr.addr, multi->addr.size, skipp)) != 0)
+ break;
+ if (!*skipp)
+ break;
+ }
+
+ if (mod->rec_result == WT_PM_REC_REPLACE ||
+ mod->rec_result == WT_PM_REC_MULTIBLOCK)
WT_TRET(__wt_fair_unlock(session, &page->page_lock));
- WT_RET(ret);
- }
- return (0);
+
+ return (ret);
}
/*
@@ -130,7 +149,7 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
* read, set its generation to a low value so it is evicted
* quickly.
*/
- WT_ERR(__wt_tree_walk(session, &ref, NULL,
+ WT_ERR(__wt_tree_walk(session, &ref,
WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
if (ref == NULL)
break;
@@ -139,7 +158,8 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
if (skip)
continue;
- session->compaction = true;
+ session->compact_state = WT_COMPACT_SUCCESS;
+
/* Rewrite the page: mark the page and tree dirty. */
WT_ERR(__wt_page_modify_init(session, ref->page));
__wt_page_modify_set(session, ref->page);
@@ -182,7 +202,7 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* address, the page isn't on disk, but we have to read internal pages
* to walk the tree regardless; throw up our hands and read it.
*/
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
+ __wt_ref_info(ref, &addr, &addr_size, &type);
if (addr == NULL)
return (0);
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index 3c96bad39d7..6573bc60165 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -389,6 +389,14 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
*/
cbt->page_deleted_count = 0;
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * If starting a new iteration, clear the last key returned; it doesn't
+ * apply.
+ */
+ cbt->lastkey->size = 0;
+ cbt->lastrecno = WT_RECNO_OOB;
+#endif
/*
* If we don't have a search page, then we're done, we're starting at
* the beginning or end of the tree, not as a result of a search.
@@ -430,6 +438,104 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
}
}
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __cursor_key_order_check_col --
+ * Check key ordering for column-store cursor movements.
+ */
+static int
+__cursor_key_order_check_col(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ int cmp;
+
+ cmp = 0; /* -Werror=maybe-uninitialized */
+
+ if (cbt->lastrecno != WT_RECNO_OOB) {
+ if (cbt->lastrecno < cbt->recno)
+ cmp = -1;
+ if (cbt->lastrecno > cbt->recno)
+ cmp = 1;
+ }
+
+ if (cbt->lastrecno == WT_RECNO_OOB ||
+ (next && cmp < 0) || (!next && cmp > 0)) {
+ cbt->lastrecno = cbt->recno;
+ return (0);
+ }
+
+ WT_PANIC_RET(session, EINVAL,
+ "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then "
+ "key %" PRIu64,
+ next ? "next" : "prev", cbt->lastrecno, cbt->recno);
+}
+
+/*
+ * __cursor_key_order_check_row --
+ * Check key ordering for row-store cursor movements.
+ */
+static int
+__cursor_key_order_check_row(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ WT_BTREE *btree;
+ WT_ITEM *key;
+ WT_DECL_RET;
+ WT_DECL_ITEM(a);
+ WT_DECL_ITEM(b);
+ int cmp;
+
+ btree = S2BT(session);
+ key = &cbt->iface.key;
+ cmp = 0; /* -Werror=maybe-uninitialized */
+
+ if (cbt->lastkey->size != 0)
+ WT_RET(__wt_compare(
+ session, btree->collator, cbt->lastkey, key, &cmp));
+
+ if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0))
+ return (__wt_buf_set(session, cbt->lastkey,
+ cbt->iface.key.data, cbt->iface.key.size));
+
+ WT_ERR(__wt_scr_alloc(session, 512, &a));
+ WT_ERR(__wt_buf_set_printable(
+ session, a, cbt->lastkey->data, cbt->lastkey->size));
+
+ WT_ERR(__wt_scr_alloc(session, 512, &b));
+ WT_ERR(__wt_buf_set_printable(session, b, key->data, key->size));
+
+ WT_PANIC_ERR(session, EINVAL,
+ "WT_CURSOR.%s out-of-order returns: returned key %.*s then "
+ "key %.*s",
+ next ? "next" : "prev",
+ (int)a->size, (const char *)a->data,
+ (int)b->size, (const char *)b->data);
+
+err: __wt_scr_free(session, &a);
+ __wt_scr_free(session, &b);
+
+ return (ret);
+}
+
+/*
+ * __wt_cursor_key_order_check --
+ * Check key ordering for cursor movements.
+ */
+int
+__wt_cursor_key_order_check(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ switch (cbt->ref->page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ return (__cursor_key_order_check_col(session, cbt, next));
+ case WT_PAGE_ROW_LEAF:
+ return (__cursor_key_order_check_row(session, cbt, next));
+ WT_ILLEGAL_VALUE(session);
+ }
+}
+#endif
+
/*
* __wt_btcur_next --
* Move to the next record in the tree.
@@ -527,10 +633,15 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
__wt_page_evict_soon(page);
cbt->page_deleted_count = 0;
- WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_check(session, cbt, true));
+#endif
+
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
return (ret);
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index b7cea561b48..1e4b1daa090 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -615,9 +615,13 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
__wt_page_evict_soon(page);
cbt->page_deleted_count = 0;
- WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
+#endif
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 69512f45933..28b51fd2865 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -62,8 +62,18 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
static inline int
__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
{
- return (btree->type == BTREE_COL_FIX &&
- !F_ISSET(cbt, WT_CBT_MAX_RECORD));
+ /*
+ * When there's no exact match, column-store search returns the key
+ * nearest the searched-for key (continuing past keys smaller than the
+ * searched-for key to return the next-largest key). Therefore, if the
+ * returned comparison is -1, the searched-for key was larger than any
+ * row on the page's standard information or column-store insert list.
+ *
+ * If the returned comparison is NOT -1, there was a row equal to or
+ * larger than the searched-for key, and we implicitly create missing
+ * rows.
+ */
+ return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
}
/*
@@ -502,19 +512,14 @@ retry: WT_RET(__cursor_func_init(cbt, true));
case BTREE_COL_VAR:
/*
* If WT_CURSTD_APPEND is set, insert a new record (ignoring
- * the application's record number). First we search for the
- * maximum possible record number so the search ends on the
- * last page. The real record number is assigned by the
- * serialized append operation.
+ * the application's record number). The real record number
+ * is assigned by the serialized append operation.
*/
if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = UINT64_MAX;
+ cbt->iface.recno = WT_RECNO_OOB;
WT_ERR(__cursor_col_search(session, cbt, NULL));
- if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = WT_RECNO_OOB;
-
/*
* If not overwriting, fail if the key exists. Creating a
* record past the end of the tree in a fixed-length
@@ -816,7 +821,12 @@ err: if (ret == WT_RESTART) {
/*
* __wt_btcur_next_random --
- * Move to a random record in the tree.
+ * Move to a random record in the tree. There are two algorithms, one
+ * where we select a record at random from the whole tree on each
+ * retrieval and one where we first select a record at random from the
+ * whole tree, and then subsequently sample forward from that location.
+ * The sampling approach allows us to select reasonably uniform random
+ * points from unbalanced trees.
*/
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
@@ -825,6 +835,8 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_DECL_RET;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
+ wt_off_t size;
+ uint64_t skip;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = cbt->btree;
@@ -839,11 +851,65 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_STAT_FAST_CONN_INCR(session, cursor_next);
WT_STAT_FAST_DATA_INCR(session, cursor_next);
- WT_RET(__cursor_func_init(cbt, true));
+ /*
+ * If retrieving random values without sampling, or we don't have a
+ * page reference, pick a roughly random leaf page in the tree.
+ */
+ if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+ /*
+ * Skip past the sample size of the leaf pages in the tree
+ * between each random key return to compensate for unbalanced
+ * trees.
+ *
+ * Use the underlying file size divided by its block allocation
+ * size as our guess of leaf pages in the file (this can be
+ * entirely wrong, as it depends on how many pages are in this
+ * particular checkpoint, how large the leaf and internal pages
+ * really are, and other factors). Then, divide that value by
+ * the configured sample size and increment the final result to
+ * make sure tiny files don't leave us with a skip value of 0.
+ *
+ * !!!
+ * Ideally, the number would be prime to avoid restart issues.
+ */
+ if (cbt->next_random_sample_size != 0) {
+ WT_ERR(btree->bm->size(btree->bm, session, &size));
+ cbt->next_random_leaf_skip = (uint64_t)
+ ((size / btree->allocsize) /
+ cbt->next_random_sample_size) + 1;
+ }
+
+ /*
+ * Choose a leaf page from the tree.
+ */
+ WT_ERR(__cursor_func_init(cbt, true));
+ WT_WITH_PAGE_INDEX(
+ session, ret = __wt_row_random_descent(session, cbt));
+ WT_ERR(ret);
+ } else {
+ /*
+ * Read through the tree, skipping leaf pages. Be cautious about
+ * the skip count: if the last leaf page skipped was also the
+ * last leaf page in the tree, it may be set to zero on return
+ * with the end-of-walk condition.
+ *
+ * Pages read for data sampling aren't "useful"; don't update
+ * the read generation of pages already in memory, and if a page
+ * is read, set its generation to a low value so it is evicted
+ * quickly.
+ */
+ for (skip =
+ cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;)
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
+ WT_READ_NO_GEN |
+ WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ }
- WT_WITH_PAGE_INDEX(session,
- ret = __wt_row_random(session, cbt));
- WT_ERR(ret);
+ /*
+ * Select a random entry from the leaf page. If it's not valid, move to
+ * the next entry; if that doesn't work, move to the previous entry.
+ */
+ WT_ERR(__wt_row_random_leaf(session, cbt));
if (__cursor_valid(cbt, &upd))
WT_ERR(__wt_kv_return(session, cbt, upd));
else {
@@ -851,9 +917,9 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
ret = __wt_btcur_prev(cbt, false);
WT_ERR(ret);
}
+ return (0);
-err: if (ret != 0)
- WT_TRET(__cursor_reset(cbt));
+err: WT_TRET(__cursor_reset(cbt));
return (ret);
}
@@ -1167,6 +1233,11 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
{
cbt->row_key = &cbt->_row_key;
cbt->tmp = &cbt->_tmp;
+
+#ifdef HAVE_DIAGNOSTIC
+ cbt->lastkey = &cbt->_lastkey;
+ cbt->lastrecno = WT_RECNO_OOB;
+#endif
}
/*
@@ -1192,6 +1263,9 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_buf_free(session, &cbt->_lastkey);
+#endif
return (ret);
}
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 0f47c060daf..393f869ece9 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -43,7 +43,7 @@ static int __debug_page_col_var(WT_DBG *, WT_PAGE *);
static int __debug_page_metadata(WT_DBG *, WT_PAGE *);
static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
-static int __debug_ref(WT_DBG *, WT_REF *);
+static void __debug_ref(WT_DBG *, WT_REF *);
static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
static int __debug_tree(
WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t);
@@ -74,9 +74,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
static inline void
__debug_hex_byte(WT_DBG *ds, uint8_t v)
{
- static const char hex[] = "0123456789abcdef";
-
- __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]);
+ __dmsg(ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]);
}
/*
@@ -769,7 +767,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
__dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno);
- WT_RET(__debug_ref(ds, ref));
+ __debug_ref(ds, ref);
} WT_INTL_FOREACH_END;
if (LF_ISSET(WT_DEBUG_TREE_WALK))
@@ -843,7 +841,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
__wt_ref_key(page, ref, &p, &len);
__debug_item(ds, "K", p, len);
- WT_RET(__debug_ref(ds, ref));
+ __debug_ref(ds, ref);
} WT_INTL_FOREACH_END;
if (LF_ISSET(WT_DEBUG_TREE_WALK))
@@ -965,7 +963,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
* __debug_ref --
* Dump a WT_REF structure.
*/
-static int
+static void
__debug_ref(WT_DBG *ds, WT_REF *ref)
{
WT_SESSION_IMPL *session;
@@ -994,14 +992,14 @@ __debug_ref(WT_DBG *ds, WT_REF *ref)
case WT_REF_SPLIT:
__dmsg(ds, "split");
break;
- WT_ILLEGAL_VALUE(session);
+ default:
+ __dmsg(ds, "INVALID");
+ break;
}
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
__dmsg(ds, " %s\n",
__wt_addr_string(session, addr, addr_size, ds->tmp));
-
- return (0);
}
/*
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 294cc399d65..a6330326954 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -601,7 +601,7 @@ __btree_preload(WT_SESSION_IMPL *session)
/* Pre-load the second-level internal pages. */
WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr != NULL)
WT_RET(bm->preload(bm, session, addr, addr_size));
} WT_INTL_FOREACH_END;
@@ -622,7 +622,7 @@ __btree_get_last_recno(WT_SESSION_IMPL *session)
btree = S2BT(session);
next_walk = NULL;
- WT_RET(__wt_tree_walk(session, &next_walk, NULL, WT_READ_PREV));
+ WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
if (next_walk == NULL)
return (WT_NOTFOUND);
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index d9ff9616072..a34e57796a8 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -332,11 +332,17 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
for (tp = table, lineno = 1; (ret =
fscanf(fp, "%" SCNi64 " %" SCNi64, &symbol, &frequency)) != EOF;
++tp, ++lineno) {
- if (lineno > entries)
+ /*
+ * Entries is 0-based, that is, there are (entries + 1) possible
+ * values that can be configured. The line number is 1-based, so
+ * adjust the test for too many entries, and report (entries + 1)
+ * in the error as the maximum possible number of entries.
+ */
+ if (lineno > entries + 1)
WT_ERR_MSG(session, EINVAL,
"Huffman table file %.*s is corrupted, "
"more than %" PRIu32 " entries",
- (int)ip->len, ip->str, entries);
+ (int)ip->len, ip->str, entries + 1);
if (ret != 2)
WT_ERR_MSG(session, EINVAL,
"line %u of Huffman table file %.*s is corrupted: "
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index d2b16bb5d21..a60499ef8b7 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -101,7 +101,7 @@ __wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
return (buf->data);
}
- (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL);
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
return (__wt_addr_string(session, addr, addr_size, buf));
}
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 8808f0b1a85..fdccf033828 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF **refp, *ref;
- uint32_t i;
+ uint32_t hint, i;
btree = S2BT(session);
dsk = page->dsk;
@@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
+ hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp++;
ref->home = page;
+ ref->pindex_hint = hint++;
__wt_cell_unpack(cell, unpack);
ref->addr = cell;
@@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF *ref, **refp;
- uint32_t i;
+ uint32_t hint, i;
bool overflow_keys;
btree = S2BT(session);
@@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
overflow_keys = false;
+ hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp;
ref->home = page;
+ ref->pindex_hint = hint++;
__wt_cell_unpack(cell, unpack);
switch (unpack->type) {
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 77215474359..c50f97bbe14 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -375,7 +375,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
* Get the address: if there is no address, the page was deleted, but a
* subsequent search or insert is forcing re-creation of the name space.
*/
- WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL) {
WT_ASSERT(session, previous_state == WT_REF_DELETED);
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 756ffd98f3a..b5c299b9ea9 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1807,7 +1807,7 @@ err: if (page != NULL)
*/
static int
__slvg_row_build_internal(
- WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
{
WT_ADDR *addr;
WT_DECL_RET;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 631aca0d5c0..69c787c9385 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -15,6 +15,22 @@
} while (0)
/*
+ * A note on error handling: main split functions first allocate/initialize new
+ * structures; failures during that period are handled by discarding the memory
+ * and returning an error code; the caller knows the split didn't happen and
+ * proceeds accordingly. Second, split functions update the tree; a failure in
+ * that period is catastrophic: any partial update to the tree requires a panic
+ * because we can't recover. Third, once the split is complete and the tree has
+ * been fully updated, we have to ignore most errors: the split is complete and
+ * correct, and callers have to proceed accordingly.
+ */
+typedef enum {
+ WT_ERR_IGNORE, /* Ignore minor errors */
+ WT_ERR_PANIC, /* Panic on all errors */
+ WT_ERR_RETURN /* Clean up and return error */
+} WT_SPLIT_ERROR_PHASE;
+
+/*
* __split_oldest_gen --
* Calculate the oldest active split generation.
*/
@@ -190,6 +206,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
case WT_PAGE_COL_INT:
recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->home == page);
+
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
} WT_INTL_FOREACH_END;
@@ -202,6 +220,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
first = true;
WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->home == page);
+
__wt_ref_key(page, ref, &next->data, &next->size);
if (last->size == 0) {
if (first)
@@ -328,7 +348,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
/*
* If there's no address (the page has never been written), or the
* address has been instantiated, there's no work to do. Otherwise,
- * get the address from the on-page cell.
+ * instantiate the address in-memory, from the on-page cell.
*/
addr = ref->addr;
if (addr != NULL && !__wt_off_page(from_home, addr)) {
@@ -363,65 +383,101 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_child_block_evict_and_split --
- * Ensure the newly created child isn't evicted or split for now.
+ * __split_ref_step1 --
+ * Prepare a set of WT_REFs for a move.
*/
static void
-__split_child_block_evict_and_split(WT_PAGE *child)
+__split_ref_step1(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
+ WT_PAGE *child;
+ WT_REF *child_ref, *ref;
+ uint32_t i, j;
+
+ /* The newly created subtree is complete. */
+ WT_WRITE_BARRIER();
+
/*
- * Once the split is live, newly created internal pages might be evicted
- * and their WT_REF structures freed. If that happens before all threads
- * exit the index of the page which previously "owned" the WT_REF, a
- * thread might see a freed WT_REF. To ensure that doesn't happen, the
- * newly created page's modify structure has a field with a transaction
- * ID that's checked before any internal page is evicted. Unfortunately,
- * we don't know the correct value until we update the original page's
- * index (we need a transaction ID from after that update), but the act
- * of updating the original page's index is what allows the eviction to
- * happen.
- *
- * Once the split is live, newly created internal pages might themselves
- * split. The split itself is not the problem: if a page splits before
- * we fix up its WT_REF (in other words, a WT_REF we move is then moved
- * again, before we reset the underlying page's parent reference), it's
- * OK because the test we use to find a WT_REF and WT_PAGE that require
- * fixing up is only that the WT_REF points to the wrong parent, not it
- * points to a specific wrong parent. The problem is our fix up of the
- * WT_REFs in the created page could race with the subsequent fix of the
- * same WT_REFs (in a different created page), we'd have to acquire some
- * lock to prevent that race, and that's going to be difficult at best.
- *
- * For now, block eviction and splits in newly created pages until they
- * have been fixed up.
+ * Update the moved WT_REFs so threads moving through them start looking
+ * at the created children's page index information. Because we've not
+ * yet updated the page index of the parent page into which we are going
+ * to split this subtree, a cursor moving through these WT_REFs will
+ * ascend into the created children, but eventually fail as that parent
+ * page won't yet know about the created children pages. That's OK, we
+ * spin there until the parent's page index is updated.
*/
- F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
+ ref = pindex->index[i];
+ child = ref->page;
+
+ /*
+ * Block eviction and splits in newly created pages.
+ *
+ * Once the split is live, newly created internal pages might be
+ * evicted and their WT_REF structures freed. If that happened
+ * before all threads exit the index of the page that previously
+ * "owned" the WT_REF, a thread might see a freed WT_REF. To
+ * ensure that doesn't happen, the newly created page's modify
+ * structure has a field with a transaction ID that's checked
+ * before any internal page is evicted. Unfortunately, we don't
+ * know the correct value until we update the original page's
+ * index (we need a transaction ID from after that update), but
+ * the act of updating the original page's index is what allows
+ * the eviction to happen.
+ *
+ * Splits are blocked because historic versions of the split
+ * code didn't update the WT_REF.home field until after the
+ * split was live, so a page whose WT_REF.home fields were
+ * being updated could split again before the update, and
+ * the splits raced as to which would update them first. The
+ * current code updates the WT_REF.home fields before going
+ * live (in this function), so this shouldn't be an issue,
+ * but for now splits remain turned off.
+ */
+ F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
+ /*
+ * We use a page flag to prevent the child from splitting from
+ * underneath us, but the split-generation error checks don't
+ * know about that flag; use the standard macros to ensure that
+ * reading the child's page index structure is safe.
+ */
+ j = 0;
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ child_ref->home = child;
+ child_ref->pindex_hint = j++;
+ } WT_INTL_FOREACH_END;
+ WT_LEAVE_PAGE_INDEX(session);
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
+#endif
+ }
}
/*
- * __split_ref_move_final --
- * Finalize the moved WT_REF structures after the split succeeds.
+ * __split_ref_step2 --
+ * Allow the newly created children to be evicted or split.
*/
static int
-__split_ref_move_final(
- WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
+__split_ref_step2(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_DECL_RET;
WT_PAGE *child;
- WT_REF *ref, *child_ref;
+ WT_REF *ref;
uint32_t i;
/*
- * The WT_REF structures moved to newly allocated child pages reference
- * the wrong parent page and we have to fix that up. The problem is
- * revealed when a thread of control searches for the child page's
- * reference structure slot, and fails to find it because the parent
- * page being searched no longer references the child. When that failure
- * happens the thread waits for the reference's home page to be updated,
- * which we do here: walk the children and fix them up.
+ * The split has gone live, enable eviction and splits on the newly
+ * created internal pages.
*/
- for (i = 0; i < entries; ++i, ++refp) {
- ref = *refp;
+ WT_WRITE_BARRIER();
+
+ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
+ ref = pindex->index[i];
/*
* We don't hold hazard pointers on created pages, they cannot
@@ -441,42 +497,18 @@ __split_ref_move_final(
WT_ERR(ret);
child = ref->page;
+
+ /* The child can now be evicted or split. */
+ F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
- WT_ENTER_PAGE_INDEX(session);
- WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
- /*
- * The page's home reference may not be wrong, as we
- * opened up access from the top of the tree already,
- * disk pages may have been read in since then, and
- * those pages would have correct parent references.
- */
- if (child_ref->home != child) {
- child_ref->home = child;
- child_ref->pindex_hint = 0;
- }
- } WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
-
- /* The child can now be evicted or split. */
- F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
WT_ERR(__wt_hazard_clear(session, child));
}
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
return (0);
err: /* Something really bad just happened. */
@@ -496,11 +528,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
WT_REF **child_refp, *ref, **root_refp;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, root_decr, root_incr, size;
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
- bool complete;
void *p;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
@@ -511,7 +543,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
- complete = false;
+ complete = WT_ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@@ -589,16 +621,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /* Ensure the page isn't evicted or split for now. */
- __split_child_block_evict_and_split(child);
-
/*
* The newly allocated child's page index references the same
* structures as the root. (We cannot move WT_REF structures,
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
- * the page that has an page index entry for the WT_REF is about
+ * the page that has a page index entry for the WT_REF is about
* to change.
*/
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
@@ -615,31 +644,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ASSERT(session,
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = WT_ERR_PANIC;
+
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_step1(session, alloc_index, false);
+
/*
* Confirm the root page's index hasn't moved, then update it, which
- * makes the split visible to threads descending the tree. From this
- * point on, we're committed to the split.
- *
- * A note on error handling: until this point, there's no problem with
- * unwinding on error. We allocated a new page index, a new set of
- * WT_REFs and a new set of child pages -- if an error occurred, the
- * root remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the root page, attention
- * needs to be paid. However, subsequent failures are relatively benign,
- * the split is OK and complete. For that reason, we ignore errors past
- * this point unless there's a panic.
+ * makes the split visible to threads descending the tree.
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
- complete = true;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, root));
#endif
- /* Fix up the moved WT_REF structures. */
- WT_ERR(__split_ref_move_final(
- session, alloc_index->index, alloc_index->entries));
+ /* Finalize the WT_REFs we moved. */
+ WT_ERR(__split_ref_step2(session, alloc_index, false));
+
+ /* The split is complete and correct, ignore benign errors. */
+ complete = WT_ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@@ -664,24 +690,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_cache_page_inmem_decr(session, root, root_decr);
__wt_page_modify_set(session, root);
-err: /*
- * If complete is true, we saw an error after opening up the tree to
- * descent through the root page's new index. There is nothing we
- * can do, there are threads potentially active in both versions of
- * the tree.
- *
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete)
+err: switch (complete) {
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
-
- if (ret != 0 && ret != WT_PANIC)
+ break;
+ case WT_ERR_PANIC:
__wt_err(session, ret,
- "ignoring not-fatal error during root page split to "
- "deepen the tree");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ "fatal error during root page split to deepen the tree");
+ ret = WT_PANIC;
+ break;
+ case WT_ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during root page split "
+ "to deepen the tree");
+ ret = 0;
+ }
+ break;
+ }
+ return (ret);
}
/*
@@ -698,19 +725,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
WT_REF **alloc_refp, *next_ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t parent_decr, size;
uint64_t split_gen;
- uint32_t i, j;
+ uint32_t hint, i, j;
uint32_t deleted_entries, parent_entries, result_entries;
uint32_t *deleted_refs;
- bool complete, empty_parent;
+ bool empty_parent;
parent = ref->home;
alloc_index = pindex = NULL;
parent_decr = 0;
parent_entries = 0;
- complete = empty_parent = false;
+ empty_parent = false;
+ complete = WT_ERR_RETURN;
/* The parent page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, parent));
@@ -728,7 +757,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* array anyway. Switch them to the special split state, so that any
* reading thread will restart.
*/
- WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
@@ -768,28 +797,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
+ *
+ * Update the WT_REF's page-index hint as we go. This can race with a
+ * thread setting the hint based on an older page-index, and the change
+ * isn't backed out in the case of an error, so there are ways for the
+ * hint to be wrong; OK because it's just a hint.
*/
size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
parent_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ for (alloc_refp = alloc_index->index,
+ hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
+ ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
- else if (next_ref->state != WT_REF_SPLIT)
+ else if (next_ref->state != WT_REF_SPLIT) {
/* Skip refs we have marked for deletion. */
+ next_ref->pindex_hint = hint++;
*alloc_refp++ = next_ref;
+ }
}
/* Check that we filled in all the entries. */
WT_ASSERT(session,
alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = WT_ERR_PANIC;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -830,16 +871,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
WT_FULL_BARRIER();
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
+ /* The split is complete and correct, ignore benign errors. */
+ complete = WT_ERR_IGNORE;
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -923,7 +956,8 @@ err: __wt_scr_free(session, &scr);
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete) {
+ switch (complete) {
+ case WT_ERR_RETURN:
for (i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref->state == WT_REF_SPLIT)
@@ -931,20 +965,28 @@ err: __wt_scr_free(session, &scr);
}
__wt_free_ref_index(session, NULL, alloc_index, false);
-
/*
* The split couldn't proceed because the parent would be empty,
* return EBUSY so our caller knows to unlock the WT_REF that's
* being deleted, but don't be noisy, there's nothing wrong.
*/
if (empty_parent)
- return (EBUSY);
+ ret = EBUSY;
+ break;
+ case WT_ERR_PANIC:
+ __wt_err(session, ret, "fatal error during parent page split");
+ ret = WT_PANIC;
+ break;
+ case WT_ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page "
+ "split");
+ ret = 0;
+ }
+ break;
}
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ return (ret);
}
/*
@@ -960,11 +1002,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
WT_REF **alloc_refp;
WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, page_decr, page_incr, parent_incr, size;
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
- bool complete;
void *p;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
@@ -977,7 +1019,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
- complete = false;
+ complete = WT_ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@@ -1074,9 +1116,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /* Ensure the page isn't evicted or split for now. */
- __split_child_block_evict_and_split(child);
-
/*
* The newly allocated child's page index references the same
* structures as the parent. (We cannot move WT_REF structures,
@@ -1100,22 +1139,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ASSERT(session,
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = WT_ERR_PANIC;
+
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_step1(session, alloc_index, true);
+
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /*
- * A note on error handling: until this point, there's no problem with
- * unwinding on error. We allocated a new page index, a new set of
- * WT_REFs and a new set of child pages -- if an error occurred, the
- * page remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the parent page, attention
- * needs to be paid. However, subsequent failures are relatively benign,
- * the split is OK and complete. For that reason, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
-
/* Confirm the page's index hasn't moved, then update it. */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
@@ -1127,9 +1160,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__split_verify_intl_key_order(session, page));
#endif
- /* Fix up the moved WT_REF structures. */
- WT_ERR(__split_ref_move_final(
- session, alloc_index->index + 1, alloc_index->entries - 1));
+ /* Finalize the WT_REFs we moved. */
+ WT_ERR(__split_ref_step2(session, alloc_index, true));
+
+ /* The split is complete and correct, ignore benign errors. */
+ complete = WT_ERR_IGNORE;
+
+ /*
+ * Push out the changes: not required for correctness, but no reason
+ * to wait.
+ */
+ WT_FULL_BARRIER();
/*
* We don't care about the page-index we allocated, all we needed was
@@ -1158,24 +1199,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_cache_page_inmem_decr(session, page, page_decr);
__wt_page_modify_set(session, page);
-err: /*
- * If complete is true, we saw an error after opening up the tree to
- * descent through the page's new index. There is nothing we can do,
- * there are threads potentially active in both versions of the tree.
- *
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete) {
+err: switch (complete) {
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
- }
-
- if (ret != 0 && ret != WT_PANIC)
+ break;
+ case WT_ERR_PANIC:
__wt_err(session, ret,
- "ignoring not-fatal error during internal page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ "fatal error during internal page split");
+ ret = WT_PANIC;
+ break;
+ case WT_ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during internal page "
+ "split");
+ ret = 0;
+ }
+ break;
+ }
+ return (ret);
}
/*
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 2f8759b9d82..ef70160aa72 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -35,10 +35,10 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
- WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
- WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
+ WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);
/* Everything else is really, really expensive. */
@@ -59,8 +59,8 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_row_leaf, 0);
next_walk = NULL;
- while ((ret = __wt_tree_walk(session, &next_walk, NULL, 0)) == 0 &&
- next_walk != NULL) {
+ while ((ret = __wt_tree_walk(
+ session, &next_walk, 0)) == 0 && next_walk != NULL) {
WT_WITH_PAGE_INDEX(session,
ret = __stat_page(session, next_walk->page, stats));
WT_RET(ret);
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 07bb2eb3a01..86607d8f187 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -58,7 +58,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
for (walk = NULL;;) {
- WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
if (walk == NULL)
break;
@@ -124,7 +124,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/* Write all dirty in-cache pages. */
flags |= WT_READ_NO_EVICT;
for (walk = NULL;;) {
- WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
if (walk == NULL)
break;
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index c7d83d8dfff..abb18529041 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -69,16 +69,78 @@ retry: WT_INTL_INDEX_GET(session, ref->home, pindex);
}
/*
- * __wt_tree_walk --
+ * __ref_is_leaf --
+ * Check if a reference is for a leaf page.
+ */
+static inline bool
+__ref_is_leaf(WT_REF *ref)
+{
+ size_t addr_size;
+ u_int type;
+ const uint8_t *addr;
+
+ /*
+ * If the page has a disk address, we can crack it to figure out if
+ * this page is a leaf page or not. If there's no address, the page
+ * isn't on disk and we don't know the page type.
+ */
+ __wt_ref_info(ref, &addr, &addr_size, &type);
+ return (addr == NULL ?
+ false : type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO);
+}
+
+/*
+ * __page_ascend --
+ * Ascend the tree one level.
+ */
+static void
+__page_ascend(WT_SESSION_IMPL *session,
+ WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_REF *parent_ref, *ref;
+
+ /*
+ * Ref points to the first/last slot on an internal page from which we
+ * are ascending the tree, moving to the parent page. This is tricky
+ * because the internal page we're on may be splitting into its parent.
+ * Find a stable configuration where the page we start from and the
+ * page we're moving to are connected. The tree eventually stabilizes
+ * into that configuration, keep trying until we succeed.
+ */
+ for (ref = *refp;;) {
+ /*
+ * Find our parent slot on the next higher internal page, the
+ * slot from which we move to a next/prev slot, checking that
+ * we haven't reached the root.
+ */
+ parent_ref = ref->home->pg_intl_parent_ref;
+ if (__wt_ref_is_root(parent_ref))
+ break;
+ __page_refp(session, parent_ref, pindexp, slotp);
+
+ /*
+ * When internal pages split, the WT_REF structures being moved
+ * are updated first. If the WT_REF we started with references
+ * the same page as we found on our search of the parent, there
+ * is a consistent view.
+ */
+ if (ref->home == parent_ref->page)
+ break;
+ }
+
+ *refp = parent_ref;
+}
+
+/*
+ * __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
-int
-__wt_tree_walk(WT_SESSION_IMPL *session,
- WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
+static inline int
+__tree_walk_internal(WT_SESSION_IMPL *session,
+ WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *couple, *couple_orig, *ref;
bool empty_internal, prev, skip;
@@ -153,7 +215,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
goto descend;
}
-ascend: /*
+ /*
* If the active page was the root, we've reached the walk's end.
* Release any hazard-pointer we're holding.
*/
@@ -167,13 +229,14 @@ ascend: /*
for (;;) {
/*
- * If we're at the last/first slot on the page, return this page
- * in post-order traversal. Otherwise we move to the next/prev
- * slot and left/right-most element in its subtree.
+ * If we're at the last/first slot on the internal page, return
+ * it in post-order traversal. Otherwise move to the next/prev
+ * slot and left/right-most element in that subtree.
*/
- if ((prev && slot == 0) ||
+ while ((prev && slot == 0) ||
(!prev && slot == pindex->entries - 1)) {
- ref = ref->home->pg_intl_parent_ref;
+ /* Ascend to the parent. */
+ __page_ascend(session, &ref, &pindex, &slot);
/*
* If we got all the way through an internal page and
@@ -185,40 +248,37 @@ ascend: /*
empty_internal = false;
}
- /* Optionally skip internal pages. */
- if (LF_ISSET(WT_READ_SKIP_INTL))
- goto ascend;
-
/*
- * We've ascended the tree and are returning an internal
- * page. If it's the root, discard our hazard pointer,
- * otherwise, swap our hazard pointer for the page we'll
- * return.
+ * If at the root and returning internal pages, return
+ * the root page, otherwise we're done. Regardless, no
+ * hazard pointer is required, release the one we hold.
*/
- if (__wt_ref_is_root(ref))
+ if (__wt_ref_is_root(ref)) {
WT_ERR(__wt_page_release(
session, couple, flags));
- else {
- /*
- * Locate the reference to our parent page then
- * swap our child hazard pointer for the parent.
- * We don't handle restart or not-found returns.
- * It would require additional complexity and is
- * not a possible return: we're moving to the
- * parent of the current child page, our parent
- * reference can't have split or been evicted.
- */
- __page_refp(session, ref, &pindex, &slot);
+ if (!LF_ISSET(WT_READ_SKIP_INTL))
+ *refp = ref;
+ goto done;
+ }
+
+ /*
+ * Optionally return internal pages. Swap our previous
+ * hazard pointer for the page we'll return. We don't
+ * handle restart or not-found returns, it would require
+ * additional complexity and is not a possible return:
+ * we're moving to the parent of the current child page,
+ * the parent can't have been evicted.
+ */
+ if (!LF_ISSET(WT_READ_SKIP_INTL)) {
if ((ret = __wt_page_swap(
session, couple, ref, flags)) != 0) {
WT_TRET(__wt_page_release(
session, couple, flags));
WT_ERR(ret);
}
+ *refp = ref;
+ goto done;
}
-
- *refp = ref;
- goto done;
}
if (prev)
@@ -304,6 +364,31 @@ ascend: /*
break;
}
+ /*
+ * Optionally skip leaf pages: skip all leaf pages if
+ * WT_READ_SKIP_LEAF is set; when the skip-leaf-count
+ * variable is non-zero, skip some count of leaf pages.
+ * If this page is disk-based, crack the cell to figure
+ * out if it's a leaf page without reading it.
+ *
+ * If skipping some number of leaf pages, decrement the
+ * count of pages to zero, and then take the next leaf
+ * page we can. Be cautious around the page decrement,
+ * if for some reason we don't take this particular page,
+ * we can take the next one, and, there are additional
+ * tests/decrements when we're about to return a leaf
+ * page.
+ */
+ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
+ if (__ref_is_leaf(ref)) {
+ if (LF_ISSET(WT_READ_SKIP_LEAF))
+ break;
+ if (*skipleafcntp > 0) {
+ --*skipleafcntp;
+ break;
+ }
+ }
+
ret = __wt_page_swap(session, couple, ref, flags);
/*
@@ -359,13 +444,29 @@ ascend: /*
* A new page: configure for traversal of any internal
* page's children, else return the leaf page.
*/
-descend: couple = ref;
- page = ref->page;
- if (WT_PAGE_IS_INTERNAL(page)) {
- WT_INTL_INDEX_GET(session, page, pindex);
+ if (WT_PAGE_IS_INTERNAL(ref->page)) {
+descend: couple = ref;
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
slot = prev ? pindex->entries - 1 : 0;
empty_internal = true;
} else {
+ /*
+ * Optionally skip leaf pages, the second half.
+ * We didn't have an on-page cell to figure out
+ * if it was a leaf page, we had to acquire the
+ * hazard pointer and look at the page.
+ */
+ if (skipleafcntp != NULL ||
+ LF_ISSET(WT_READ_SKIP_LEAF)) {
+ couple = ref;
+ if (LF_ISSET(WT_READ_SKIP_LEAF))
+ break;
+ if (*skipleafcntp > 0) {
+ --*skipleafcntp;
+ break;
+ }
+ }
+
*refp = ref;
goto done;
}
@@ -376,3 +477,37 @@ done:
err: WT_LEAVE_PAGE_INDEX(session);
return (ret);
}
+
+/*
+ * __wt_tree_walk --
+ * Move to the next/previous page in the tree.
+ */
+int
+__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp, NULL, NULL, flags));
+}
+
+/*
+ * __wt_tree_walk_count --
+ * Move to the next/previous page in the tree, tracking how many
+ * references were visited to get there.
+ */
+int
+__wt_tree_walk_count(WT_SESSION_IMPL *session,
+ WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp, walkcntp, NULL, flags));
+}
+
+/*
+ * __wt_tree_walk_skip --
+ * Move to the next/previous page in the tree, skipping a certain number
+ * of leaf pages before returning.
+ */
+int
+__wt_tree_walk_skip(WT_SESSION_IMPL *session,
+ WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags));
+}
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index e9fa570f97b..c5e2abbe440 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -9,12 +9,60 @@
#include "wt_internal.h"
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work.
+ */
+ cbt->compare = 0;
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ */
+ if (recno < leaf->key.recno) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ *
+ * !!!
+ * Check that "indx + 1" is a valid page-index entry first, because it
+ * also checks that "indx" is a valid page-index entry, and we have to
+ * do that latter check before looking at the indx slot of the array
+ * for a match to leaf (in other words, our page hint might be wrong).
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
+ if (recno >= pindex->index[indx + 1]->key.recno) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+
+ return (0);
+}
+
+/*
* __wt_col_search --
* Search a column-store tree for a specific record-based key.
*/
int
__wt_col_search(WT_SESSION_IMPL *session,
- uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+ uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
+ uint64_t recno;
uint32_t base, indx, limit;
int depth;
@@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session,
__cursor_pos_clear(cbt);
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * When appending a new record, the search record number will be an
+ * out-of-band value, search for the largest key in the table instead.
+ */
+ if ((recno = search_recno) == WT_RECNO_OOB)
+ recno = UINT64_MAX;
+
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ WT_ASSERT(session, search_recno != WT_RECNO_OOB);
+
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, recno, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -120,7 +199,17 @@ leaf_only:
page = current->page;
cbt->ref = current;
cbt->recno = recno;
- cbt->compare = 0;
+
+ /*
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the record number and position when we
+ * do the work.
+ */
+ if (search_recno == WT_RECNO_OOB) {
+ cbt->compare = -1;
+ return (0);
+ }
/*
* Set the on-page slot to an impossible value larger than any possible
@@ -142,6 +231,7 @@ leaf_only:
* that's impossibly large for the page. We do have additional setup to
* do in that case, the record may be appended to the page.
*/
+ cbt->compare = 0;
if (page->type == WT_PAGE_COL_FIX) {
if (recno < page->pg_fix_recno) {
cbt->compare = 1;
@@ -190,18 +280,10 @@ past_end:
* This is a rarely used path: we normally find exact matches, because
* column-store files are dense, but in this case the caller searched
* past the end of the table.
- *
- * Don't bother searching if the caller is appending a new record where
- * we'll allocate the record number; we're not going to find a match by
- * definition, and we figure out the position when we do the work.
*/
cbt->ins_head = WT_COL_APPEND(page);
- if (recno == UINT64_MAX)
- cbt->ins = NULL;
- else
- cbt->ins = __col_insert_search(
- cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
- if (cbt->ins == NULL)
+ if ((cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL)
cbt->compare = -1;
else {
cbt->recno = WT_INSERT_RECNO(cbt->ins);
@@ -212,14 +294,5 @@ past_end:
else
cbt->compare = -1;
}
-
- /*
- * Note if the record is past the maximum record in the tree, the cursor
- * search functions need to know for fixed-length column-stores because
- * appended records implicitly create any skipped records, and cursor
- * search functions have to handle that case.
- */
- if (cbt->compare == -1)
- F_SET(cbt, WT_CBT_MAX_RECORD);
return (0);
}
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index d2d8a4640ca..e98d30152ab 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -132,6 +132,76 @@ __wt_search_insert(
}
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_ITEM *item;
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+ int cmp;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ item = cbt->tmp;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work.
+ */
+ cbt->compare = 0;
+
+ /*
+ * First, confirm we have the right parent page-index slot, and quit if
+ * we don't. We don't search for the correct slot, that would make this
+ * cheap test expensive.
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx >= pindex->entries || pindex->index[indx] != leaf)
+ return (0);
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ *
+ * We can't compare against slot 0 on a row-store internal page because
+ * reconciliation doesn't build it, it may not be a valid key.
+ */
+ if (indx != 0) {
+ __wt_ref_key(leaf->home, leaf, &item->data, &item->size);
+ WT_RET(__wt_compare(session, collator, srch_key, item, &cmp));
+ if (cmp < 0) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ */
+ ++indx;
+ if (indx < pindex->entries) {
+ __wt_ref_key(
+ leaf->home, pindex->index[indx], &item->data, &item->size);
+ WT_RET(__wt_compare(session, collator, srch_key, item, &cmp));
+ if (cmp >= 0) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
* __wt_row_search --
* Search a row-store tree for a specific key.
*/
@@ -179,8 +249,29 @@ __wt_row_search(WT_SESSION_IMPL *session,
append_check = insert && cbt->append_tree;
descend_right = true;
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, srch_key, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -196,15 +287,6 @@ restart_page: page = current->page;
WT_INTL_INDEX_GET(session, page, pindex);
- /*
- * Fast-path internal pages with one child, a common case for
- * the root page in new trees.
- */
- if (pindex->entries == 1) {
- descent = pindex->index[0];
- goto descend;
- }
-
/* Fast-path appends. */
if (append_check) {
descent = pindex->index[pindex->entries - 1];
@@ -536,19 +618,163 @@ err: /*
}
/*
- * __wt_row_random --
- * Return a random key from a row-store tree.
+ * __wt_row_random_leaf --
+ * Return a random key from a row-store leaf page.
*/
int
-__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *ins, **start, **stop;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ uint32_t choice, entries, i;
+ int level;
+
+ page = cbt->ref->page;
+
+ start = stop = NULL; /* [-Wconditional-uninitialized] */
+ entries = 0; /* [-Wconditional-uninitialized] */
+
+ /* If the page has disk-based entries, select from them. */
+ if (page->pg_row_entries != 0) {
+ cbt->compare = 0;
+ cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries;
+
+ /*
+ * The real row-store search function builds the key, so we
+ * have to as well.
+ */
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row_d + cbt->slot, cbt->tmp, false));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+
+ /*
+ * Walk down the list until we find a level with more than 50 entries,
+ * that's where we'll start rolling random numbers. The value 50 is
+ * used to ignore levels with only a few entries, that is, levels which
+ * are potentially badly skewed.
+ */
+ for (ins_head = cbt->ins_head,
+ level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ start = &ins_head->head[level];
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+
+ if (entries > 50)
+ break;
+ }
+
+ /*
+ * If it's a tiny list and we went all the way to level 0, correct the
+ * level; entries is correctly set.
+ */
+ if (level < 0)
+ level = 0;
+
+ /*
+ * Step down the skip list levels, selecting a random chunk of the name
+ * space at each level.
+ */
+ while (level > 0) {
+ /*
+ * There are (entries) or (entries + 1) chunks of the name space
+ * considered at each level. They are: between start and the 1st
+ * element, between the 1st and 2nd elements, and so on to the
+ * last chunk which is the name space after the stop element on
+ * the current level. This last chunk of name space may or may
+ * not be there: as we descend the levels of the skip list, this
+ * chunk may appear, depending if the next level down has
+ * entries logically after the stop point in the current level.
+ * We can't ignore those entries: because of the algorithm used
+ * to determine the depth of a skiplist, there may be a large
+ * number of entries "revealed" by descending a level.
+ *
+ * If the next level down has more items after the current stop
+ * point, there are (entries + 1) chunks to consider, else there
+ * are (entries) chunks.
+ */
+ if (*(stop - 1) == NULL)
+ choice = __wt_random(&session->rnd) % entries;
+ else
+ choice = __wt_random(&session->rnd) % (entries + 1);
+
+ if (choice == entries) {
+ /*
+ * We selected the name space after the stop element on
+ * this level. Set the start point to the current stop
+ * point, descend a level and move the stop element to
+ * the end of the list, that is, the end of the newly
+ * discovered name space, counting entries as we go.
+ */
+ start = stop;
+ --start;
+ --level;
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+ } else {
+ /*
+ * We selected another name space on the level. Move the
+ * start pointer the selected number of entries forward
+ * to the start of the selected chunk (if the selected
+ * number is 0, start won't move). Set the stop pointer
+ * to the next element in the list and drop both start
+ * and stop down a level.
+ */
+ for (i = 0; i < choice; ++i)
+ start = &(*start)->next[level];
+ stop = &(*start)->next[level];
+
+ --start;
+ --stop;
+ --level;
+
+ /* Count the entries in the selected name space. */
+ for (entries = 0,
+ ins = *start; ins != *stop; ins = ins->next[level])
+ ++entries;
+ }
+ }
+
+ /*
+ * When we reach the bottom level, entries will already be set. Select
+ * a random entry from the name space and return it.
+ *
+ * It should be impossible for the entries count to be 0 at this point,
+ * but check for it out of paranoia and to quiet static testing tools.
+ */
+ if (entries > 0)
+ entries = __wt_random(&session->rnd) % entries;
+ for (ins = *start; entries > 0; --entries)
+ ins = ins->next[0];
+
+ cbt->ins = ins;
+ cbt->compare = 0;
+
+ return (0);
+}
+
+/*
+ * __wt_row_random_descent --
+ * Find a random leaf page in a row-store tree.
+ */
+int
+__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_INSERT *p, *t;
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *current, *descent;
- uint32_t cnt;
btree = S2BT(session);
@@ -585,43 +811,6 @@ restart_root:
return (ret);
}
- if (page->pg_row_entries != 0) {
- cbt->ref = current;
- cbt->compare = 0;
- cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries;
-
- /*
- * The real row-store search function builds the key, so we
- * have to as well.
- */
- return (__wt_row_leaf_key(session,
- page, page->pg_row_d + cbt->slot, cbt->tmp, false));
- }
-
- /*
- * If the tree is new (and not empty), it might have a large insert
- * list. Count how many records are in the list.
- */
- F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
- WT_ERR(WT_NOTFOUND);
- for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
- if ((p = WT_SKIP_NEXT(p)) == NULL)
- break;
-
- /*
- * Select a random number from 0 to (N - 1), return that record.
- */
- cnt = __wt_random(&session->rnd) % cnt;
- for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
- if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
- break;
cbt->ref = current;
- cbt->compare = 0;
- cbt->ins = t;
-
return (0);
-
-err: WT_TRET(__wt_page_release(session, current, 0));
- return (ret);
}
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index d3a0265c13a..e943f01236e 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -18,6 +18,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS **cstats;
WT_DSRC_STATS **dstats;
+ int64_t v;
conn = S2C(session);
@@ -37,10 +38,10 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
dstats = ((WT_CURSOR_BTREE *)
conn->las_session->las_cursor)->btree->dhandle->stats;
- WT_STAT_SET(session, cstats,
- cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
- WT_STAT_SET(session, cstats,
- cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove));
+ v = WT_STAT_READ(dstats, cursor_insert);
+ WT_STAT_SET(session, cstats, cache_lookaside_insert, v);
+ v = WT_STAT_READ(dstats, cursor_remove);
+ WT_STAT_SET(session, cstats, cache_lookaside_remove, v);
}
/*
diff --git a/src/config/config_def.c b/src/config/config_def.c
index d79ce6853e6..9d12e953498 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -323,6 +323,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = {
NULL, "choices=[\"hex\",\"json\",\"print\"]",
NULL, 0 },
{ "next_random", "boolean", NULL, NULL, NULL, 0 },
+ { "next_random_sample_size", "string", NULL, NULL, NULL, 0 },
{ "overwrite", "boolean", NULL, NULL, NULL, 0 },
{ "raw", "boolean", NULL, NULL, NULL, 0 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
@@ -920,9 +921,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
NULL, 0
},
{ "WT_SESSION.open_cursor",
- "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0"
- ",readonly=0,skip_sort_check=0,statistics=,target=",
- confchk_WT_SESSION_open_cursor, 11
+ "append=0,bulk=0,checkpoint=,dump=,next_random=0,"
+ "next_random_sample_size=0,overwrite=,raw=0,readonly=0,"
+ "skip_sort_check=0,statistics=,target=",
+ confchk_WT_SESSION_open_cursor, 12
},
{ "WT_SESSION.reconfigure",
"isolation=read-committed",
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index bd14e1bf4fd..ee9935828e2 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -2003,6 +2003,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_sweep_config(session, cfg));
WT_ERR(__wt_verbose_config(session, cfg));
+ /* Initialize the OS page size for mmap */
+ conn->page_size = __wt_get_vm_pagesize();
+
/* Now that we know if verbose is configured, output the version. */
WT_ERR(__wt_verbose(
session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index c6d5b535b86..0821238fbd7 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -147,12 +147,14 @@ __conn_dhandle_mark_dead(WT_SESSION_IMPL *session)
int
__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
{
+ WT_BM *bm;
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
bool marked_dead, no_schema_lock;
btree = S2BT(session);
+ bm = btree->bm;
dhandle = session->dhandle;
marked_dead = false;
@@ -191,7 +193,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
*/
if (!F_ISSET(btree,
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
- if (force && (btree->bm == NULL || btree->bm->map == NULL)) {
+ if (force && (bm == NULL || !bm->is_mapped(bm, session))) {
WT_ERR(__conn_dhandle_mark_dead(session));
marked_dead = true;
}
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index 63f77248ca8..b955b292292 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -455,14 +455,24 @@ __wt_curfile_create(WT_SESSION_IMPL *session,
}
/*
- * random_retrieval
- * Random retrieval cursors only support next, reset and close.
+ * Random retrieval, row-store only.
+ * Random retrieval cursors support a limited set of methods.
*/
WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
if (cval.val != 0) {
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(session, ENOTSUP,
+ "next_random configuration not supported for "
+ "column-store objects");
+
__wt_cursor_set_notsup(cursor);
cursor->next = __curfile_next_random;
cursor->reset = __curfile_reset;
+
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "next_random_sample_size", 0, &cval));
+ if (cval.val != 0)
+ cbt->next_random_sample_size = (u_int)cval.val;
}
/* Underlying btree initialization. */
diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c
index 8f858a5012f..3270be07de4 100644
--- a/src/cursor/cur_json.c
+++ b/src/cursor/cur_json.c
@@ -313,7 +313,6 @@ size_t
__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode)
{
char abbrev;
- u_char h;
if (!force_unicode) {
if (isprint(ch) && ch != '\\' && ch != '"') {
@@ -354,16 +353,8 @@ __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode)
*buf++ = 'u';
*buf++ = '0';
*buf++ = '0';
- h = (((u_char)ch) >> 4) & 0xF;
- if (h >= 10)
- *buf++ = 'A' + (h - 10);
- else
- *buf++ = '0' + h;
- h = ((u_char)ch) & 0xF;
- if (h >= 10)
- *buf++ = 'A' + (h - 10);
- else
- *buf++ = '0' + h;
+ *buf++ = __wt_hex[(ch & 0xf0) >> 4];
+ *buf++ = __wt_hex[ch & 0x0f];
}
return (6);
}
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index e1d5b8eb91a..652dec364fb 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -384,6 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session,
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ wt_off_t size;
const char *filename;
/*
@@ -395,8 +396,8 @@ __curstat_file_init(WT_SESSION_IMPL *session,
if (!WT_PREFIX_SKIP(filename, "file:"))
return (EINVAL);
__wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
- WT_RET(__wt_block_manager_size(
- session, filename, &cst->u.dsrc_stats));
+ WT_RET(__wt_block_manager_named_size(session, filename, &size));
+ cst->u.dsrc_stats.block_size = size;
__wt_curstat_dsrc_final(cst);
return (0);
}
@@ -662,7 +663,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
/*
* We return the statistics field's offset as the key, and a string
- * description, a string value, and a uint64_t value as the value
+ * description, a string value, and a uint64_t value as the value
* columns.
*/
cursor->key_format = "i";
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index f92426355ef..da38988b6c2 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -40,11 +40,11 @@ void
__wt_cursor_set_notsup(WT_CURSOR *cursor)
{
/*
- * Set all of the cursor methods (except for close and reset), to fail.
- * Close is unchanged so the cursor can be discarded, reset defaults to
+ * Set cursor methods other than close, reconfigure and reset, to fail.
+ * Close is unchanged so the cursor can be discarded; reset is set to
* a no-op because session transactional operations reset all of the
- * cursors in a session, and random cursors shouldn't block transactions
- * or checkpoints.
+ * cursors in a session. Reconfigure is left open in case it's possible
+ * in the future to change these configurations.
*/
cursor->compare =
(int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup;
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index dca72a16ee5..e746ccd5871 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -968,8 +968,11 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_strdup(session, tmp->data, &ctable->cfg[1]));
if (0) {
-err: WT_TRET(__curtable_close(cursor));
- *cursorp = NULL;
+err: if (*cursorp != NULL) {
+ WT_TRET(__wt_cursor_close(*cursorp));
+ *cursorp = NULL;
+ }
+ WT_TRET(__curtable_close(cursor));
}
__wt_scr_free(session, &tmp);
diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox
index 745c5051be3..e2b376d5e3f 100644
--- a/src/docs/command-line.dox
+++ b/src/docs/command-line.dox
@@ -32,7 +32,7 @@ on success and non-zero on error.
The \c wt tool supports several commands. If configured in the underlying
database, some commands will run recovery when opening the database. If
-the user wants to force recovery on any command, use the \c -r option.
+the user wants to force recovery on any command, use the \c -R option.
In general, commands that modify the database or tables will run recovery
by default and commands that only read data will not run recovery.
@@ -46,7 +46,7 @@ opened as a WiredTiger database. See @ref backup for more information,
and @ref file_permissions for specifics on the copied file permissions.
@subsection util_backup_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code>
@subsection util_backup_options Options
The following are command-specific options for the \c backup command:
@@ -64,7 +64,7 @@ The \c compact command attempts to rewrite the specified table or file
to consume less disk space.
@subsection util_compact_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] compact uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code>
@subsection util_compact_options Options
The \c compact command has no command-specific options.
@@ -78,7 +78,7 @@ configuration. It is equivalent to a call to WT_SESSION::create with
the specified string arguments.
@subsection util_create_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code>
@subsection util_create_options Options
The following are command-specific options for the \c create command:
@@ -94,7 +94,7 @@ The \c drop command drops the specified \c uri. It is equivalent to a
call to WT_SESSION::drop with the "force" configuration argument.
@subsection util_drop_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] drop uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] drop uri</code>
@subsection util_drop_options Options
The \c drop command has no command-specific options.
@@ -109,7 +109,7 @@ which can be re-loaded into a new table using the \c load command.
See @subpage dump_formats for details of the dump file formats.
@subsection util_dump_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code>
@subsection util_dump_options Options
The following are command-specific options for the \c dump command:
@@ -143,7 +143,7 @@ the database. If a URI is specified as an argument, only information about
that data source is printed.
@subsection util_list_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code>
@subsection util_list_options Options
The following are command-specific options for the \c list command:
@@ -170,7 +170,7 @@ table will be overwritten by the new data (use the \c -n option to
make an attempt to overwrite existing data return an error).
@subsection util_load_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code>
@subsection util_load_options Options
The following are command-specific options for the \c load command:
@@ -244,7 +244,7 @@ row-store table or file already exists, data in the table or file will
be overwritten by the new data.
@subsection util_loadtext_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code>
@subsection util_loadtext_options Options
The following are command-specific options for the \c loadtext command:
@@ -260,7 +260,7 @@ Display the database log.
The \c printlog command outputs the database log.
@subsection util_printlog_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] printlog [-p] [-f output]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] printlog [-x] [-f output]</code>
@subsection util_printlog_options Options
The following are command-specific options for the \c printlog command:
@@ -269,8 +269,9 @@ The following are command-specific options for the \c printlog command:
By default, the \c printlog command output is written to the standard
output; the \c -f option re-directs the output to the specified file.
-@par <code>-p</code>
-Display the log in a printable format.
+@par <code>-x</code>
+Keys and value items in the log are printed in hex format in addition
+to the default string format.
<hr>
@section util_read wt read
@@ -283,7 +284,7 @@ with string or record number keys and string values.
The \c read command exits non-zero if a specified record is not found.
@subsection util_read_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code>
@subsection util_read_options Options
The \c read command has no command-specific options.
@@ -295,7 +296,7 @@ Rename a table or file.
The \c rename command renames the specified table or file.
@subsection util_rename_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code>
@subsection util_rename_options Options
The \c rename command has no command-specific options.
@@ -309,7 +310,7 @@ data that cannot be recovered. Underlying files are re-written in
place, overwriting the original file contents.
@subsection util_salvage_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code>
@subsection util_salvage_options Options
The following are command-specific options for the \c salvage command:
@@ -327,7 +328,7 @@ The \c stat command outputs run-time statistics for the WiredTiger
engine, or, if specified, for the URI on the command-line.
@subsection util_stat_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code>
@subsection util_stat_options Options
The following are command-specific options for the \c stat command:
@@ -345,7 +346,7 @@ success if the data source is up-to-date, and failure if the data source
cannot be upgraded.
@subsection util_upgrade_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code>
@subsection util_upgrade_options Options
The \c upgrade command has no command-specific options.
@@ -359,7 +360,7 @@ success if the data source is correct, and failure if the data source is
corrupted.
@subsection util_verify_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] verify uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code>
@subsection util_verify_options Options
The \c verify command has no command-specific options.
@@ -381,9 +382,9 @@ Attempting to overwrite an already existing record will fail.
@subsection util_write_synopsis Synopsis
<code>
-wt [-rVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ...
+wt [-RVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ...
<br>
-wt [-rVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ...
+wt [-RVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ...
</code>
@subsection util_write_options Options
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index 446981e3192..a0a3212be6d 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -2,6 +2,27 @@
The \c next_random configuration to the WT_SESSION::open_cursor method
configures the cursor to return a pseudo-random record from a row-store
-object (the configuration is not supported on other types of objects).
+object (the \c next_random configuration is not supported on other types
+of objects).
+
+Applications should use the WT_CURSOR::next method to retrieve records
+from the object; most other cursor methods are not supported. For
+example, it's not possible to update using a cursor configured for
+random retrieval.
+
+By default, each returned record is pseudo-randomly selected from the
+underlying object as a whole. That can lead to skewed results when the
+underlying tree structure is unbalanced or records are not uniformly
+distributed. In such cases, the \c next_random_sample_size configuration
+can also be specified. Setting \c next_random_sample_size configures the
+number of samples the application expects to take using the cursor. A
+cursor configured using \c next_random_sample_size divides the object
+into \c next_random_sample_size pieces, and each subsequent retrieval
+returns a record from the next one of those pieces.
+
+For example, setting \c next_random_sample_size to \c 10 would cause
+the cursor to sequentially return records from each tenth part of the
+object. Setting \c next_random_sample_size to \c 1000 would cause the
+cursor to sequentially return records from each 0.1% of the object.
*/
diff --git a/src/docs/license.dox b/src/docs/license.dox
index f34ebad19a7..febced2c6af 100644
--- a/src/docs/license.dox
+++ b/src/docs/license.dox
@@ -13,6 +13,19 @@ WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the
<b>GNU General Public License</b></a> for details.
+Additionally, portions of the WiredTiger distribution are distributed
+under the terms of the
+<a href="http://www.opensource.org/licenses/BSD-3-Clause">
+BSD-3-Clause License</a>. These files have
+<a href="http://www.opensource.org/licenses/BSD-3-Clause">
+BSD-3-Clause License</a>
+copyright notices, and may be freely used and redistributed under the
+terms of that notice.
+
+Additionally, portions of the WiredTiger distribution are public domain
+software. Public domain files have notices releasing the software into
+the public domain and may be freely used and redistributed.
+
For a license to use the WiredTiger software under conditions other than
those described above, or for technical support for this software, please
contact MongoDB, Inc. at
@@ -28,7 +41,7 @@ of the WiredTiger library should comply with these copyrights.
@hrow{Distribution Files, Copyright Holder, License}
@row{\c src/include/bitstring.i, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
@row{\c src/include/queue.h, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
-@row{\c src/os_posix/getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
+@row{\c src/os_posix/os_getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
@row{\c src/support/hash_city.c, Google\, Inc., <a href="http://www.opensource.org/licenses/MIT">The MIT License</a>}
@row{\c src/support/hash_fnv.c, Authors, Public Domain}
</table>
@@ -63,10 +76,4 @@ selected portions of the WiredTiger sources, please review the copyright
notices and LICENSE files included in the WiredTiger distribution for
the terms and conditions of such redistribution.
-@section license_public_domain Public domain software
-
-Many portions of the WiredTiger distribution are public domain software.
-Public domain files have notices releasing the software into the public
-domain and may be freely used and redistributed.
-
*/
diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox
index f3bdd64cfda..339bf740265 100644
--- a/src/docs/wtperf.dox
+++ b/src/docs/wtperf.dox
@@ -206,6 +206,8 @@ if non zero choose a value from within this range as the key for
insert operations
@par random_value (boolean, default=false)
generate random content for the value
+@par read_range (unsigned int, default=0)
+scan a range of keys after each search
@par reopen_connection (boolean, default=true)
close and reopen the connection between populate and workload phases
@par report_interval (unsigned int, default=2)
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 2b2117ad9fd..c5f6ae3d4d1 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -31,8 +31,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/* Walk the tree, discarding pages. */
next_ref = NULL;
- WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
- WT_READ_CACHE | WT_READ_NO_EVICT));
+ WT_ERR(__wt_tree_walk(
+ session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
while ((ref = next_ref) != NULL) {
page = ref->page;
@@ -68,8 +68,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* the reconciliation, the next walk call could miss a page in
* the tree.
*/
- WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
- WT_READ_CACHE | WT_READ_NO_EVICT));
+ WT_ERR(__wt_tree_walk(session,
+ &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
switch (syncop) {
case WT_SYNC_CLOSE:
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index a8979fa6231..0e2b33c35ec 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1229,7 +1229,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
*/
for (evict = start, pages_walked = 0;
evict < end && !enough && (ret == 0 || ret == WT_NOTFOUND);
- ret = __wt_tree_walk(
+ ret = __wt_tree_walk_count(
session, &btree->evict_ref, &pages_walked, walk_flags)) {
enough = pages_walked > cache->evict_max_refs_per_file;
if ((ref = btree->evict_ref) == NULL) {
@@ -1336,8 +1336,9 @@ fast: /* If the page can't be evicted, give up. */
if (__wt_ref_is_root(ref))
WT_RET(__evict_clear_walk(session));
else if (ref->page->read_gen == WT_READGEN_OLDEST)
- WT_RET_NOTFOUND_OK(__wt_tree_walk(session,
- &btree->evict_ref, &pages_walked, walk_flags));
+ WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
+ session, &btree->evict_ref,
+ &pages_walked, walk_flags));
}
WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
@@ -1617,7 +1618,7 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
next_walk = NULL;
session->dhandle = dhandle;
- while (__wt_tree_walk(session, &next_walk, NULL,
+ while (__wt_tree_walk(session, &next_walk,
WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
next_walk != NULL) {
page = next_walk->page;
diff --git a/src/include/block.h b/src/include/block.h
index 4bff6c82783..804eec24874 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -173,6 +173,7 @@ struct __wt_bm {
int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*read)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
@@ -182,6 +183,7 @@ struct __wt_bm {
int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
int (*salvage_valid)
(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
+ int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *);
int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
@@ -244,7 +246,10 @@ struct __wt_block {
bool ckpt_inprogress;/* Live checkpoint in progress */
/* Compaction support */
- int compact_pct_tenths; /* Percent to compact */
+ int compact_pct_tenths; /* Percent to compact */
+ uint64_t compact_pages_reviewed;/* Pages reviewed */
+ uint64_t compact_pages_skipped; /* Pages skipped */
+ uint64_t compact_pages_written; /* Pages rewritten */
/* Salvage support */
wt_off_t slvg_off; /* Salvage file offset */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 6ee74c61a38..12a736c56a2 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -478,7 +478,7 @@ struct __wt_page {
#define pg_row_ins u.row.ins
#undef pg_row_upd
#define pg_row_upd u.row.upd
-#define pg_row_entries u.row.entries
+#undef pg_row_entries
#define pg_row_entries u.row.entries
/* Fixed-length column-store leaf page. */
@@ -1049,7 +1049,7 @@ struct __wt_insert_head {
uint64_t __prev_split_gen = (session)->split_gen; \
if (__prev_split_gen == 0) \
do { \
- WT_PUBLISH((session)->split_gen, \
+ WT_PUBLISH((session)->split_gen, \
S2C(session)->split_gen); \
} while ((session)->split_gen != S2C(session)->split_gen)
diff --git a/src/include/btree.i b/src/include/btree.i
index 3e2e7158e04..23e0dfea2cd 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -948,9 +948,8 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
* __wt_ref_info --
* Return the addr/size and type triplet for a reference.
*/
-static inline int
-__wt_ref_info(WT_SESSION_IMPL *session,
- WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
+static inline void
+__wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
{
WT_ADDR *addr;
WT_CELL_UNPACK *unpack, _unpack;
@@ -984,7 +983,9 @@ __wt_ref_info(WT_SESSION_IMPL *session,
case WT_ADDR_LEAF_NO:
*typep = WT_CELL_ADDR_LEAF_NO;
break;
- WT_ILLEGAL_VALUE(session);
+ default:
+ *typep = 0;
+ break;
}
} else {
__wt_cell_unpack((WT_CELL *)addr, unpack);
@@ -993,7 +994,6 @@ __wt_ref_info(WT_SESSION_IMPL *session,
if (typep != NULL)
*typep = unpack->type;
}
- return (0);
}
/*
@@ -1009,7 +1009,7 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
if (ref->addr == NULL)
return (0);
- WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __wt_ref_info(ref, &addr, &addr_size, NULL);
WT_RET(__wt_btree_block_free(session, addr, addr_size));
/* Clear the address (so we don't free it twice). */
diff --git a/src/include/column.i b/src/include/column.i
index fc1f372b2a9..9388e07d0d8 100644
--- a/src/include/column.i
+++ b/src/include/column.i
@@ -176,6 +176,16 @@ __col_insert_search(WT_INSERT_HEAD *inshead,
continue;
}
+ /*
+ * When no exact match is found, the search returns the smallest
+ * key larger than the searched-for key, or the largest key
+ * smaller than the searched-for key, if there is no larger key.
+ * Our callers depend on that: specifically, the fixed-length
+ * column store cursor code interprets returning a key smaller
+ * than the searched-for key to mean the searched-for key is
+ * larger than any key on the page. Don't change that behavior,
+ * things will break.
+ */
ins_recno = WT_INSERT_RECNO(ret_ins);
cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
@@ -282,7 +292,17 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop)
start_recno = repeat->recno + repeat->rle;
}
- if (recno >= start_recno + (page->pg_var_entries - start_indx))
+ /*
+ * !!!
+ * The test could be written more simply as:
+ *
+ * (recno >= start_recno + (page->pg_var_entries - start_indx))
+ *
+ * It's split into two parts because the simpler test will overflow if
+ * searching for large record numbers.
+ */
+ if (recno >= start_recno &&
+ recno - start_recno >= page->pg_var_entries - start_indx)
return (NULL);
return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
diff --git a/src/include/connection.h b/src/include/connection.h
index 2367f5a0035..1c1cb9b8987 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -415,6 +415,7 @@ struct __wt_connection_impl {
uint32_t direct_io;
uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */
bool mmap; /* mmap configuration */
+ int page_size; /* OS page size for mmap alignment */
uint32_t verbose;
uint32_t flags;
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 43bbfcf5b05..4f232ce4fd0 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -104,6 +104,14 @@ struct __wt_cursor_btree {
uint64_t recno; /* Record number */
/*
+ * Next-random cursors can optionally be configured to step through a
+ * percentage of the total leaf pages to their next value. Note the
+ * configured value and the calculated number of leaf pages to skip.
+ */
+ uint64_t next_random_leaf_skip;
+ u_int next_random_sample_size;
+
+ /*
* The search function sets compare to:
* < 1 if the found key is less than the specified key
* 0 if the found key matches the specified key
@@ -192,18 +200,23 @@ struct __wt_cursor_btree {
uint8_t append_tree; /* Cursor appended to the tree */
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that cursor next/prev never returns keys out-of-order. */
+ WT_ITEM *lastkey, _lastkey;
+ uint64_t lastrecno;
+#endif
+
#define WT_CBT_ACTIVE 0x01 /* Active in the tree */
#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */
#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
-#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
-#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor
+#define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor
(e.g. on a checkpoint) */
-#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */
+#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
- WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST)
+ WT_CBT_SEARCH_SMALLEST)
uint8_t flags;
};
diff --git a/src/include/extern.h b/src/include/extern.h
index bd32e067a58..7338f8dae3b 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -51,7 +51,8 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const
extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
-extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats);
+extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep);
+extern int __wt_block_manager_named_size( WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep);
extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
@@ -91,6 +92,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt);
+extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next);
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt);
@@ -167,9 +169,11 @@ extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok);
extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf);
-extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
+extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags);
+extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
+extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags);
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove);
-extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate);
@@ -184,7 +188,8 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert);
-extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
extern int __wt_las_create(WT_SESSION_IMPL *session);
extern int __wt_las_destroy(WT_SESSION_IMPL *session);
@@ -360,23 +365,23 @@ extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const
extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep);
extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value);
extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep);
-extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno);
extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop);
-extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop);
extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp);
-extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value);
extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep);
-extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key);
extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp);
-extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode);
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
-extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
-extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced);
extern int __wt_log_slot_new(WT_SESSION_IMPL *session);
@@ -466,7 +471,7 @@ extern int __wt_meta_track_init(WT_SESSION_IMPL *session);
extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session);
extern int __wt_turtle_init(WT_SESSION_IMPL *session);
extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep);
-extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value);
extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
@@ -512,6 +517,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
extern int __wt_once(void (*init_routine)(void));
extern int __wt_open(WT_SESSION_IMPL *session, const char *name, bool ok_create, bool exclusive, int dio_type, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
+extern int __wt_get_vm_pagesize(void);
extern bool __wt_absolute_path(const char *path);
extern const char *__wt_path_separator(void);
extern bool __wt_has_priv(void);
@@ -653,6 +659,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
);
extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void __wt_hazard_close(WT_SESSION_IMPL *session);
+extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp);
extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
extern int __wt_hex2byte(const u_char *from, u_char *to);
@@ -670,6 +677,7 @@ extern uint32_t __wt_log2_int(uint32_t n);
extern bool __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
+extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state);
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4)));
@@ -731,7 +739,7 @@ extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t
extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp);
extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
-extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags);
extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval);
diff --git a/src/include/flags.h b/src/include/flags.h
index 064349125cc..bafff92fbc0 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -45,8 +45,9 @@
#define WT_READ_NO_WAIT 0x00000020
#define WT_READ_PREV 0x00000040
#define WT_READ_SKIP_INTL 0x00000080
-#define WT_READ_TRUNCATE 0x00000100
-#define WT_READ_WONT_NEED 0x00000200
+#define WT_READ_SKIP_LEAF 0x00000100
+#define WT_READ_TRUNCATE 0x00000200
+#define WT_READ_WONT_NEED 0x00000400
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_INTERNAL 0x00000004
diff --git a/src/include/gcc.h b/src/include/gcc.h
index 01e33792d73..bb80f8b738b 100644
--- a/src/include/gcc.h
+++ b/src/include/gcc.h
@@ -156,8 +156,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
#if defined(x86_64) || defined(__x86_64__)
/* Pause instruction to prevent excess processor bus usage */
-#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
-
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
#define WT_FULL_BARRIER() do { \
__asm__ volatile ("mfence" ::: "memory"); \
} while (0)
@@ -169,7 +168,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
} while (0)
#elif defined(i386) || defined(__i386__)
-#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
#define WT_FULL_BARRIER() do { \
__asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \
} while (0)
@@ -177,23 +176,58 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
#elif defined(__PPC64__) || defined(PPC64)
+/* ori 0,0,0 is the PPC64 noop instruction */
#define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory")
-#define WT_FULL_BARRIER() do {
+#define WT_FULL_BARRIER() do { \
__asm__ volatile ("sync" ::: "memory"); \
} while (0)
-#define WT_READ_BARRIER() WT_FULL_BARRIER()
-#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+/* TODO: ISA 2.07 Elemental Memory Barriers would be better,
+ specifically mbll and mbss, but they are not supported by POWER 8 */
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("lwsync" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("lwsync" ::: "memory"); \
+} while (0)
#elif defined(__aarch64__)
#define WT_PAUSE() __asm__ volatile("yield" ::: "memory")
#define WT_FULL_BARRIER() do { \
- __asm__ volatile ("dsb sy" ::: "memory"); \
+ __asm__ volatile ("dsb sy" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("dsb ld" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("dsb st" ::: "memory"); \
+} while (0)
+
+#elif defined(__s390x__)
+#define WT_PAUSE() __asm__ volatile("lr 0,0" ::: "memory")
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("bcr 15,0\n" ::: "memory"); \
} while (0)
+#define WT_READ_BARRIER() WT_FULL_BARRIER()
+#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+#elif defined(__sparc__)
+#define WT_PAUSE() __asm__ volatile("rd %%ccr, %%g0" ::: "memory")
+
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("membar #StoreLoad" ::: "memory"); \
+} while (0)
+
+/*
+ * UltraSparc machines use TSO (total store order), so no membar is needed:
+ * READ_BARRIER (#LoadLoad) and WRITE_BARRIER (#StoreStore) are no-ops.
+ */
#define WT_READ_BARRIER() do { \
- __asm__ volatile ("dsb ld" ::: "memory"); \
+ __asm__ volatile ("" ::: "memory"); \
} while (0)
+
#define WT_WRITE_BARRIER() do { \
- __asm__ volatile ("dsb st" ::: "memory"); \
+ __asm__ volatile ("" ::: "memory"); \
} while (0)
#else
diff --git a/src/include/log.h b/src/include/log.h
index 521de567fc0..e7737e12663 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -267,6 +267,11 @@ struct __wt_log_desc {
};
/*
+ * Flags for __wt_txn_op_printlog.
+ */
+#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */
+
+/*
* WT_LOG_REC_DESC --
* A descriptor for a log record type.
*/
diff --git a/src/include/misc.h b/src/include/misc.h
index e542baec642..898e44eb8e0 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -268,3 +268,6 @@ union __wt_rand_state {
uint32_t w, z;
} x;
};
+
+/* Shared array for converting to hex */
+extern const u_char __wt_hex[];
diff --git a/src/include/session.h b/src/include/session.h
index 5c3bcfb8ed0..1eca49f2c40 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -74,7 +74,10 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
TAILQ_HEAD(__cursors, __wt_cursor) cursors;
WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
- WT_COMPACT *compact; /* Compact state */
+
+ WT_COMPACT *compact; /* Compaction information */
+ enum { WT_COMPACT_NONE=0,
+ WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;
/*
* Lookaside table cursor, sweep and eviction worker threads only.
@@ -134,8 +137,6 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
void *reconcile; /* Reconciliation support */
int (*reconcile_cleanup)(WT_SESSION_IMPL *);
- bool compaction; /* Compaction did some work */
-
uint32_t flags;
/*
diff --git a/src/include/stat.h b/src/include/stat.h
index dfe7ee5c6cd..a554607b7d5 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -139,8 +139,8 @@ __wt_stats_clear(void *stats_arg, int slot)
*/
#define WT_STAT_READ(stats, fld) \
__wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
-#define WT_STAT_WRITE(session, stats, fld) \
- ((stats)[WT_STATS_SLOT_ID(session)]->fld);
+#define WT_STAT_WRITE(stats, fld, v) \
+ (stats)->fld = (int64_t)(v)
#define WT_STAT_DECRV(session, stats, fld, value) \
(stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value)
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 08f73386090..bdd8bb65910 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -896,18 +896,17 @@ struct __wt_session {
* boolean flag; default \c false.}
* @config{bulk, configure the cursor for bulk-loading\, a fast\,
* initial load path (see @ref tune_bulk_load for more information).
- * Bulk-load may only be used for newly created objects and cursors
- * configured for bulk-load only support the WT_CURSOR::insert and
- * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys
- * must be loaded in sorted order. The value is usually a true/false
- * flag; when bulk-loading fixed-length column store objects\, the
- * special value \c bitmap allows chunks of a memory resident bitmap to
- * be loaded directly into a file by passing a \c WT_ITEM to
- * WT_CURSOR::set_value where the \c size field indicates the number of
- * records in the bitmap (as specified by the object's \c value_format
- * configuration). Bulk-loaded bitmap values must end on a byte boundary
- * relative to the bit count (except for the last set of values
- * loaded)., a string; default \c false.}
+ * Bulk-load may only be used for newly created objects and applications
+ * should use the WT_CURSOR::insert method to insert rows. When
+ * bulk-loading\, rows must be loaded in sorted order. The value is
+ * usually a true/false flag; when bulk-loading fixed-length column
+ * store objects\, the special value \c bitmap allows chunks of a memory
+ * resident bitmap to be loaded directly into a file by passing a \c
+ * WT_ITEM to WT_CURSOR::set_value where the \c size field indicates the
+ * number of records in the bitmap (as specified by the object's \c
+ * value_format configuration). Bulk-loaded bitmap values must end on a
+ * byte boundary relative to the bit count (except for the last set of
+ * values loaded)., a string; default \c false.}
* @config{checkpoint, the name of a checkpoint to open (the reserved
* name "WiredTigerCheckpoint" opens the most recent internal checkpoint
* taken for the object). The cursor does not support data
@@ -921,10 +920,19 @@ struct __wt_session {
* string\, chosen from the following options: \c "hex"\, \c "json"\, \c
* "print"; default empty.}
* @config{next_random, configure the cursor to return a pseudo-random
- * record from the object; valid only for row-store cursors. Cursors
- * configured with \c next_random=true only support the WT_CURSOR::next
- * and WT_CURSOR::close methods. See @ref cursor_random for details., a
- * boolean flag; default \c false.}
+ * record from the object when the WT_CURSOR::next method is called;
+ * valid only for row-store cursors. See @ref cursor_random for
+ * details., a boolean flag; default \c false.}
+ * @config{next_random_sample_size, cursors configured by \c next_random
+ * to return pseudo-random records from the object randomly select from
+ * the entire object\, by default. Setting \c next_random_sample_size
+ * to a non-zero value sets the number of samples the application
+ * expects to take using the \c next_random cursor. A cursor configured
+ * with both \c next_random and \c next_random_sample_size attempts to
+ * divide the object into \c next_random_sample_size equal-sized
+ * pieces\, and each retrieval returns a record from one of those
+ * pieces. See @ref cursor_random for details., a string; default \c
+ * 0.}
* @config{overwrite, configures whether the cursor's insert\, update
* and remove methods check the existing state of the record. If \c
* overwrite is \c false\, WT_CURSOR::insert fails with
diff --git a/src/log/log_auto.c b/src/log/log_auto.c
index 5a1d03b1976..54df01d01ab 100644
--- a/src/log/log_auto.c
+++ b/src/log/log_auto.c
@@ -69,7 +69,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src,
}
static int
-__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
{
size_t needed;
@@ -79,6 +79,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
return (0);
}
+static int /* Hex counterpart of __logrec_make_json_str: fills *destp from item's bytes. */
+__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+{
+ size_t needed; /* Bytes requested for the destination buffer. */
+
+ needed = item->size * 2 + 1; /* Presumably two hex digits per byte plus a NUL -- confirm against __wt_fill_hex. */
+ WT_RET(__wt_realloc(session, NULL, needed, destp)); /* (Re)size *destp; the print callers reuse one buffer and free it at their err label. */
+ __wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL); /* NULL lenp: the written length is not used here. */
+ return (0);
+}
+
int
__wt_logop_col_put_pack(
WT_SESSION_IMPL *session, WT_ITEM *logrec,
@@ -121,7 +132,8 @@ __wt_logop_col_put_unpack(
int
__wt_logop_col_put_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -138,9 +150,14 @@ __wt_logop_col_put_print(
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
WT_ERR(__wt_fprintf(out,
" \"recno\": \"%" PRIu64 "\",\n", recno));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &value));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(out,
" \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -188,11 +205,13 @@ __wt_logop_col_remove_unpack(
int
__wt_logop_col_remove_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t fileid;
uint64_t recno;
+ WT_UNUSED(flags);
WT_RET(__wt_logop_col_remove_unpack(
session, pp, end, &fileid, &recno));
@@ -246,12 +265,14 @@ __wt_logop_col_truncate_unpack(
int
__wt_logop_col_truncate_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t fileid;
uint64_t start;
uint64_t stop;
+ WT_UNUSED(flags);
WT_RET(__wt_logop_col_truncate_unpack(
session, pp, end, &fileid, &start, &stop));
@@ -307,7 +328,8 @@ __wt_logop_row_put_unpack(
int
__wt_logop_row_put_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -322,12 +344,22 @@ __wt_logop_row_put_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &key));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(out,
" \"key\": \"%s\",\n", escaped));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &value));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(out,
+ " \"key-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(out,
" \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -375,7 +407,8 @@ __wt_logop_row_remove_unpack(
int
__wt_logop_row_remove_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -389,9 +422,14 @@ __wt_logop_row_remove_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &key));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(out,
" \"key\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"key-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -439,7 +477,8 @@ __wt_logop_row_truncate_unpack(
int
__wt_logop_row_truncate_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -455,12 +494,22 @@ __wt_logop_row_truncate_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &start));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &start));
WT_ERR(__wt_fprintf(out,
" \"start\": \"%s\",\n", escaped));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &stop));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &start));
+ WT_ERR(__wt_fprintf(out,
+ " \"start-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &stop));
WT_ERR(__wt_fprintf(out,
" \"stop\": \"%s\",\n", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &stop));
+ WT_ERR(__wt_fprintf(out,
+ " \"stop-hex\": \"%s\",\n", escaped));
+ }
WT_ERR(__wt_fprintf(out,
" \"mode\": \"%" PRIu32 "\"", mode));
@@ -470,7 +519,8 @@ err: __wt_free(session, escaped);
int
__wt_txn_op_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t optype, opsize;
@@ -480,27 +530,33 @@ __wt_txn_op_printlog(
switch (optype) {
case WT_LOGOP_COL_PUT:
- WT_RET(__wt_logop_col_put_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_put_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_COL_REMOVE:
- WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_remove_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_COL_TRUNCATE:
- WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_truncate_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_PUT:
- WT_RET(__wt_logop_row_put_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_put_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_REMOVE:
- WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_remove_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_TRUNCATE:
- WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_truncate_print(session, pp, end, out,
+ flags));
break;
WT_ILLEGAL_VALUE(session);
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index c1eb7a2a389..7c53990a2a2 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -91,7 +91,7 @@ __curstat_lsm_init(
* top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- new->lsm_generation_max = chunk->generation;
+ WT_STAT_WRITE(new, lsm_generation_max, chunk->generation);
/* Aggregate statistics from each new chunk. */
__wt_stat_dsrc_aggregate_single(new, stats);
@@ -115,37 +115,40 @@ __curstat_lsm_init(
* into the top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- new->bloom_size =
- (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8);
- new->bloom_page_evict =
- new->cache_eviction_clean + new->cache_eviction_dirty;
- new->bloom_page_read = new->cache_read;
+ WT_STAT_WRITE(new, bloom_size,
+ (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8));
+ WT_STAT_WRITE(new, bloom_page_evict,
+ new->cache_eviction_clean + new->cache_eviction_dirty);
+ WT_STAT_WRITE(new, bloom_page_read, new->cache_read);
__wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
/* Set statistics that aren't aggregated directly into the cursor */
- stats->bloom_count = bloom_count;
- stats->lsm_chunk_count = lsm_tree->nchunks;
+ WT_STAT_WRITE(stats, bloom_count, bloom_count);
+ WT_STAT_WRITE(stats, lsm_chunk_count, lsm_tree->nchunks);
/* Include, and optionally clear, LSM-level specific information. */
- stats->bloom_miss = lsm_tree->bloom_miss;
+ WT_STAT_WRITE(stats, bloom_miss, lsm_tree->bloom_miss);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_miss = 0;
- stats->bloom_hit = lsm_tree->bloom_hit;
+ WT_STAT_WRITE(stats, bloom_hit, lsm_tree->bloom_hit);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_hit = 0;
- stats->bloom_false_positive = lsm_tree->bloom_false_positive;
+ WT_STAT_WRITE(
+ stats, bloom_false_positive, lsm_tree->bloom_false_positive);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_false_positive = 0;
- stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom;
+ WT_STAT_WRITE(
+ stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_lookup_no_bloom = 0;
- stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle;
+ WT_STAT_WRITE(
+ stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_checkpoint_throttle = 0;
- stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle;
+ WT_STAT_WRITE(stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_merge_throttle = 0;
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
index 13e8b31916f..3bd57846862 100644
--- a/src/meta/meta_turtle.c
+++ b/src/meta/meta_turtle.c
@@ -271,8 +271,7 @@ err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_READ));
* Update the turtle file.
*/
int
-__wt_turtle_update(
- WT_SESSION_IMPL *session, const char *key, const char *value)
+__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
{
WT_FH *fh;
WT_DECL_ITEM(buf);
diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c
index e95ccb0ade2..4276c89dbcf 100644
--- a/src/os_posix/os_map.c
+++ b/src/os_posix/os_map.c
@@ -48,8 +48,6 @@ __wt_mmap(WT_SESSION_IMPL *session,
return (0);
}
-#define WT_VM_PAGESIZE 4096
-
/*
* __wt_mmap_preload --
* Cause a section of a memory map to be faulted in.
@@ -59,9 +57,10 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
/* Linux requires the address be aligned to a 4KB boundary. */
+ WT_CONNECTION_IMPL *conn = S2C(session);
WT_BM *bm = S2BT(session)->bm;
WT_DECL_RET;
- void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
size += WT_PTRDIFF(p, blk);
/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
@@ -78,9 +77,9 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
* Manual pages aren't clear on whether alignment is required for the
* size, so we will be conservative.
*/
- size &= ~(size_t)(WT_VM_PAGESIZE - 1);
+ size &= ~(size_t)(conn->page_size - 1);
- if (size > WT_VM_PAGESIZE &&
+ if (size > (size_t)conn->page_size &&
(ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
WT_RET_MSG(session, ret, "posix_madvise will need");
#else
@@ -101,8 +100,9 @@ __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
/* Linux requires the address be aligned to a 4KB boundary. */
+ WT_CONNECTION_IMPL *conn = S2C(session);
WT_DECL_RET;
- void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
size += WT_PTRDIFF(p, blk);
if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0)
diff --git a/src/os_posix/os_pagesize.c b/src/os_posix/os_pagesize.c
new file mode 100644
index 00000000000..e7c7b4fdf15
--- /dev/null
+++ b/src/os_posix/os_pagesize.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_get_vm_pagesize --
+ * Return the default page size of a virtual memory page.
+ */
+int
+__wt_get_vm_pagesize(void)
+{
+ return (getpagesize());
+}
diff --git a/src/os_win/os_pagesize.c b/src/os_win/os_pagesize.c
new file mode 100644
index 00000000000..55cd6a694ec
--- /dev/null
+++ b/src/os_win/os_pagesize.c
@@ -0,0 +1,23 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_get_vm_pagesize --
+ * Return the default page size of a virtual memory page.
+ */
+int
+__wt_get_vm_pagesize(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+
+ return (system_info.dwPageSize);
+}
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index fd2aec45115..2b07117f9d5 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1276,6 +1276,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
upd->next = append;
+ __wt_cache_page_inmem_incr(
+ session, page, WT_UPDATE_MEMSIZE(append));
}
/*
@@ -1756,7 +1758,7 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
* Figure out the maximum leaf page size for the reconciliation.
*/
static inline uint32_t
-__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
WT_BTREE *btree;
WT_PAGE *page;
@@ -3263,7 +3265,14 @@ supd_check_complete:
memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
bnd->cksum = __wt_cksum(buf->data, buf->size);
- if (mod->rec_result == WT_PM_REC_MULTIBLOCK &&
+ /*
+ * One last check: don't reuse blocks if compacting, because the
+ * reason for compaction is to move blocks to different locations.
+ * We do this check after calculating the checksums so that,
+ * hopefully, the next write can be skipped.
+ */
+ if (session->compact_state == WT_COMPACT_NONE &&
+ mod->rec_result == WT_PM_REC_MULTIBLOCK &&
mod->mod_multi_entries > bnd_slot) {
multi = &mod->mod_multi[bnd_slot];
if (multi->size == bnd->size &&
@@ -4465,7 +4474,7 @@ compare: /*
WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
if (upd == NULL)
continue;
- for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n;) {
/*
* The application may have inserted records which left
* gaps in the name space, and these gaps can be huge.
@@ -4505,7 +4514,7 @@ compare: /*
last->size == size &&
memcmp(last->data, data, size) == 0)) {
++rle;
- continue;
+ goto next;
}
WT_ERR(__rec_col_var_helper(session, r,
salvage, last, last_deleted, 0, rle));
@@ -4524,6 +4533,15 @@ compare: /*
}
last_deleted = deleted;
rle = 1;
+
+ /*
+ * Move to the next record. It's not a simple increment
+ * because if it's the maximum record, incrementing it
+ * wraps to 0 and this turns into an infinite loop.
+ */
+next: if (src_recno == UINT64_MAX)
+ break;
+ ++src_recno;
}
}
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 053f69ee7f8..f0d0f26db54 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -148,7 +148,7 @@ __session_close(WT_SESSION *wt_session, const char *config)
* via the registered close callback.
*/
if (session->event_handler->handle_close != NULL &&
- !WT_STREQ(cursor->uri, WT_LAS_URI))
+ !WT_STREQ(cursor->internal_uri, WT_LAS_URI))
WT_TRET(session->event_handler->handle_close(
session->event_handler, wt_session, cursor));
WT_TRET(cursor->close(cursor));
diff --git a/src/session/session_compact.c b/src/session/session_compact.c
index 456fcd3ce03..8a5b741c0c5 100644
--- a/src/session/session_compact.c
+++ b/src/session/session_compact.c
@@ -172,12 +172,12 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
for (i = 0; i < 100; ++i) {
WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
- session->compaction = false;
+ session->compact_state = WT_COMPACT_RUNNING;
WT_WITH_SCHEMA_LOCK(session,
ret = __wt_schema_worker(
session, uri, __wt_compact, NULL, cfg, 0));
WT_ERR(ret);
- if (!session->compaction)
+ if (session->compact_state != WT_COMPACT_SUCCESS)
break;
WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
@@ -185,7 +185,9 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
WT_ERR(__session_compact_check_timeout(session, start_time));
}
-err: __wt_scr_free(session, &t);
+err: session->compact_state = WT_COMPACT_NONE;
+
+ __wt_scr_free(session, &t);
return (ret);
}
diff --git a/src/support/global.c b/src/support/global.c
index 1e32f5b4453..2330a65a707 100644
--- a/src/support/global.c
+++ b/src/support/global.c
@@ -12,28 +12,6 @@ WT_PROCESS __wt_process; /* Per-process structure */
static int __wt_pthread_once_failed; /* If initialization failed */
/*
- * __system_is_little_endian --
- * Check if the system is little endian.
- */
-static int
-__system_is_little_endian(void)
-{
- uint64_t v;
- bool little;
-
- v = 1;
- little = *((uint8_t *)&v) != 0;
-
- if (little)
- return (0);
-
- fprintf(stderr,
- "This release of the WiredTiger data engine does not support "
- "big-endian systems; contact WiredTiger for more information.\n");
- return (EINVAL);
-}
-
-/*
* __wt_global_once --
* Global initialization, run once.
*/
@@ -42,11 +20,6 @@ __wt_global_once(void)
{
WT_DECL_RET;
- if ((ret = __system_is_little_endian()) != 0) {
- __wt_pthread_once_failed = ret;
- return;
- }
-
if ((ret =
__wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) {
__wt_pthread_once_failed = ret;
@@ -115,7 +88,7 @@ __wt_attach(WT_SESSION_IMPL *session)
/* Sleep forever, the debugger will interrupt us when it attaches. */
for (;;)
- __wt_sleep(100, 0);
+ __wt_sleep(10, 0);
#else
WT_UNUSED(session);
#endif
diff --git a/src/support/hash_city.c b/src/support/hash_city.c
index 9a4a6464f40..33f4113c004 100644
--- a/src/support/hash_city.c
+++ b/src/support/hash_city.c
@@ -99,6 +99,12 @@ static uint32_t UNALIGNED_LOAD32(const char *p) {
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)
+#elif defined(__sun)
+
+#include <sys/byteorder.h>
+#define bswap_32 BSWAP_32
+#define bswap_64 BSWAP_64
+
#else
#include <byteswap.h>
#endif
diff --git a/src/support/hex.c b/src/support/hex.c
index eb9f420911a..5fb8d4bc190 100644
--- a/src/support/hex.c
+++ b/src/support/hex.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static const u_char hex[] = "0123456789abcdef";
+const u_char __wt_hex[] = "0123456789abcdef";
/*
* __fill_hex --
@@ -25,8 +25,8 @@ __fill_hex(const uint8_t *src, size_t src_max,
--dest_max;
for (; src_max > 0 && dest_max > 1;
src_max -= 1, dest_max -= 2, ++src) {
- *dest++ = hex[(*src & 0xf0) >> 4];
- *dest++ = hex[*src & 0x0f];
+ *dest++ = __wt_hex[(*src & 0xf0) >> 4];
+ *dest++ = __wt_hex[*src & 0x0f];
}
*dest++ = '\0';
if (lenp != NULL)
@@ -34,6 +34,17 @@ __fill_hex(const uint8_t *src, size_t src_max,
}
/*
+ * __wt_fill_hex --
+ * In-memory conversion of raw bytes to a hexadecimal representation.
+ */
+void
+__wt_fill_hex(const uint8_t *src, size_t src_max,
+ uint8_t *dest, size_t dest_max, size_t *lenp)
+{
+ __fill_hex(src, src_max, dest, dest_max, lenp);
+}
+
+/*
* __wt_raw_to_hex --
* Convert a chunk of data to a nul-terminated printable hex string.
*/
@@ -83,8 +94,8 @@ __wt_raw_to_esc_hex(
*t++ = *p;
} else {
*t++ = '\\';
- *t++ = hex[(*p & 0xf0) >> 4];
- *t++ = hex[*p & 0x0f];
+ *t++ = __wt_hex[(*p & 0xf0) >> 4];
+ *t++ = __wt_hex[*p & 0x0f];
}
*t++ = '\0';
to->size = WT_PTRDIFF(t, to->mem);
diff --git a/src/support/huffman.c b/src/support/huffman.c
index 4bda365cb10..9488dbf14fe 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -1,9 +1,31 @@
-/*-
+/*
* Copyright (c) 2014-2015 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
- * See the file LICENSE for redistribution information.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name MongoDB or the name WiredTiger
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include "wt_internal.h"
diff --git a/src/support/rand.c b/src/support/rand.c
index f5ecb12633e..3adcb801f03 100644
--- a/src/support/rand.c
+++ b/src/support/rand.c
@@ -60,6 +60,29 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state)
}
/*
+ * __wt_random_init_seed --
+ * Initialize the state of a 32-bit pseudo-random number.
+ * Use this instead of __wt_random_init if we are running with multiple
+ * threads and we want each thread to initialize its own random state based
+ * on a different random seed.
+ */
+int
+__wt_random_init_seed(
+ WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state)
+{
+ struct timespec ts;
+ WT_RAND_STATE rnd;
+
+ WT_RET(__wt_epoch(session, &ts));
+ M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629);
+ M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069);
+
+ *rnd_state = rnd;
+
+ return (0);
+}
+
+/*
* __wt_random --
* Return a 32-bit pseudo-random number.
*/
diff --git a/src/support/stat.c b/src/support/stat.c
index 4d7cd65fd18..7a615131628 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -250,19 +250,24 @@ __wt_stat_dsrc_aggregate_single(
to->block_alloc += from->block_alloc;
to->block_free += from->block_free;
to->block_checkpoint_size += from->block_checkpoint_size;
- to->allocation_size = from->allocation_size;
+ if (from->allocation_size > to->allocation_size)
+ to->allocation_size = from->allocation_size;
to->block_reuse_bytes += from->block_reuse_bytes;
- to->block_magic = from->block_magic;
- to->block_major = from->block_major;
+ if (from->block_magic > to->block_magic)
+ to->block_magic = from->block_magic;
+ if (from->block_major > to->block_major)
+ to->block_major = from->block_major;
to->block_size += from->block_size;
- to->block_minor = from->block_minor;
+ if (from->block_minor > to->block_minor)
+ to->block_minor = from->block_minor;
to->btree_checkpoint_generation += from->btree_checkpoint_generation;
to->btree_column_fix += from->btree_column_fix;
to->btree_column_internal += from->btree_column_internal;
to->btree_column_deleted += from->btree_column_deleted;
to->btree_column_variable += from->btree_column_variable;
to->btree_column_rle += from->btree_column_rle;
- to->btree_fixed_len = from->btree_fixed_len;
+ if (from->btree_fixed_len > to->btree_fixed_len)
+ to->btree_fixed_len = from->btree_fixed_len;
if (from->btree_maxintlkey > to->btree_maxintlkey)
to->btree_maxintlkey = from->btree_maxintlkey;
if (from->btree_maxintlpage > to->btree_maxintlpage)
@@ -367,12 +372,16 @@ __wt_stat_dsrc_aggregate(
to->block_free += WT_STAT_READ(from, block_free);
to->block_checkpoint_size +=
WT_STAT_READ(from, block_checkpoint_size);
- to->allocation_size = from[0]->allocation_size;
+ if ((v = WT_STAT_READ(from, allocation_size)) > to->allocation_size)
+ to->allocation_size = v;
to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes);
- to->block_magic = from[0]->block_magic;
- to->block_major = from[0]->block_major;
+ if ((v = WT_STAT_READ(from, block_magic)) > to->block_magic)
+ to->block_magic = v;
+ if ((v = WT_STAT_READ(from, block_major)) > to->block_major)
+ to->block_major = v;
to->block_size += WT_STAT_READ(from, block_size);
- to->block_minor = from[0]->block_minor;
+ if ((v = WT_STAT_READ(from, block_minor)) > to->block_minor)
+ to->block_minor = v;
to->btree_checkpoint_generation +=
WT_STAT_READ(from, btree_checkpoint_generation);
to->btree_column_fix += WT_STAT_READ(from, btree_column_fix);
@@ -382,15 +391,14 @@ __wt_stat_dsrc_aggregate(
to->btree_column_variable +=
WT_STAT_READ(from, btree_column_variable);
to->btree_column_rle += WT_STAT_READ(from, btree_column_rle);
- to->btree_fixed_len = from[0]->btree_fixed_len;
- if ((v = WT_STAT_READ(from, btree_maxintlkey)) >
- to->btree_maxintlkey)
+ if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len)
+ to->btree_fixed_len = v;
+ if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey)
to->btree_maxintlkey = v;
if ((v = WT_STAT_READ(from, btree_maxintlpage)) >
to->btree_maxintlpage)
to->btree_maxintlpage = v;
- if ((v = WT_STAT_READ(from, btree_maxleafkey)) >
- to->btree_maxleafkey)
+ if ((v = WT_STAT_READ(from, btree_maxleafkey)) > to->btree_maxleafkey)
to->btree_maxleafkey = v;
if ((v = WT_STAT_READ(from, btree_maxleafpage)) >
to->btree_maxleafpage)
diff --git a/src/txn/txn.c b/src/txn/txn.c
index f835fea8f67..0a3e4a7a7db 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -216,6 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
conn = S2C(session);
txn_global = &conn->txn_global;
+retry:
current_id = last_running = txn_global->current;
oldest_session = NULL;
prev_oldest_id = txn_global->oldest_id;
@@ -287,43 +288,60 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
WT_TXNID_LT(txn_global->last_running, last_running);
/* Update the oldest ID. */
- if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
- __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- if ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, last_running))
- last_running = id;
- if ((id = s->snap_min) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
- }
-
- if (WT_TXNID_LT(last_running, oldest_id))
- oldest_id = last_running;
-
-#ifdef HAVE_DIAGNOSTIC
+ if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) {
/*
- * Make sure the ID doesn't move past any named snapshots.
- *
- * Don't include the read/assignment in the assert statement.
- * Coverity complains if there are assignments only done in
- * diagnostic builds, and when the read is from a volatile.
+ * We know we want to update. Check if we're racing.
*/
- id = txn_global->nsnap_oldest_id;
- WT_ASSERT(session,
- id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+ if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states;
+ i < session_cnt; i++, s++) {
+ if ((id = s->id) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ }
+
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Make sure the ID doesn't move past any named
+ * snapshots.
+ *
+ * Don't include the read/assignment in the assert
+ * statement. Coverity complains if there are
+ * assignments only done in diagnostic builds, and
+ * when the read is from a volatile.
+ */
+ id = txn_global->nsnap_oldest_id;
+ WT_ASSERT(session,
+ id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
- if (WT_TXNID_LT(txn_global->last_running, last_running))
- txn_global->last_running = last_running;
- if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
- txn_global->oldest_id = oldest_id;
- WT_ASSERT(session, txn_global->scan_count == -1);
- txn_global->scan_count = 0;
+ if (WT_TXNID_LT(txn_global->last_running, last_running))
+ txn_global->last_running = last_running;
+ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ WT_ASSERT(session, txn_global->scan_count == -1);
+ txn_global->scan_count = 0;
+ } else {
+ /*
+ * We wanted to update the oldest ID but we're racing
+ * another thread. Retry if this is a forced update.
+ */
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
+ if (force) {
+ __wt_yield();
+ goto retry;
+ }
+ }
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
- current_id - oldest_id > 10000 && last_running_moved &&
- oldest_session != NULL) {
+ current_id - oldest_id > 10000 && oldest_session != NULL) {
(void)__wt_verbose(session, WT_VERB_TRANSACTION,
"old snapshot %" PRIu64
" pinned in session %d [%s]"
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index c5fa52dea6a..148ed868792 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -8,6 +8,12 @@
#include "wt_internal.h"
+/* Cookie passed to __txn_printlog. */
+typedef struct {
+ FILE *out;
+ uint32_t flags;
+} WT_TXN_PRINTLOG_ARGS;
+
/*
* __txn_op_log --
* Log an operation for the current transaction.
@@ -64,7 +70,8 @@ err: __wt_buf_free(session, &key);
*/
static int
__txn_commit_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out,
+ uint32_t flags)
{
bool firstrecord;
@@ -79,7 +86,7 @@ __txn_commit_printlog(
firstrecord = false;
- WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+ WT_RET(__wt_txn_op_printlog(session, pp, end, out, flags));
WT_RET(__wt_fprintf(out, "\n }"));
}
@@ -459,6 +466,7 @@ __txn_printlog(WT_SESSION_IMPL *session,
FILE *out;
WT_LOG_RECORD *logrec;
WT_LSN ckpt_lsn;
+ WT_TXN_PRINTLOG_ARGS *args;
const uint8_t *end, *p;
const char *msg;
uint64_t txnid;
@@ -467,7 +475,8 @@ __txn_printlog(WT_SESSION_IMPL *session,
bool compressed;
WT_UNUSED(next_lsnp);
- out = cookie;
+ args = cookie;
+ out = args->out;
p = WT_LOG_SKIP_HEADER(rawrec->data);
end = (const uint8_t *)rawrec->data + rawrec->size;
@@ -506,7 +515,8 @@ __txn_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n"));
WT_RET(__wt_fprintf(out,
" \"txnid\" : %" PRIu64 ",\n", txnid));
- WT_RET(__txn_commit_printlog(session, &p, end, out));
+ WT_RET(__txn_commit_printlog(session, &p, end, out,
+ args->flags));
break;
case WT_LOGREC_FILE_SYNC:
@@ -537,15 +547,18 @@ __txn_printlog(WT_SESSION_IMPL *session,
* Print the log in a human-readable format.
*/
int
-__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags)
{
WT_SESSION_IMPL *session;
+ WT_TXN_PRINTLOG_ARGS args;
session = (WT_SESSION_IMPL *)wt_session;
+ args.out = out;
+ args.flags = flags;
WT_RET(__wt_fprintf(out, "[\n"));
WT_RET(__wt_log_scan(
- session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+ session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args));
WT_RET(__wt_fprintf(out, "\n]\n"));
return (0);
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index 9cbda08690e..3b7187bd0de 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -226,7 +226,6 @@ main(int argc, char *argv[])
ret = func(session, argc, argv);
/* Close the database. */
-
err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0)
ret = tret;
diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c
index d202b09b228..3a665c1c657 100644
--- a/src/utilities/util_printlog.c
+++ b/src/utilities/util_printlog.c
@@ -15,10 +15,10 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- bool printable;
+ uint32_t flags;
- printable = false;
- while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF)
+ flags = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF)
switch (ch) {
case 'f': /* output file */
if (freopen(__wt_optarg, "w", stdout) == NULL) {
@@ -27,8 +27,8 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
return (1);
}
break;
- case 'p':
- printable = true;
+ case 'x': /* hex output */
+ LF_SET(WT_TXN_PRINTLOG_HEX);
break;
case '?':
default:
@@ -41,8 +41,7 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
if (argc != 0)
return (usage());
- WT_UNUSED(printable);
- ret = __wt_txn_printlog(session, stdout);
+ ret = __wt_txn_printlog(session, stdout, flags);
if (ret != 0) {
fprintf(stderr, "%s: printlog failed: %s\n",
@@ -61,7 +60,7 @@ usage(void)
{
(void)fprintf(stderr,
"usage: %s %s "
- "printlog [-p] [-f output-file]\n",
+ "printlog [-x] [-f output-file]\n",
progname, usage_prefix);
return (1);
}
diff --git a/test/format/ops.c b/test/format/ops.c
index c705d362fe8..7e299b7d975 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -504,7 +504,7 @@ skip_insert: if (col_update(tinfo,
*/
if (!insert) {
dir = (int)mmrand(&tinfo->rnd, 0, 1);
- for (np = 0; np < mmrand(&tinfo->rnd, 1, 8); ++np) {
+ for (np = 0; np < mmrand(&tinfo->rnd, 1, 30); ++np) {
if (notfound)
break;
if (nextprev(cursor, dir, &notfound))
diff --git a/test/suite/test_bulk01.py b/test/suite/test_bulk01.py
index 80b420c9392..df027df0ddd 100644
--- a/test/suite/test_bulk01.py
+++ b/test/suite/test_bulk01.py
@@ -130,7 +130,7 @@ class test_bulk_load(wttest.WiredTigerTestCase):
# Test that variable-length column-store bulk-load efficiently creates big
# records.
- def test_bulk_load_col_delete_big(self):
+ def test_bulk_load_col_big(self):
if self.keyfmt != 'r' or self.valfmt == '8t':
return
diff --git a/test/suite/test_colgap.py b/test/suite/test_colgap.py
index 4192f14c5e6..924d622a024 100644
--- a/test/suite/test_colgap.py
+++ b/test/suite/test_colgap.py
@@ -28,6 +28,7 @@
import wiredtiger, wttest
from helper import simple_populate, key_populate, value_populate
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
# test_colgap.py
# Test variable-length column-store gap performance.
@@ -119,5 +120,90 @@ class test_column_store_gap(wttest.WiredTigerTestCase):
self.backward(cursor, list(reversed(v)))
+# Basic testing of variable-length column-store with big records.
+class test_colmax(wttest.WiredTigerTestCase):
+ name = 'test_colmax'
+
+ types = [
+ ('file', dict(type='file:')),
+ ('table', dict(type='table:'))
+ ]
+ valfmt = [
+ ('integer', dict(valfmt='i')),
+ ('string', dict(valfmt='S')),
+ ]
+ record_number = [
+ ('big', dict(recno=18446744073709551606)),
+ ('max', dict(recno=18446744073709551615)),
+ ]
+ bulk = [
+ ('bulk', dict(bulk=1)),
+ ('not-bulk', dict(bulk=0)),
+ ]
+ reopen = [
+ ('reopen', dict(reopen=1)),
+ ('not-reopen', dict(reopen=0)),
+ ]
+ single = [
+ ('single', dict(single=1)),
+ ('not-single', dict(single=0)),
+ ]
+
+ scenarios = number_scenarios(multiply_scenarios(\
+ '.', types, valfmt, record_number, bulk, reopen, single))
+
+ # Test that variable-length column-store correctly/efficiently handles big
+ # records (if it's not efficient, we'll just hang).
+ def test_colmax_op(self):
+ recno = self.recno
+
+ uri = self.type + self.name
+ self.session.create(uri, 'key_format=r' +',value_format=' + self.valfmt)
+
+ # Insert a big record with/without a bulk cursor.
+ bulk_config = ""
+ if self.bulk:
+ bulk_config = "bulk"
+ cursor = self.session.open_cursor(uri, None, bulk_config)
+
+ # Optionally make the big record the only record in the table.
+ if not self.single:
+ for i in range(1, 723):
+ cursor[key_populate(cursor, i)] = value_populate(cursor, i)
+
+ # Confirm searching past the end of the table works.
+ if not self.bulk:
+ cursor.set_key(recno)
+ self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+
+ # Insert the big record.
+ cursor[key_populate(cursor, recno)] = value_populate(cursor, recno)
+
+ # Optionally flush to disk; re-open the cursor as necessary.
+ if self.bulk or self.reopen:
+ cursor.close()
+ if self.reopen == 1:
+ self.reopen_conn()
+ if self.bulk or self.reopen:
+ cursor = self.session.open_cursor(uri, None, None)
+
+ # Search for the large record.
+ cursor.set_key(recno)
+ self.assertEqual(cursor.search(), 0)
+ self.assertEqual(cursor.get_value(), value_populate(cursor, recno))
+
+ # Update it.
+ cursor[key_populate(cursor, recno)] = value_populate(cursor, 37)
+ cursor.set_key(recno)
+ self.assertEqual(cursor.search(), 0)
+ self.assertEqual(cursor.get_value(), value_populate(cursor, 37))
+
+ # Remove it.
+ cursor.set_key(recno)
+ self.assertEqual(cursor.remove(), 0)
+ cursor.set_key(key_populate(cursor, recno))
+ self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_compact.py b/test/suite/test_compact01.py
index c7269785115..c7269785115 100644
--- a/test/suite/test_compact.py
+++ b/test/suite/test_compact01.py
diff --git a/test/suite/test_compact02.py b/test/suite/test_compact02.py
new file mode 100644
index 00000000000..f2d5c1fa283
--- /dev/null
+++ b/test/suite/test_compact02.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_compact02.py
+# Test that compact reduces the file size.
+#
+
+import wiredtiger, wttest
+from wiredtiger import stat
+from wtscenario import multiply_scenarios, number_scenarios
+
+# Test basic compaction
+class test_compact02(wttest.WiredTigerTestCase):
+
+ types = [
+ ('file', dict(uri='file:test_compact02')),
+ ]
+ cacheSize = [
+ ('default', dict(cacheSize='')),
+ ('1mb', dict(cacheSize='cache_size=1MB')),
+ ('10gb', dict(cacheSize='cache_size=10GB')),
+ ]
+
+ # There's a balance between the pages we create and the size of the records
+ # being stored: compaction doesn't work on tables with many overflow items
+ # because we don't rewrite them. Experimentally, 8KB is as small as the test
+ # can go. Additionally, we can't set the maximum page size too large because
+ # there won't be enough pages to rewrite. Experimentally, 32KB (the default)
+ # is as large as the test can go.
+ fileConfig = [
+ ('default', dict(fileConfig='')),
+ ('8KB', dict(fileConfig='leaf_page_max=8kb')),
+ ]
+ scenarios = \
+ number_scenarios(multiply_scenarios('.', types, cacheSize, fileConfig))
+
+ # We want about 22K records that total about 130MB. That is an average
+ # of 6196 bytes per record. Half the records should be smaller, about
+ # 2700 bytes (about 30MB) and the other half should be larger, 9666 bytes
+ # per record (about 100MB).
+ #
+ # Test flow is as follows.
+ #
+ # 1. Create a table with the data, alternating record size.
+ # 2. Checkpoint and get stats on the table to confirm the size.
+ # 3. Delete the half of the records with the larger record size.
+ # 4. Call compact.
+ # 5. Get stats on compacted table.
+ #
+ nrecords = 22000
+ bigvalue = "abcdefghi" * 1074 # 9*1074 == 9666
+ smallvalue = "ihgfedcba" * 303 # 9*303 == 2727
+
+ fullsize = nrecords / 2 * len(bigvalue) + nrecords / 2 * len(smallvalue)
+
+ # Return the size of the file
+ def getSize(self):
+ cstat = self.session.open_cursor(
+ 'statistics:' + self.uri, None, 'statistics=(size)')
+ sz = cstat[stat.dsrc.block_size][2]
+ cstat.close()
+ return sz
+
+ # This test varies the cache size and so needs to set up its own connection.
+ # Override the standard methods.
+ def setUpConnectionOpen(self, dir):
+ return None
+ def setUpSessionOpen(self, conn):
+ return None
+ def ConnectionOpen(self, cacheSize):
+ self.home = '.'
+ conn_params = 'create,' + \
+ cacheSize + ',error_prefix="%s: ",' % self.shortid() + \
+ 'statistics=(fast)'
+ try:
+ self.conn = wiredtiger.wiredtiger_open(self.home, conn_params)
+ except wiredtiger.WiredTigerError as e:
+ print "Failed conn at '%s' with config '%s'" % (dir, conn_params)
+ self.session = self.conn.open_session(None)
+
+ # Create a table, add keys with both big and small values.
+ def test_compact02(self):
+ self.ConnectionOpen(self.cacheSize)
+
+ mb = 1024 * 1024
+ params = 'key_format=i,value_format=S,' + self.fileConfig
+
+ # 1. Create a table with the data, alternating record size.
+ self.session.create(self.uri, params)
+ c = self.session.open_cursor(self.uri, None)
+ for i in range(self.nrecords):
+ if i % 2 == 0:
+ c[i] = str(i) + self.bigvalue
+ else:
+ c[i] = str(i) + self.smallvalue
+ c.close()
+
+ # 2. Checkpoint and get stats on the table to confirm the size.
+ self.session.checkpoint()
+ sz = self.getSize()
+ self.pr('After populate ' + str(sz / mb) + 'MB')
+ self.assertGreater(sz, self.fullsize)
+
+ # 3. Delete the half of the records with the larger record size.
+ c = self.session.open_cursor(self.uri, None)
+ count = 0
+ for i in range(self.nrecords):
+ if i % 2 == 0:
+ count += 1
+ c.set_key(i)
+ c.remove()
+ c.close()
+ self.pr('Removed total ' + str((count * 9666) / mb) + 'MB')
+
+ # 4. Call compact.
+ self.session.compact(self.uri, None)
+
+ # 5. Get stats on compacted table.
+ sz = self.getSize()
+ self.pr('After compact ' + str(sz / mb) + 'MB')
+
+ # After compact, the file size should be less than half the full size.
+ self.assertLess(sz, self.fullsize / 2)
+
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py
index 10a3140a2fd..b424dbbc7e3 100644
--- a/test/suite/test_cursor_random.py
+++ b/test/suite/test_cursor_random.py
@@ -29,90 +29,93 @@
import wiredtiger, wttest
from helper import complex_populate, simple_populate
from helper import key_populate, value_populate
-from wtscenario import check_scenarios
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
# test_cursor_random.py
# Cursor next_random operations
class test_cursor_random(wttest.WiredTigerTestCase):
- scenarios = check_scenarios([
- ('file', dict(type='file:',fmt='S')),
- ('table', dict(type='table:',fmt='S'))
- ])
+ types = [
+ ('file', dict(type='file:random')),
+ ('table', dict(type='table:random'))
+ ]
+ config = [
+ ('sample', dict(config='next_random=true,next_random_sample_size=35')),
+ ('not-sample', dict(config='next_random=true'))
+ ]
+ scenarios =number_scenarios(multiply_scenarios('.', types, config))
# Check that opening a random cursor on a row-store returns not-supported
- # for every method except for next and reset, and next returns not-found.
- def test_cursor_random_column(self):
- uri = self.type + 'random'
- self.session.create(uri, 'key_format=' + self.fmt + ',value_format=S')
- cursor = self.session.open_cursor(uri, None, "next_random=true")
+ # for methods other than next, reconfigure and reset, and next returns
+ # not-found.
+ def test_cursor_random(self):
+ uri = self.type
+ self.session.create(uri, 'key_format=S,value_format=S')
+ cursor = self.session.open_cursor(uri, None, self.config)
self.assertRaises(
wiredtiger.WiredTigerError, lambda: cursor.compare(cursor))
+ self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert())
self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.prev())
+ self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.remove())
self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.search())
self.assertRaises(
wiredtiger.WiredTigerError, lambda: cursor.search_near())
- self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert())
self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.update())
- self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.remove())
- cursor.reset()
self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND)
+ self.assertEquals(cursor.reconfigure(), 0)
+ self.assertEquals(cursor.reset(), 0)
cursor.close()
# Check that next_random works with a single value, repeatedly.
def test_cursor_random_single_record(self):
- uri = self.type + 'random'
- self.session.create(uri, 'key_format=' + self.fmt + ',value_format=S')
+ uri = self.type
+ self.session.create(uri, 'key_format=S,value_format=S')
cursor = self.session.open_cursor(uri, None)
cursor['AAA'] = 'BBB'
cursor.close()
- cursor = self.session.open_cursor(uri, None, "next_random=true")
+ cursor = self.session.open_cursor(uri, None, self.config)
for i in range(1,5):
- cursor.next()
+ self.assertEquals(cursor.next(), 0)
self.assertEquals(cursor.get_key(), 'AAA')
cursor.close
# Check that next_random works in the presence of a larger set of values,
# where the values are in an insert list.
def test_cursor_random_multiple_insert_records(self):
- uri = self.type + 'random'
- if self.type == 'file:':
+ uri = self.type
+ if uri.startswith('file:'):
simple_populate(self, uri,
- 'allocation_size=512,leaf_page_max=512,key_format=' +\
- self.fmt, 100)
+ 'allocation_size=512,leaf_page_max=512,key_format=S', 100)
else:
complex_populate(self, uri,
- 'allocation_size=512,leaf_page_max=512,key_format=' +\
- self.fmt, 100)
+ 'allocation_size=512,leaf_page_max=512,key_format=S', 100)
# In a insert list, next_random always selects the middle key/value
# pair, all we can do is confirm cursor.next works.
- cursor = self.session.open_cursor(uri, None, "next_random=true")
+ cursor = self.session.open_cursor(uri, None, self.config)
self.assertEqual(cursor.next(), 0)
# Check that next_random works in the presence of a larger set of values,
# where the values are in a disk format page.
def cursor_random_multiple_page_records(self, reopen):
- uri = self.type + 'random'
- if self.type == 'file:':
+ uri = self.type
+ if uri.startswith('file:'):
simple_populate(self, uri,
- 'allocation_size=512,leaf_page_max=512,key_format=' +\
- self.fmt, 10000)
+ 'allocation_size=512,leaf_page_max=512,key_format=S', 10000)
else:
complex_populate(self, uri,
- 'allocation_size=512,leaf_page_max=512,key_format=' +\
- self.fmt, 10000)
+ 'allocation_size=512,leaf_page_max=512,key_format=S', 10000)
# Optionally close the connection so everything is forced to disk,
# insert lists are an entirely different path in the code.
if reopen:
self.reopen_conn()
- cursor = self.session.open_cursor(uri, None, "next_random=true")
+ cursor = self.session.open_cursor(uri, None, self.config)
last = ''
match = 0
for i in range(1,10):
- cursor.next()
+ self.assertEqual(cursor.next(), 0)
current = cursor.get_key()
if current == last:
match += 1
@@ -128,23 +131,32 @@ class test_cursor_random(wttest.WiredTigerTestCase):
# Check that opening a random cursor on column-store returns not-supported.
class test_cursor_random_column(wttest.WiredTigerTestCase):
scenarios = check_scenarios([
- ('file', dict(uri='file:random',fmt='r')),
- ('table', dict(uri='table:random',fmt='r')),
+ ('file', dict(uri='file:random')),
+ ('table', dict(uri='table:random'))
])
def test_cursor_random_column(self):
- self.session.create(
- self.uri, 'key_format=' + self.fmt + ',value_format=S')
- cursor = self.session.open_cursor(self.uri, None, "next_random=true")
- self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.next())
- cursor.close()
+ self.session.create(self.uri, 'key_format=r,value_format=S')
+ msg = '/Operation not supported/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda:
+ self.session.open_cursor(self.uri, None, "next_random=true"), msg)
# Check next_random works in the presence a set of updates, some or all of
# which are invisible to the cursor.
class test_cursor_random_invisible(wttest.WiredTigerTestCase):
+ types = [
+ ('file', dict(type='file:random')),
+ ('table', dict(type='table:random'))
+ ]
+ config = [
+ ('sample', dict(config='next_random=true,next_random_sample_size=35')),
+ ('not-sample', dict(config='next_random=true'))
+ ]
+ scenarios =number_scenarios(multiply_scenarios('.', types, config))
+
def test_cursor_random_invisible_all(self):
- uri = 'file:random'
+ uri = self.type
self.session.create(uri, 'key_format=S,value_format=S')
cursor = self.session.open_cursor(uri, None)
@@ -156,11 +168,11 @@ class test_cursor_random_invisible(wttest.WiredTigerTestCase):
# Open another session, the updates won't yet be visible, we shouldn't
# find anything at all.
s = self.conn.open_session()
- cursor = s.open_cursor(uri, None, "next_random=true")
+ cursor = s.open_cursor(uri, None, self.config)
self.assertEqual(cursor.next(), wiredtiger.WT_NOTFOUND)
def test_cursor_random_invisible_after(self):
- uri = 'file:random'
+ uri = self.type
self.session.create(uri, 'key_format=S,value_format=S')
cursor = self.session.open_cursor(uri, None)
@@ -175,12 +187,12 @@ class test_cursor_random_invisible(wttest.WiredTigerTestCase):
# Open another session, the updates won't yet be visible, we should
# return the only possible record.
s = self.conn.open_session()
- cursor = s.open_cursor(uri, None, "next_random=true")
- cursor.next()
+ cursor = s.open_cursor(uri, None, self.config)
+ self.assertEquals(cursor.next(), 0)
self.assertEqual(cursor.get_key(), key_populate(cursor, 1))
def test_cursor_random_invisible_before(self):
- uri = 'file:random'
+ uri = self.type
self.session.create(uri, 'key_format=S,value_format=S')
cursor = self.session.open_cursor(uri, None)
@@ -195,8 +207,8 @@ class test_cursor_random_invisible(wttest.WiredTigerTestCase):
# Open another session, the updates won't yet be visible, we should
# return the only possible record.
s = self.conn.open_session()
- cursor = s.open_cursor(uri, None, "next_random=true")
- cursor.next()
+ cursor = s.open_cursor(uri, None, self.config)
+ self.assertEquals(cursor.next(), 0)
self.assertEqual(cursor.get_key(), key_populate(cursor, 99))
diff --git a/test/suite/test_cursor_random02.py b/test/suite/test_cursor_random02.py
new file mode 100644
index 00000000000..7c9e0e38cb9
--- /dev/null
+++ b/test/suite/test_cursor_random02.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from helper import complex_populate, simple_populate
+from helper import key_populate, value_populate
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
+
+# test_cursor_random02.py
+# Cursor next_random operations
+class test_cursor_random02(wttest.WiredTigerTestCase):
+ type = 'table:random'
+ config = [
+ ('not-sample', dict(config='next_random=true'))
+ ]
+ records = [
+ ('1', dict(records=1)),
+ ('250', dict(records=250)),
+ ('500', dict(records=500)),
+ ('5000', dict(records=5000)),
+ ('10000', dict(records=10000)),
+ ('50000', dict(records=50000)),
+ ]
+ scenarios = number_scenarios(multiply_scenarios('.', config, records))
+
+ # Check next_random's distribution over a larger set of values in an insert
+ # list: one call per record should return over a quarter of the keys.
+ def test_cursor_random_reasonable_distribution(self):
+ uri = self.type
+ num_entries = self.records
+
+ # Set the leaf-page-max value, otherwise the page might split.
+ simple_populate(self, uri,
+ 'leaf_page_max=100MB,key_format=S', num_entries)
+ # Set up an array to track which keys are seen.
+ visitedKeys = [0] * (num_entries + 1)
+
+ cursor = self.session.open_cursor(uri, None, 'next_random=true')
+ for i in range(0, num_entries):
+ self.assertEqual(cursor.next(), 0)
+ current = cursor.get_key()
+ current = int(current)
+ visitedKeys[current] = visitedKeys[current] + 1
+
+ differentKeys = sum(x > 0 for x in visitedKeys)
+
+ #print visitedKeys
+ #print differentKeys
+ '''
+ self.tty('differentKeys: ' + str(differentKeys) + ' of ' + \
+ str(num_entries) + ', ' + \
+ str((int)((differentKeys * 100) / num_entries)) + '%')
+ '''
+
+ self.assertGreater(differentKeys, num_entries / 4,
+ 'next_random random distribution not adequate')
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py
index 790f651fd2f..ac81e0729e5 100644
--- a/test/suite/test_jsondump02.py
+++ b/test/suite/test_jsondump02.py
@@ -209,7 +209,7 @@ class test_jsondump02(wttest.WiredTigerTestCase):
self.check_json(self.table_uri3, (
('"key0" : 1', '"value0" : "\\u0001\\u0002\\u0003"'),
('"key0" : 2',
- '"value0" : "\\u0077\\u0088\\u0099\\u0000\\u00FF\\u00FE"')))
+ '"value0" : "\\u0077\\u0088\\u0099\\u0000\\u00ff\\u00fe"')))
self.check_json(self.table_uri4, (
('"ikey" : 1,\n"Skey" : "key1"',
'"S1" : "val1",\n"i2" : 1,\n"S3" : "val1",\n"i4" : 1'),
diff --git a/test/suite/test_txn08.py b/test/suite/test_txn08.py
index d35a0c70b3b..8ee48104231 100644
--- a/test/suite/test_txn08.py
+++ b/test/suite/test_txn08.py
@@ -82,6 +82,11 @@ class test_txn08(wttest.WiredTigerTestCase, suite_subprocess):
self.runWt(['printlog'], outfilename='printlog.out')
self.check_file_contains('printlog.out',
'\\u0001\\u0002abcd\\u0003\\u0004')
+ self.runWt(['printlog', '-x'], outfilename='printlog-hex.out')
+ self.check_file_contains('printlog-hex.out',
+ '\\u0001\\u0002abcd\\u0003\\u0004')
+ self.check_file_contains('printlog-hex.out',
+ '0102616263640304')
if __name__ == '__main__':
wttest.run()