summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-02-17 11:16:55 +1100
committer: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-02-17 11:16:55 +1100
commit: e1bcc30da91eedd0b17cebb725cc7e607ffa2340 (patch)
tree: f4cf5e09a0b9f9ec6d9e36eeb124d976f5c5b107
parent: 48a3cbc17fa902528217287fd075c87efb44aebc (diff)
parent: 8a1adcc4a1c4c25e1270290a8eb21173f41e83a9 (diff)
download: mongodb-3.5.4.tar.gz
Merge branch 'develop' into mongodb-3.6 [mongodb-3.5.4]
-rw-r--r--bench/wtperf/config.c42
-rw-r--r--bench/wtperf/idle_table_cycle.c2
-rw-r--r--bench/wtperf/stress/btree-split-stress.wtperf3
-rw-r--r--bench/wtperf/wtperf.c163
-rw-r--r--bench/wtperf/wtperf.h4
-rw-r--r--bench/wtperf/wtperf_opt.i10
-rw-r--r--build_posix/Make.subdirs1
-rw-r--r--dist/api_data.py3
-rw-r--r--dist/filelist1
-rw-r--r--dist/flags.py7
-rw-r--r--dist/s_define.list2
-rwxr-xr-xdist/s_stat3
-rw-r--r--dist/s_string.ok7
-rwxr-xr-xdist/s_void10
-rw-r--r--dist/stat_data.py4
-rw-r--r--examples/c/ex_file_system.c13
-rw-r--r--ext/test/fail_fs/Makefile.am9
-rw-r--r--ext/test/fail_fs/fail_fs.c847
-rw-r--r--src/async/async_api.c5
-rw-r--r--src/async/async_worker.c2
-rw-r--r--src/btree/bt_cursor.c136
-rw-r--r--src/btree/bt_debug.c4
-rw-r--r--src/btree/bt_random.c427
-rw-r--r--src/btree/bt_split.c233
-rw-r--r--src/btree/bt_walk.c4
-rw-r--r--src/btree/row_srch.c212
-rw-r--r--src/checksum/power8/crc32_wrapper.c4
-rw-r--r--src/checksum/zseries/crc32-s390x.c26
-rw-r--r--src/config/config_def.c60
-rw-r--r--src/conn/conn_api.c11
-rw-r--r--src/conn/conn_cache.c6
-rw-r--r--src/conn/conn_cache_pool.c8
-rw-r--r--src/conn/conn_ckpt.c26
-rw-r--r--src/conn/conn_dhandle.c55
-rw-r--r--src/conn/conn_handle.c21
-rw-r--r--src/conn/conn_log.c50
-rw-r--r--src/conn/conn_open.c42
-rw-r--r--src/conn/conn_stat.c33
-rw-r--r--src/conn/conn_sweep.c26
-rw-r--r--src/cursor/cur_backup.c8
-rw-r--r--src/cursor/cur_index.c26
-rw-r--r--src/cursor/cur_std.c7
-rw-r--r--src/cursor/cur_table.c2
-rw-r--r--src/docs/cursor-random.dox5
-rw-r--r--src/docs/upgrading.dox6
-rw-r--r--src/docs/wtperf.dox6
-rw-r--r--src/evict/evict_lru.c617
-rw-r--r--src/evict/evict_stat.c2
-rw-r--r--src/include/btmem.h8
-rw-r--r--src/include/btree.i28
-rw-r--r--src/include/cache.h2
-rw-r--r--src/include/cache.i2
-rw-r--r--src/include/connection.h8
-rw-r--r--src/include/dhandle.h18
-rw-r--r--src/include/extern.h25
-rw-r--r--src/include/extern_posix.h4
-rw-r--r--src/include/extern_win.h4
-rw-r--r--src/include/flags.h79
-rw-r--r--src/include/log.h3
-rw-r--r--src/include/misc.i5
-rw-r--r--src/include/mutex.h4
-rw-r--r--src/include/packing.i7
-rw-r--r--src/include/schema.h162
-rw-r--r--src/include/session.h2
-rw-r--r--src/include/stat.h4
-rw-r--r--src/include/wiredtiger.in251
-rw-r--r--src/log/log.c42
-rw-r--r--src/log/log_slot.c206
-rw-r--r--src/lsm/lsm_cursor.c4
-rw-r--r--src/lsm/lsm_manager.c12
-rw-r--r--src/lsm/lsm_stat.c4
-rw-r--r--src/lsm/lsm_tree.c63
-rw-r--r--src/lsm/lsm_work_unit.c4
-rw-r--r--src/lsm/lsm_worker.c2
-rw-r--r--src/os_posix/os_mtx_cond.c28
-rw-r--r--src/os_win/os_mtx_cond.c43
-rw-r--r--src/schema/schema_drop.c2
-rw-r--r--src/schema/schema_list.c2
-rw-r--r--src/schema/schema_rename.c2
-rw-r--r--src/schema/schema_worker.c2
-rw-r--r--src/session/session_api.c56
-rw-r--r--src/session/session_dhandle.c43
-rw-r--r--src/support/cond_auto.c80
-rw-r--r--src/support/rand.c12
-rw-r--r--src/support/stat.c16
-rw-r--r--src/support/thread_group.c2
-rw-r--r--src/txn/txn.c95
-rw-r--r--src/txn/txn_ckpt.c39
-rw-r--r--src/txn/txn_log.c4
-rw-r--r--src/utilities/util.h2
-rw-r--r--src/utilities/util_alter.c9
-rw-r--r--src/utilities/util_compact.c14
-rw-r--r--src/utilities/util_create.c12
-rw-r--r--src/utilities/util_drop.c10
-rw-r--r--src/utilities/util_dump.c26
-rw-r--r--src/utilities/util_list.c21
-rw-r--r--src/utilities/util_load.c2
-rw-r--r--src/utilities/util_load_json.c2
-rw-r--r--src/utilities/util_loadtext.c13
-rw-r--r--src/utilities/util_main.c4
-rw-r--r--src/utilities/util_printlog.c14
-rw-r--r--src/utilities/util_read.c19
-rw-r--r--src/utilities/util_rebalance.c30
-rw-r--r--src/utilities/util_rename.c15
-rw-r--r--src/utilities/util_salvage.c30
-rw-r--r--src/utilities/util_stat.c6
-rw-r--r--src/utilities/util_truncate.c11
-rw-r--r--src/utilities/util_upgrade.c30
-rw-r--r--src/utilities/util_verify.c34
-rw-r--r--src/utilities/util_write.c20
-rw-r--r--test/csuite/Makefile.am12
-rw-r--r--test/csuite/wt2909_checkpoint_integrity/main.c666
-rw-r--r--test/csuite/wt3120_filesys/main.c99
-rw-r--r--test/csuite/wt3135_search_near_collator/main.c360
-rw-r--r--test/csuite/wt3184_dup_index_collator/main.c168
-rw-r--r--test/suite/test_cursor_random.py49
-rw-r--r--test/suite/test_reconfig04.py2
-rw-r--r--test/suite/test_sweep01.py7
-rw-r--r--test/utility/misc.c2
-rw-r--r--test/utility/test_util.h2
120 files changed, 4594 insertions, 1677 deletions
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c
index a15a3485dde..9eea99eeec4 100644
--- a/bench/wtperf/config.c
+++ b/bench/wtperf/config.c
@@ -215,6 +215,7 @@ config_threads(WTPERF *wtperf, const char *config, size_t len)
return (EINVAL);
}
workp = &wtperf->workload[wtperf->workload_cnt++];
+ workp->table_index = INT32_MAX;
while ((ret = scan->next(scan, &k, &v)) == 0) {
if (STRING_MATCH("count", k.str, k.len)) {
@@ -233,12 +234,28 @@ config_threads(WTPERF *wtperf, const char *config, size_t len)
goto err;
continue;
}
+ if (STRING_MATCH("pause", k.str, k.len)) {
+ if ((workp->pause = v.val) < 0)
+ goto err;
+ continue;
+ }
if (STRING_MATCH("read", k.str, k.len) ||
STRING_MATCH("reads", k.str, k.len)) {
if ((workp->read = v.val) < 0)
goto err;
continue;
}
+ if (STRING_MATCH("read_range", k.str, k.len)) {
+ if ((workp->read_range = v.val) < 0)
+ goto err;
+ continue;
+ }
+ if (STRING_MATCH("table", k.str, k.len)) {
+ if (v.val <= 0)
+ goto err;
+ workp->table_index = (int32_t)v.val - 1;
+ continue;
+ }
if (STRING_MATCH("throttle", k.str, k.len)) {
workp->throttle = (uint64_t)v.val;
continue;
@@ -760,16 +777,33 @@ config_sanity(WTPERF *wtperf)
opts->value_sz_min = opts->value_sz;
}
- if (opts->readonly && wtperf->workload != NULL)
+ if (wtperf->workload != NULL)
for (i = 0, workp = wtperf->workload;
- i < wtperf->workload_cnt; ++i, ++workp)
- if (workp->insert != 0 || workp->update != 0 ||
- workp->truncate != 0) {
+ i < wtperf->workload_cnt; ++i, ++workp) {
+ if (opts->readonly &&
+ (workp->insert != 0 || workp->update != 0 ||
+ workp->truncate != 0)) {
fprintf(stderr,
"Invalid workload: insert, update or "
"truncate specified with readonly\n");
return (EINVAL);
}
+ if (workp->insert != 0 &&
+ workp->table_index != INT32_MAX) {
+ fprintf(stderr,
+ "Invalid workload: Cannot insert into "
+ "specific table only\n");
+ return (EINVAL);
+ }
+ if (workp->table_index != INT32_MAX &&
+ workp->table_index >= (int32_t)opts->table_count) {
+ fprintf(stderr,
+ "Workload table index %" PRId32
+ " is larger than table count %" PRId32,
+ workp->table_index, opts->table_count);
+ return (EINVAL);
+ }
+ }
return (0);
}
diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c
index 13fa55e86f5..bb44cfbde59 100644
--- a/bench/wtperf/idle_table_cycle.c
+++ b/bench/wtperf/idle_table_cycle.c
@@ -120,6 +120,7 @@ cycle_idle_tables(void *arg)
return (NULL);
start = stop;
+#if 1
/*
* Drop the table. Keep retrying on EBUSY failure - it is an
* expected return when checkpoints are happening.
@@ -136,6 +137,7 @@ cycle_idle_tables(void *arg)
}
if (check_timing(wtperf, "drop", start, &stop) != 0)
return (NULL);
+#endif
}
return (NULL);
diff --git a/bench/wtperf/stress/btree-split-stress.wtperf b/bench/wtperf/stress/btree-split-stress.wtperf
index 86bb288fc6d..eb6ca1cfddc 100644
--- a/bench/wtperf/stress/btree-split-stress.wtperf
+++ b/bench/wtperf/stress/btree-split-stress.wtperf
@@ -6,5 +6,4 @@ run_time=300
reopen_connection=false
populate_threads=2
value_sz=256
-read_range=100
-threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1))
+threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1,read_range=100))
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index baa259f8817..7f5e5ad3373 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -432,19 +432,17 @@ err: wtperf->error = wtperf->stop = true;
* search do them. Ensuring the keys we see are always in order.
*/
static int
-do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor)
+do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor, int64_t read_range)
{
- CONFIG_OPTS *opts;
- size_t range;
uint64_t next_val, prev_val;
+ int64_t range;
char *range_key_buf;
char buf[512];
int ret;
- opts = wtperf->opts;
ret = 0;
- if (opts->read_range == 0)
+ if (read_range == 0)
return (0);
memset(&buf[0], 0, 512 * sizeof(char));
@@ -454,7 +452,7 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor)
testutil_check(cursor->get_key(cursor, &range_key_buf));
extract_key(range_key_buf, &next_val);
- for (range = 0; range < opts->read_range; ++range) {
+ for (range = 0; range < read_range; ++range) {
prev_val = next_val;
ret = cursor->next(cursor);
/* We are done if we reach the end. */
@@ -475,12 +473,56 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor)
return (0);
}
+/*
+ * pre_load_data --
+ *	Pull everything into cache before starting the workload phase: open a
+ *	session and walk every table in wtperf->uris from first key to last.
+ *	Returns 0 on success, otherwise the first error returned by a
+ *	WiredTiger call.
+ */
+static int
+pre_load_data(WTPERF *wtperf)
+{
+	CONFIG_OPTS *opts;
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor;
+	WT_SESSION *session;
+	char *key;
+	int ret, tret;
+	size_t i;
+
+	opts = wtperf->opts;
+	conn = wtperf->conn;
+	session = NULL;
+
+	if ((ret = conn->open_session(
+	    conn, NULL, opts->sess_config, &session)) != 0) {
+		lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session");
+		goto err;
+	}
+	for (i = 0; i < opts->table_count; i++) {
+		if ((ret = session->open_cursor(session,
+		    wtperf->uris[i], NULL, NULL, &cursor)) != 0) {
+			lprintf(wtperf, ret, 0,
+			    "worker: WT_SESSION.open_cursor: %s",
+			    wtperf->uris[i]);
+			goto err;
+		}
+		/* Touch every record; the key itself is not used. */
+		while ((ret = cursor->next(cursor)) == 0)
+			if ((ret = cursor->get_key(cursor, &key)) != 0)
+				goto err;
+		/*
+		 * Running off the end of the table is the only expected way
+		 * out of the scan, anything else is a real failure.
+		 */
+		if (ret != WT_NOTFOUND)
+			goto err;
+		if ((ret = cursor->close(cursor)) != 0)
+			goto err;
+	}
+
+	/*
+	 * Close the session on every path so a failed scan doesn't leak it;
+	 * closing the session also discards any cursor still open in it.
+	 * Keep the first error if the close fails as well.
+	 */
+err:	if (session != NULL &&
+	    (tret = session->close(session, NULL)) != 0 && ret == 0)
+		ret = tret;
+	if (ret != 0)
+		lprintf(wtperf, ret, 0, "Pre-workload traverse error");
+	return (ret);
+}
+
static void *
worker(void *arg)
{
struct timespec start, stop;
CONFIG_OPTS *opts;
TRACK *trk;
+ WORKLOAD *workload;
WTPERF *wtperf;
WTPERF_THREAD *thread;
WT_CONNECTION *conn;
@@ -495,13 +537,14 @@ worker(void *arg)
char buf[512];
thread = (WTPERF_THREAD *)arg;
+ workload = thread->workload;
wtperf = thread->wtperf;
opts = wtperf->opts;
conn = wtperf->conn;
cursors = NULL;
- log_table_cursor = NULL; /* -Wconditional-initialized */
+ cursor = log_table_cursor = NULL; /* -Wconditional-initialized */
ops = 0;
- ops_per_txn = thread->workload->ops_per_txn;
+ ops_per_txn = workload->ops_per_txn;
session = NULL;
trk = NULL;
@@ -510,7 +553,6 @@ worker(void *arg)
lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session");
goto err;
}
- cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *));
for (i = 0; i < opts->table_count_idle; i++) {
snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i);
if ((ret = session->open_cursor(
@@ -525,14 +567,34 @@ worker(void *arg)
goto err;
}
}
- for (i = 0; i < opts->table_count; i++) {
+ if (workload->table_index != INT32_MAX) {
if ((ret = session->open_cursor(session,
- wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) {
+ wtperf->uris[workload->table_index],
+ NULL, NULL, &cursor)) != 0) {
lprintf(wtperf, ret, 0,
"worker: WT_SESSION.open_cursor: %s",
- wtperf->uris[i]);
+ wtperf->uris[workload->table_index]);
+ goto err;
+ }
+ if ((ret = session->open_cursor(session,
+ wtperf->uris[workload->table_index],
+ NULL, "next_random=true", &thread->rand_cursor)) != 0) {
+ lprintf(wtperf, ret, 0,
+ "worker: WT_SESSION.open_cursor: random %s",
+ wtperf->uris[workload->table_index]);
goto err;
}
+ } else {
+ cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *));
+ for (i = 0; i < opts->table_count; i++) {
+ if ((ret = session->open_cursor(session,
+ wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) {
+ lprintf(wtperf, ret, 0,
+ "worker: WT_SESSION.open_cursor: %s",
+ wtperf->uris[i]);
+ goto err;
+ }
+ }
}
if (opts->log_like_table && (ret = session->open_cursor(session,
wtperf->log_table_uri, NULL, NULL, &log_table_cursor)) != 0) {
@@ -543,19 +605,19 @@ worker(void *arg)
}
/* Setup the timer for throttling. */
- if (thread->workload->throttle != 0)
+ if (workload->throttle != 0)
setup_throttle(thread);
/* Setup for truncate */
- if (thread->workload->truncate != 0)
+ if (workload->truncate != 0)
if ((ret = setup_truncate(wtperf, thread, session)) != 0)
goto err;
key_buf = thread->key_buf;
value_buf = thread->value_buf;
- op = thread->workload->ops;
- op_end = op + sizeof(thread->workload->ops);
+ op = workload->ops;
+ op_end = op + sizeof(workload->ops);
if ((ops_per_txn != 0 || opts->log_like_table) &&
(ret = session->begin_transaction(session, NULL)) != 0) {
@@ -564,6 +626,8 @@ worker(void *arg)
}
while (!wtperf->stop) {
+ if (workload->pause != 0)
+ (void)sleep((unsigned int)workload->pause);
/*
* Generate the next key and setup operation specific
* statistics tracking objects.
@@ -603,10 +667,12 @@ worker(void *arg)
generate_key(opts, key_buf, next_val);
- /*
- * Spread the data out around the multiple databases.
- */
- cursor = cursors[map_key_to_table(wtperf->opts, next_val)];
+ if (workload->table_index == INT32_MAX)
+ /*
+ * Spread the data out around the multiple databases.
+ */
+ cursor = cursors[
+ map_key_to_table(wtperf->opts, next_val)];
/*
* Skip the first time we do an operation, when trk->ops
@@ -642,7 +708,8 @@ worker(void *arg)
* for several operations, confirming that the
* next key is in the correct order.
*/
- ret = do_range_reads(wtperf, cursor);
+ ret = do_range_reads(wtperf,
+ cursor, workload->read_range);
}
if (ret == 0 || ret == WT_NOTFOUND)
@@ -689,7 +756,7 @@ worker(void *arg)
*/
strncpy(value_buf,
value, opts->value_sz_max - 1);
- if (thread->workload->update_delta != 0)
+ if (workload->update_delta != 0)
update_value_delta(thread);
if (value_buf[0] == 'a')
value_buf[0] = 'b';
@@ -806,7 +873,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) {
/* Schedule the next operation */
if (++op == op_end)
- op = thread->workload->ops;
+ op = workload->ops;
/*
* Decrement throttle ops and check if we should sleep
@@ -843,7 +910,7 @@ run_mix_schedule_op(WORKLOAD *workp, int op, int64_t op_cnt)
uint8_t *p, *end;
/* Jump around the array to roughly spread out the operations. */
- jump = 100 / op_cnt;
+ jump = (int)(100 / op_cnt);
/*
* Find a read operation and replace it with another operation. This
@@ -884,17 +951,6 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp)
opts = wtperf->opts;
- /* Confirm reads, inserts, truncates and updates cannot all be zero. */
- if (workp->insert == 0 && workp->read == 0 &&
- workp->truncate == 0 && workp->update == 0) {
- lprintf(wtperf, EINVAL, 0, "no operations scheduled");
- return (EINVAL);
- }
-
- /*
- * Handle truncate first - it's a special case that can't be used in
- * a mixed workload.
- */
if (workp->truncate != 0) {
if (workp->insert != 0 ||
workp->read != 0 || workp->update != 0) {
@@ -906,6 +962,12 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp)
return (0);
}
+ /* Confirm reads, inserts and updates cannot all be zero. */
+ if (workp->insert == 0 && workp->read == 0 && workp->update == 0) {
+ lprintf(wtperf, EINVAL, 0, "no operations scheduled");
+ return (EINVAL);
+ }
+
/*
* Check for a simple case where the thread is only doing insert or
* update operations (because the default operation for a
@@ -2244,6 +2306,8 @@ start_run(WTPERF *wtperf)
opts->checkpoint_threads, checkpoint_worker) != 0)
goto err;
}
+ if (opts->pre_load_data && (ret = pre_load_data(wtperf)) != 0)
+ goto err;
/* Execute the workload. */
if ((ret = execute_workload(wtperf)) != 0)
goto err;
@@ -2827,14 +2891,43 @@ static uint64_t
wtperf_rand(WTPERF_THREAD *thread)
{
CONFIG_OPTS *opts;
+ WT_CURSOR *rnd_cursor;
WTPERF *wtperf;
double S1, S2, U;
uint64_t rval;
+ int ret;
+ char *key_buf;
wtperf = thread->wtperf;
opts = wtperf->opts;
/*
+ * If we have a random cursor set up then use it.
+ */
+ if ((rnd_cursor = thread->rand_cursor) != NULL) {
+ if ((ret = rnd_cursor->next(rnd_cursor)) != 0) {
+ lprintf(wtperf, ret, 0, "worker: rand next failed");
+ /* 0 is outside the expected range. */
+ return (0);
+ }
+ if ((ret = rnd_cursor->get_key(rnd_cursor, &key_buf)) != 0) {
+ lprintf(wtperf, ret, 0,
+ "worker: rand next key retrieval");
+ return (0);
+ }
+ /*
+ * Resetting the cursor is not fatal. We still return the
+ * value we retrieved above. We do it so that we don't
+ * leave a cursor positioned.
+ */
+ if ((ret = rnd_cursor->reset(rnd_cursor)) != 0)
+ lprintf(wtperf, ret, 0,
+ "worker: rand cursor reset failed");
+ extract_key(key_buf, &rval);
+ return (rval);
+ }
+
+ /*
* Use WiredTiger's random number routine: it's lock-free and fairly
* good.
*/
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index 81d74e134f6..3efb8ab700e 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -66,6 +66,9 @@ typedef struct {
uint64_t throttle; /* Maximum operations/second */
/* Number of operations per transaction. Zero for autocommit */
int64_t ops_per_txn;
+ int64_t pause; /* Time between scans */
+ int64_t read_range; /* Range of reads */
+ int32_t table_index; /* Table to focus ops on */
int64_t truncate; /* Truncate ratio */
uint64_t truncate_pct; /* Truncate Percent */
uint64_t truncate_count; /* Truncate Count */
@@ -225,6 +228,7 @@ typedef struct {
struct __wtperf_thread { /* Per-thread structure */
WTPERF *wtperf; /* Enclosing configuration */
+ WT_CURSOR *rand_cursor; /* Random key cursor */
WT_RAND_STATE rnd; /* Random number generation state */
diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i
index 680eb53a90e..63cef4c28fb 100644
--- a/bench/wtperf/wtperf_opt.i
+++ b/bench/wtperf/wtperf_opt.i
@@ -145,12 +145,13 @@ DEF_OPT_AS_UINT32(populate_ops_per_txn, 0,
"phase, zero for auto-commit")
DEF_OPT_AS_UINT32(populate_threads, 1,
"number of populate threads, 1 for bulk load")
+DEF_OPT_AS_BOOL(pre_load_data, 0,
+ "Scan all data prior to starting the workload phase to warm the cache")
DEF_OPT_AS_UINT32(random_range, 0,
"if non zero choose a value from within this range as the key for "
"insert operations")
DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value")
DEF_OPT_AS_BOOL(range_partition, 0, "partition data by range (vs hash)")
-DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search")
DEF_OPT_AS_BOOL(readonly, 0,
"reopen the connection between populate and workload phases in readonly "
"mode. Requires reopen_connection turned on (default). Requires that "
@@ -192,9 +193,10 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' "
"'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' "
"which would create 2 threads doing nothing but reads and 8 threads "
"each doing 50% inserts and 25% reads and updates. Allowed configuration "
- "values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', "
- "'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are "
- "also behavior modifiers, supported modifiers are 'ops_per_txn'")
+ "values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', "
+ "'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. "
+ "There are also behavior modifiers, supported modifiers are "
+ "'ops_per_txn'")
DEF_OPT_AS_CONFIG_STRING(transaction_config, "",
"WT_SESSION.begin_transaction configuration string, applied during the "
"populate phase when populate_ops_per_txn is nonzero")
diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs
index 01f23dcbbc1..4ecec37ca6c 100644
--- a/build_posix/Make.subdirs
+++ b/build_posix/Make.subdirs
@@ -17,6 +17,7 @@ ext/encryptors/nop
ext/encryptors/rotn
ext/extractors/csv
ext/test/kvs_bdb HAVE_BERKELEY_DB
+ext/test/fail_fs
.
api/leveldb LEVELDB
examples/c
diff --git a/dist/api_data.py b/dist/api_data.py
index 324d1e4f281..1d669fa7fe0 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -524,6 +524,7 @@ connection_runtime_config = [
'checkpoint',
'compact',
'evict',
+ 'evict_stuck',
'evictserver',
'fileops',
'handleops',
@@ -717,7 +718,7 @@ wiredtiger_open_common =\
]),
Config('extensions', '', r'''
list of shared library extensions to load (using dlopen).
- Any values specified to an library extension are passed to
+ Any values specified to a library extension are passed to
WT_CONNECTION::load_extension as the \c config parameter
(for example,
<code>extensions=(/path/ext.so={entry=my_entry})</code>)''',
diff --git a/dist/filelist b/dist/filelist
index 13d67ef961b..3886035eaa9 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -30,6 +30,7 @@ src/btree/bt_io.c
src/btree/bt_misc.c
src/btree/bt_ovfl.c
src/btree/bt_page.c
+src/btree/bt_random.c
src/btree/bt_read.c
src/btree/bt_rebalance.c
src/btree/bt_ret.c
diff --git a/dist/flags.py b/dist/flags.py
index 70e18712839..b20a7181532 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -64,6 +64,7 @@ flags = {
'VERB_COMPACT',
'VERB_EVICT',
'VERB_EVICTSERVER',
+ 'VERB_EVICT_STUCK',
'VERB_FILEOPS',
'VERB_HANDLEOPS',
'VERB_LOG',
@@ -116,12 +117,14 @@ flags = {
'SESSION_CAN_WAIT',
'SESSION_INTERNAL',
'SESSION_LOCKED_CHECKPOINT',
- 'SESSION_LOCKED_HANDLE_LIST',
+ 'SESSION_LOCKED_HANDLE_LIST_READ',
+ 'SESSION_LOCKED_HANDLE_LIST_WRITE',
'SESSION_LOCKED_METADATA',
'SESSION_LOCKED_PASS',
'SESSION_LOCKED_SCHEMA',
'SESSION_LOCKED_SLOT',
- 'SESSION_LOCKED_TABLE',
+ 'SESSION_LOCKED_TABLE_READ',
+ 'SESSION_LOCKED_TABLE_WRITE',
'SESSION_LOCKED_TURTLE',
'SESSION_LOGGING_INMEM',
'SESSION_LOOKASIDE_CURSOR',
diff --git a/dist/s_define.list b/dist/s_define.list
index 53a3df87615..8911d888077 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -39,6 +39,8 @@ WT_PADDING_CHECK
WT_READ_BARRIER
WT_REF_SIZE
WT_SESSION_LOCKED_CHECKPOINT
+WT_SESSION_LOCKED_TABLE_READ
+WT_SESSION_LOCKED_TABLE_WRITE
WT_SESSION_LOCKED_TURTLE
WT_SIZE_CHECK
WT_STATS_FIELD_TO_OFFSET
diff --git a/dist/s_stat b/dist/s_stat
index 5d5937e1833..6aeeca6faa6 100755
--- a/dist/s_stat
+++ b/dist/s_stat
@@ -25,9 +25,6 @@ cat << UNUSED_STAT_FIELDS
lock_checkpoint_count
lock_checkpoint_wait_application
lock_checkpoint_wait_internal
-lock_handle_list_count
-lock_handle_list_wait_application
-lock_handle_list_wait_internal
lock_metadata_count
lock_metadata_wait_application
lock_metadata_wait_internal
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 2b998c27813..e033f77327f 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -770,6 +770,7 @@ idx
ifdef
ifdef's
iiSii
+iiiS
iiii
iiu
ikey
@@ -1138,6 +1139,7 @@ subgetraw
subgets
subinit
sublicense
+subtest
subtree
sunique
superset
@@ -1182,6 +1184,7 @@ txt
typedef
uB
uS
+ui
uint
uintmax
unbare
@@ -1217,6 +1220,7 @@ upg
uri
uri's
uris
+usec
usecs
usedp
userbad
@@ -1247,6 +1251,9 @@ vunpack
vw
vxr
waitpid
+waker
+wakeup
+wakeups
walk's
warmup
wb
diff --git a/dist/s_void b/dist/s_void
index 025f6d4c7eb..90425d5a718 100755
--- a/dist/s_void
+++ b/dist/s_void
@@ -78,10 +78,20 @@ func_ok()
-e '/int demo_file_sync$/d' \
-e '/int demo_fs_directory_list_free$/d' \
-e '/int demo_fs_exist$/d' \
+ -e '/int fail_file_lock$/d' \
+ -e '/int fail_file_sync$/d' \
+ -e '/int fail_fs_directory_list_free$/d' \
+ -e '/int fail_fs_exist$/d' \
+ -e '/int fail_fs_simulate_fail$/d' \
+ -e '/int fail_fs_terminate$/d' \
-e '/int handle_message$/d' \
-e '/int handle_progress$/d' \
-e '/int helium_cursor_reset$/d' \
-e '/int helium_session_verify$/d' \
+ -e '/int index_compare_primary$/d' \
+ -e '/int index_compare_S$/d' \
+ -e '/int index_compare_u$/d' \
+ -e '/int index_extractor_u$/d' \
-e '/int log_print_err$/d' \
-e '/int lz4_error$/d' \
-e '/int lz4_pre_size$/d' \
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 0af5d6d017e..a4d92345f88 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -288,9 +288,7 @@ connection_stats = [
LockStat('lock_checkpoint_count', 'checkpoint lock acquisitions'),
LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'),
LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'),
- LockStat('lock_handle_list_count', 'handle-list lock acquisitions'),
- LockStat('lock_handle_list_wait_application', 'handle-list lock application thread wait time (usecs)'),
- LockStat('lock_handle_list_wait_internal', 'handle-list lock internal thread wait time (usecs)'),
+ LockStat('lock_handle_list_wait_eviction', 'handle-list lock eviction thread wait time (usecs)'),
LockStat('lock_metadata_count', 'metadata lock acquisitions'),
LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'),
LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'),
diff --git a/examples/c/ex_file_system.c b/examples/c/ex_file_system.c
index 56869171558..e807ac54d3b 100644
--- a/examples/c/ex_file_system.c
+++ b/examples/c/ex_file_system.c
@@ -399,6 +399,7 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system,
uint32_t allocated, count;
int ret = 0;
char *name, **entries;
+ void *p;
(void)session; /* Unused */
@@ -424,14 +425,16 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system,
* matter if the list is a bit longer than necessary.
*/
if (count >= allocated) {
- entries = realloc(
- entries, (allocated + 10) * sizeof(char *));
- if (entries == NULL) {
+ p = realloc(
+ entries, (allocated + 10) * sizeof(*entries));
+ if (p == NULL) {
ret = ENOMEM;
goto err;
}
- memset(entries + allocated * sizeof(char *),
- 0, 10 * sizeof(char *));
+
+ entries = p;
+ memset(entries + allocated * sizeof(*entries),
+ 0, 10 * sizeof(*entries));
allocated += 10;
}
entries[count++] = strdup(name);
diff --git a/ext/test/fail_fs/Makefile.am b/ext/test/fail_fs/Makefile.am
new file mode 100644
index 00000000000..f31f5395cd1
--- /dev/null
+++ b/ext/test/fail_fs/Makefile.am
@@ -0,0 +1,9 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+noinst_LTLIBRARIES = libwiredtiger_fail_fs.la
+libwiredtiger_fail_fs_la_SOURCES = fail_fs.c
+
+# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
+# as installation, so only static libraries get built. As far as I can tell,
+# the "approved" libtool way to turn them back on is by adding -rpath.
+libwiredtiger_fail_fs_la_LDFLAGS = -avoid-version -module -rpath /nowhere
diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c
new file mode 100644
index 00000000000..d0d8a14c8c2
--- /dev/null
+++ b/ext/test/fail_fs/fail_fs.c
@@ -0,0 +1,847 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <execinfo.h>
+
+#include <wiredtiger_ext.h>
+#include "queue.h"
+
+#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024)
+
+#define FAIL_FS_ENV_ENABLE "WT_FAIL_FS_ENABLE"
+#define FAIL_FS_ENV_WRITE_ALLOW "WT_FAIL_FS_WRITE_ALLOW"
+#define FAIL_FS_ENV_READ_ALLOW "WT_FAIL_FS_READ_ALLOW"
+
+/*
+ * A "fail file system", that is, a file system extension that fails when we
+ * want it to. This is only used in test frameworks, this fact allows us to
+ * simplify some error paths. This code is not portable to Windows, as it has
+ * direct knowledge of file descriptors, environment variables and stack
+ * traces.
+ *
+ * When the filesystem extension is configured, parameters can set how many
+ * reads or writes can be allowed before failure. If this is not fine-grained
+ * enough, an 'environment' configuration parameter can be specified. If that
+ * is used, then on every file system read or write, environment variables are
+ * checked that control when reading or writing should fail.
+ */
+typedef struct {
+ /* WiredTiger file system interface; the structure is cast to/from it. */
+ WT_FILE_SYSTEM iface;
+ /*
+ * WiredTiger performs schema and I/O operations in parallel, all file
+ * system and file handle access must be thread-safe. This extension
+ * uses a single, global file system lock.
+ */
+ pthread_rwlock_t lock; /* Lock */
+ /* Simulated failures are only considered while this is true. */
+ bool fail_enabled;
+ /* Set by the "environment" option: consult WT_FAIL_FS_* on each I/O. */
+ bool use_environment;
+ /* Set by the "verbose" option: log opens and simulated failures. */
+ bool verbose;
+ /* Running counts of read and write calls, incremented under the lock. */
+ int64_t read_ops;
+ int64_t write_ops;
+ /*
+ * A read (write) fails whenever the matching operation count is a
+ * multiple of allow_reads (allow_writes); zero disables failures.
+ */
+ int64_t allow_reads;
+ int64_t allow_writes;
+ /* Queue of file handles */
+ TAILQ_HEAD(fail_file_handle_qh, fail_file_handle) fileq;
+ WT_EXTENSION_API *wtext; /* Extension functions */
+} FAIL_FILE_SYSTEM;
+
+/*
+ * Per-open file handle. One is allocated by fail_fs_open for every open call
+ * and linked on the owning file system's queue.
+ */
+typedef struct fail_file_handle {
+ /* WiredTiger file handle interface; the structure is cast to/from it. */
+ WT_FILE_HANDLE iface;
+
+ /*
+ * Track the system file descriptor for each file.
+ */
+ FAIL_FILE_SYSTEM *fail_fs; /* Enclosing file system */
+ TAILQ_ENTRY(fail_file_handle) q; /* Queue of handles */
+ /* fd is -1 for directory handles, which never open a descriptor. */
+ int fd; /* System file descriptor */
+} FAIL_FILE_HANDLE;
+
+/*
+ * Forward declarations: file handle methods (fail_file_*) followed by file
+ * system methods and helpers (fail_fs_*), all private to this file.
+ */
+static int fail_file_close(WT_FILE_HANDLE *, WT_SESSION *);
+static void fail_file_handle_remove(WT_SESSION *, FAIL_FILE_HANDLE *);
+static int fail_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool);
+static int fail_file_read(
+ WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *);
+static int fail_file_size(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *);
+static int fail_file_sync(WT_FILE_HANDLE *, WT_SESSION *);
+static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t);
+static int fail_file_write(
+ WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *);
+static bool fail_fs_arg(
+ const char *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, int64_t *);
+static int fail_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *,
+ const char *, const char *, char ***, uint32_t *);
+static int fail_fs_directory_list_free(
+ WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t);
+static void fail_fs_env(const char *, int64_t *);
+static int fail_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *);
+static int fail_fs_open(WT_FILE_SYSTEM *, WT_SESSION *,
+ const char *, WT_FS_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **);
+static int fail_fs_remove(
+ WT_FILE_SYSTEM *, WT_SESSION *, const char *, uint32_t);
+static int fail_fs_rename(
+ WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, uint32_t);
+static int fail_fs_simulate_fail(
+ FAIL_FILE_HANDLE *, WT_SESSION *, int64_t, const char *);
+static int fail_fs_size(
+ WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *);
+static int fail_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *);
+
+/*
+ * We use pthread functions for portable locking. Any failure is fatal.
+ *
+ * The pthread calls are deliberately kept outside of assert(): assert()
+ * expands to nothing when NDEBUG is defined, which would silently remove
+ * every lock operation from the build. Abort explicitly instead.
+ */
+
+/*
+ * fail_fs_allocate_lock --
+ *	Initialize the file system lock, aborting on failure.
+ */
+static void
+fail_fs_allocate_lock(pthread_rwlock_t *lockp)
+{
+	if (pthread_rwlock_init(lockp, NULL) != 0)
+		abort();
+}
+
+/*
+ * fail_fs_destroy_lock --
+ *	Destroy the file system lock, aborting on failure.
+ */
+static void
+fail_fs_destroy_lock(pthread_rwlock_t *lockp)
+{
+	if (pthread_rwlock_destroy(lockp) != 0)
+		abort();
+}
+
+/*
+ * fail_fs_lock --
+ *	Acquire the file system lock exclusively, aborting on failure.
+ */
+static void
+fail_fs_lock(pthread_rwlock_t *lockp)
+{
+	if (pthread_rwlock_wrlock(lockp) != 0)
+		abort();
+}
+
+/*
+ * fail_fs_unlock --
+ *	Release the file system lock, aborting on failure.
+ */
+static void
+fail_fs_unlock(pthread_rwlock_t *lockp)
+{
+	if (pthread_rwlock_unlock(lockp) != 0)
+		abort();
+}
+
+/*
+ * fail_file_close --
+ *	Close a file handle: close the underlying descriptor (if any), unlink
+ *	the handle from the file system's queue and free it.
+ *
+ *	Returns 0 on success or the errno from close(2). The handle is always
+ *	released, even if close(2) fails.
+ */
+static int
+fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+	FAIL_FILE_SYSTEM *fail_fs;
+	int ret;
+
+	fail_fh = (FAIL_FILE_HANDLE *)file_handle;
+	fail_fs = fail_fh->fail_fs;
+	ret = 0;
+
+	/*
+	 * We don't actually open an fd when opening directories for flushing,
+	 * so there is nothing to close in that case. The handle itself must
+	 * still be released, otherwise every directory handle stays on the
+	 * queue (and allocated) until the file system is terminated.
+	 */
+	if (fail_fh->fd >= 0) {
+		/* Report the errno, not close's -1, to the caller. */
+		if (close(fail_fh->fd) != 0)
+			ret = errno;
+		fail_fh->fd = -1;
+	}
+
+	fail_fs_lock(&fail_fs->lock);
+	fail_file_handle_remove(session, fail_fh);
+	fail_fs_unlock(&fail_fs->lock);
+	return (ret);
+}
+
+/*
+ * fail_file_handle_remove --
+ *	Unlink a file handle from its file system's queue and free both the
+ *	handle's name and the handle itself. The caller must hold the file
+ *	system lock.
+ */
+static void
+fail_file_handle_remove(WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh)
+{
+	(void)session;				/* Unused */
+
+	/* Take the handle off the owning file system's list first. */
+	TAILQ_REMOVE(&fail_fh->fail_fs->fileq, fail_fh, q);
+
+	/* The name was allocated separately from the handle. */
+	free(fail_fh->iface.name);
+	free(fail_fh);
+}
+
+/*
+ * fail_file_lock --
+ *	Lock/unlock a file. This file system does no real locking: every
+ *	request succeeds immediately.
+ */
+static int
+fail_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *session, bool lock)
+{
+	/* None of the arguments are needed, locks are always granted. */
+	(void)lock;
+	(void)session;
+	(void)file_handle;
+
+	return (0);
+}
+
+/*
+ * fail_file_read --
+ *	POSIX pread, with optional simulated failure.
+ *
+ *	Reads len bytes at offset into buf. Returns 0 on success, EIO for a
+ *	simulated failure, the errno from pread(2) on a real failure, or
+ *	WT_ERROR if end-of-file is reached before len bytes are read.
+ */
+static int
+fail_file_read(WT_FILE_HANDLE *file_handle,
+    WT_SESSION *session, wt_off_t offset, size_t len, void *buf)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+	FAIL_FILE_SYSTEM *fail_fs;
+	WT_EXTENSION_API *wtext;
+	int64_t envint, read_ops;
+	int ret;
+	size_t chunk;
+	ssize_t nr;
+	uint8_t *addr;
+	bool fail;
+
+	fail_fh = (FAIL_FILE_HANDLE *)file_handle;
+	fail_fs = fail_fh->fail_fs;
+	wtext = fail_fs->wtext;
+	read_ops = 0;
+	ret = 0;
+
+	fail_fs_lock(&fail_fs->lock);
+
+	if (fail_fs->use_environment) {
+		fail_fs_env(FAIL_FS_ENV_ENABLE, &envint);
+		if (envint != 0) {
+			/*
+			 * On the transition to enabled, reload the limit and
+			 * restart the operation count.
+			 */
+			if (!fail_fs->fail_enabled) {
+				fail_fs->fail_enabled = true;
+				fail_fs_env(FAIL_FS_ENV_READ_ALLOW,
+				    &fail_fs->allow_reads);
+				fail_fs->read_ops = 0;
+			}
+			read_ops = ++fail_fs->read_ops;
+		} else
+			fail_fs->fail_enabled = false;
+	} else
+		read_ops = ++fail_fs->read_ops;
+
+	/*
+	 * Make the failure decision while still holding the lock: the enable
+	 * flag and the limit can be changed by other threads doing I/O.
+	 */
+	fail = fail_fs->fail_enabled && fail_fs->allow_reads != 0 &&
+	    read_ops % fail_fs->allow_reads == 0;
+
+	fail_fs_unlock(&fail_fs->lock);
+
+	if (fail)
+		return (fail_fs_simulate_fail(
+		    fail_fh, session, read_ops, "read"));
+
+	/* Break reads larger than 1GB into 1GB chunks. */
+	for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+		chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE;
+		if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) {
+			/*
+			 * Capture the error before calling anything that may
+			 * change errno. A zero-byte read is end-of-file, for
+			 * which errno is not meaningful.
+			 */
+			ret = (nr == 0 ? WT_ERROR : errno);
+			(void)wtext->err_printf(wtext, session,
+			    "%s: handle-read: failed to read %" PRIuMAX
+			    " bytes at offset %" PRIuMAX ": %s",
+			    fail_fh->iface.name,
+			    (uintmax_t)len, (uintmax_t)offset,
+			    wtext->strerror(wtext, NULL, ret));
+			break;
+		}
+	}
+	return (ret);
+}
+
+/*
+ * fail_file_size --
+ *	Get the size of a file in bytes, by file handle.
+ *
+ *	Returns 0 and sets *sizep on success, otherwise the errno from
+ *	fstat(2); *sizep is untouched on failure.
+ */
+static int
+fail_file_size(
+    WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t *sizep)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+	struct stat statbuf;
+
+	(void)session;				/* Unused */
+
+	fail_fh = (FAIL_FILE_HANDLE *)file_handle;
+
+	/* fstat returns -1 on failure, the error code is in errno. */
+	if (fstat(fail_fh->fd, &statbuf) != 0)
+		return (errno);
+	*sizep = statbuf.st_size;
+	return (0);
+}
+
+/*
+ * fail_file_sync --
+ *	Flush a file to stable storage. This file system does nothing here
+ *	and always reports success.
+ */
+static int
+fail_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
+{
+	/* Neither argument is needed for a no-op. */
+	(void)session;
+	(void)file_handle;
+
+	return (0);
+}
+
+/*
+ * fail_file_truncate --
+ *	POSIX ftruncate.
+ *
+ *	Returns 0 on success, otherwise the errno from ftruncate(2).
+ */
+static int
+fail_file_truncate(
+    WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t offset)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+
+	(void)session;				/* Unused */
+
+	fail_fh = (FAIL_FILE_HANDLE *)file_handle;
+
+	/* ftruncate returns -1 on failure, the error code is in errno. */
+	if (ftruncate(fail_fh->fd, offset) != 0)
+		return (errno);
+	return (0);
+}
+
+/*
+ * fail_file_write --
+ *	POSIX pwrite, with optional simulated failure.
+ *
+ *	Writes len bytes from buf at offset. Returns 0 on success, EIO for a
+ *	simulated failure, the errno from pwrite(2) on a real failure, or
+ *	WT_ERROR if pwrite reports that no bytes were written.
+ */
+static int
+fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session,
+    wt_off_t offset, size_t len, const void *buf)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+	FAIL_FILE_SYSTEM *fail_fs;
+	WT_EXTENSION_API *wtext;
+	int64_t envint, write_ops;
+	int ret;
+	size_t chunk;
+	ssize_t nr;
+	const uint8_t *addr;
+	bool fail;
+
+	fail_fh = (FAIL_FILE_HANDLE *)file_handle;
+	fail_fs = fail_fh->fail_fs;
+	wtext = fail_fs->wtext;
+	write_ops = 0;
+	ret = 0;
+
+	fail_fs_lock(&fail_fs->lock);
+
+	if (fail_fs->use_environment) {
+		fail_fs_env(FAIL_FS_ENV_ENABLE, &envint);
+		if (envint != 0) {
+			/*
+			 * On the transition to enabled, reload the limit and
+			 * restart the operation count.
+			 */
+			if (!fail_fs->fail_enabled) {
+				fail_fs->fail_enabled = true;
+				fail_fs_env(FAIL_FS_ENV_WRITE_ALLOW,
+				    &fail_fs->allow_writes);
+				fail_fs->write_ops = 0;
+			}
+			write_ops = ++fail_fs->write_ops;
+		} else
+			fail_fs->fail_enabled = false;
+	} else
+		write_ops = ++fail_fs->write_ops;
+
+	/*
+	 * Make the failure decision while still holding the lock: the enable
+	 * flag and the limit can be changed by other threads doing I/O.
+	 */
+	fail = fail_fs->fail_enabled && fail_fs->allow_writes != 0 &&
+	    write_ops % fail_fs->allow_writes == 0;
+
+	fail_fs_unlock(&fail_fs->lock);
+
+	if (fail)
+		return (fail_fs_simulate_fail(
+		    fail_fh, session, write_ops, "write"));
+
+	/* Break writes larger than 1GB into 1GB chunks. */
+	for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+		chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE;
+		if ((nr = pwrite(fail_fh->fd, addr, chunk, offset)) <= 0) {
+			/*
+			 * Capture the error before calling anything that may
+			 * change errno. errno is not meaningful when pwrite
+			 * returns zero.
+			 */
+			ret = (nr == 0 ? WT_ERROR : errno);
+			(void)wtext->err_printf(wtext, session,
+			    "%s: handle-write: failed to write %" PRIuMAX
+			    " bytes at offset %" PRIuMAX ": %s",
+			    fail_fh->iface.name,
+			    (uintmax_t)len, (uintmax_t)offset,
+			    wtext->strerror(wtext, NULL, ret));
+			break;
+		}
+	}
+	return (ret);
+}
+
+/*
+ * fail_fs_arg --
+ *	Check a configuration key against an expected name. When the key is
+ *	exactly that name and the value is a boolean or a number, store the
+ *	value in *argp and return true; otherwise return false and leave
+ *	*argp alone.
+ */
+static bool
+fail_fs_arg(const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value,
+    int64_t *argp)
+{
+	/* The key isn't nul-terminated: compare its bytes ... */
+	if (strncmp(match, key->str, key->len) != 0)
+		return (false);
+	/* ... then make sure the expected name isn't longer than the key. */
+	if (match[key->len] != '\0')
+		return (false);
+
+	/* Only boolean and numeric values carry an integer. */
+	if (value->type != WT_CONFIG_ITEM_BOOL &&
+	    value->type != WT_CONFIG_ITEM_NUM)
+		return (false);
+
+	*argp = value->val;
+	return (true);
+}
+
+/*
+ * fail_fs_directory_list --
+ *	Return a list of files in a given sub-directory.
+ *
+ *	The list is built from the names of the currently open file handles
+ *	that start with the directory name (and with the prefix, if one is
+ *	given). On success returns 0 and stores an allocated array of
+ *	allocated names in *dirlistp and its length in *countp; the caller
+ *	releases it with fail_fs_directory_list_free. On failure returns
+ *	ENOMEM, with *dirlistp set to NULL and *countp to 0.
+ */
+static int
+fail_fs_directory_list(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *directory,
+    const char *prefix, char ***dirlistp, uint32_t *countp)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+	FAIL_FILE_SYSTEM *fail_fs;
+	size_t len, prefix_len;
+	uint32_t allocated, count;
+	int ret;
+	char *name, **entries;
+	void *p;
+
+	(void)session;				/* Unused */
+
+	fail_fs = (FAIL_FILE_SYSTEM *)file_system;
+	ret = 0;
+	*dirlistp = NULL;
+	*countp = 0;
+
+	entries = NULL;
+	allocated = count = 0;
+	len = strlen(directory);
+	prefix_len = prefix == NULL ? 0 : strlen(prefix);
+
+	fail_fs_lock(&fail_fs->lock);
+	TAILQ_FOREACH(fail_fh, &fail_fs->fileq, q) {
+		name = fail_fh->iface.name;
+		if (strncmp(name, directory, len) != 0 ||
+		    (prefix != NULL && strncmp(name, prefix, prefix_len) != 0))
+			continue;
+
+		/*
+		 * Increase the list size in groups of 10, it doesn't
+		 * matter if the list is a bit longer than necessary.
+		 */
+		if (count >= allocated) {
+			p = realloc(
+			    entries, (allocated + 10) * sizeof(*entries));
+			if (p == NULL) {
+				ret = ENOMEM;
+				goto err;
+			}
+			entries = p;
+
+			/*
+			 * Clear the new slots. Pointer arithmetic already
+			 * scales by the element size, so the offset is in
+			 * elements, not bytes.
+			 */
+			memset(entries + allocated,
+			    0, 10 * sizeof(*entries));
+			allocated += 10;
+		}
+
+		/* Don't hand back a list containing NULL names. */
+		if ((entries[count] = strdup(name)) == NULL) {
+			ret = ENOMEM;
+			goto err;
+		}
+		++count;
+	}
+
+	*dirlistp = entries;
+	*countp = count;
+
+err:	fail_fs_unlock(&fail_fs->lock);
+	if (ret == 0)
+		return (0);
+
+	/* Failure: discard whatever part of the list was built. */
+	if (entries != NULL) {
+		while (count > 0)
+			free(entries[--count]);
+		free(entries);
+	}
+
+	return (ret);
+}
+
+/*
+ * fail_fs_directory_list_free --
+ *	Release a list returned by fail_fs_directory_list: each of the count
+ *	names, then the array itself. A NULL list is ignored.
+ */
+static int
+fail_fs_directory_list_free(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, char **dirlist, uint32_t count)
+{
+	uint32_t remaining;
+
+	(void)file_system;			/* Unused */
+	(void)session;				/* Unused */
+
+	if (dirlist == NULL)
+		return (0);
+
+	/* Free the names from the last one back to the first. */
+	for (remaining = count; remaining > 0; --remaining)
+		free(dirlist[remaining - 1]);
+	free(dirlist);
+	return (0);
+}
+
+/*
+ * fail_fs_env --
+ *	Look up an environment variable and store its integer value in *valp.
+ *	"true" yields 1 and "false" yields 0; anything else is parsed as a
+ *	base-10 integer. An unset variable, or a value with trailing
+ *	non-numeric text, yields 0.
+ */
+static void
+fail_fs_env(const char *name, int64_t *valp)
+{
+	int64_t parsed;
+	char *end, *text;
+
+	parsed = 0;
+	text = getenv(name);
+	if (text != NULL && strcmp(text, "false") != 0) {
+		if (strcmp(text, "true") == 0)
+			parsed = 1;
+		else {
+			/* Reject anything that isn't entirely a number. */
+			parsed = strtoll(text, &end, 10);
+			if (*end != '\0')
+				parsed = 0;
+		}
+	}
+
+	/* Store the result once, only after parsing is complete. */
+	*valp = parsed;
+}
+
+/*
+ * fail_fs_exist --
+ *	Report whether a file exists, as determined by access(2) with F_OK.
+ *	Always returns 0; the answer is stored in *existp.
+ */
+static int
+fail_fs_exist(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *name, bool *existp)
+{
+	(void)session;				/* Unused */
+	(void)file_system;			/* Unused */
+
+	/* Any access failure is treated as "does not exist". */
+	if (access(name, F_OK) == 0)
+		*existp = true;
+	else
+		*existp = false;
+	return (0);
+}
+
+/*
+ * fail_fs_open --
+ *	Open a file with open(2) and wrap it in a new file handle.
+ *
+ *	Directory opens don't open a descriptor, their handle has an fd of
+ *	-1. On success the handle is added to the file system's queue,
+ *	stored in *file_handlep and 0 is returned. On failure *file_handlep
+ *	is NULL and the errno from open(2), or ENOMEM, is returned.
+ */
+static int
+fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
+    const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
+    WT_FILE_HANDLE **file_handlep)
+{
+	FAIL_FILE_HANDLE *fail_fh;
+	FAIL_FILE_SYSTEM *fail_fs;
+	WT_FILE_HANDLE *file_handle;
+	int fd, open_flags, ret;
+
+	*file_handlep = NULL;
+
+	fail_fh = NULL;
+	fail_fs = (FAIL_FILE_SYSTEM *)file_system;
+	fd = -1;
+	ret = 0;
+
+	if (fail_fs->verbose)
+		(void)fail_fs->wtext->msg_printf(
+		    fail_fs->wtext, session, "fail_fs: open: %s", name);
+
+	fail_fs_lock(&fail_fs->lock);
+
+	/* Translate the WiredTiger open flags into open(2) flags. */
+	open_flags = (flags & WT_FS_OPEN_READONLY) != 0 ? O_RDONLY : O_RDWR;
+	if ((flags & WT_FS_OPEN_CREATE) != 0)
+		open_flags |= O_CREAT;
+	if ((flags & WT_FS_OPEN_EXCLUSIVE) != 0)
+		open_flags |= O_EXCL;
+
+	/*
+	 * Opening a file handle on a directory is only to support filesystems
+	 * that require a directory sync for durability. This is a no-op
+	 * for this file system, so only regular opens get a descriptor.
+	 */
+	if (file_type != WT_FS_OPEN_FILE_TYPE_DIRECTORY &&
+	    (fd = open(name, open_flags, 0666)) < 0) {
+		ret = errno;
+		goto err;
+	}
+
+	/* We create a handle structure for each open. */
+	fail_fh = calloc(1, sizeof(FAIL_FILE_HANDLE));
+	if (fail_fh == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+
+	/* Initialize private information. */
+	fail_fh->fd = fd;
+	fail_fh->fail_fs = fail_fs;
+
+	/* Initialize public information. */
+	file_handle = (WT_FILE_HANDLE *)fail_fh;
+	file_handle->name = strdup(name);
+	if (file_handle->name == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+
+	/* Setup the function call table: the methods we implement ... */
+	file_handle->close = fail_file_close;
+	file_handle->fh_lock = fail_file_lock;
+	file_handle->fh_read = fail_file_read;
+	file_handle->fh_size = fail_file_size;
+	file_handle->fh_sync = fail_file_sync;
+	file_handle->fh_truncate = fail_file_truncate;
+	file_handle->fh_write = fail_file_write;
+
+	/* ... and the ones we don't. */
+	file_handle->fh_advise = NULL;
+	file_handle->fh_extend = NULL;
+	file_handle->fh_extend_nolock = NULL;
+	file_handle->fh_map = NULL;
+	file_handle->fh_map_discard = NULL;
+	file_handle->fh_map_preload = NULL;
+	file_handle->fh_unmap = NULL;
+	file_handle->fh_sync_nowait = NULL;
+
+	TAILQ_INSERT_HEAD(&fail_fs->fileq, fail_fh, q);
+
+	*file_handlep = file_handle;
+
+	fail_fs_unlock(&fail_fs->lock);
+	return (0);
+
+err:	/* Undo whatever was done before the failure. */
+	if (fd != -1)
+		(void)close(fd);
+	free(fail_fh);
+
+	fail_fs_unlock(&fail_fs->lock);
+	return (ret);
+}
+
+/*
+ * fail_fs_remove --
+ *	POSIX remove.
+ *
+ *	Returns 0 on success, otherwise the errno from unlink(2).
+ */
+static int
+fail_fs_remove(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *name, uint32_t flags)
+{
+	(void)file_system;			/* Unused */
+	(void)session;				/* Unused */
+	(void)flags;				/* Unused */
+
+	/* unlink returns -1 on failure, the error code is in errno. */
+	if (unlink(name) != 0)
+		return (errno);
+	return (0);
+}
+
+/*
+ * fail_fs_rename --
+ *	POSIX rename.
+ *
+ *	Returns 0 on success, otherwise the errno from rename(2).
+ */
+static int
+fail_fs_rename(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *from, const char *to, uint32_t flags)
+{
+	(void)file_system;			/* Unused */
+	(void)session;				/* Unused */
+	(void)flags;				/* Unused */
+
+	/* rename returns -1 on failure, the error code is in errno. */
+	if (rename(from, to) != 0)
+		return (errno);
+	return (0);
+}
+
+/*
+ * fail_fs_simulate_fail --
+ *	Report a simulated failure and return EIO. In verbose mode a message
+ *	naming the file, the operation count and the operation kind is
+ *	logged, followed by a stack trace of the caller.
+ */
+static int
+fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session,
+    int64_t nops, const char *opkind)
+{
+	WT_EXTENSION_API *wtext;
+#ifdef __FreeBSD__
+	size_t depth, frame;
+#else
+	int depth, frame;
+#endif
+	void *frames[100];
+	char **symbols;
+
+	/* Nothing to report unless verbose, just fail. */
+	if (!fail_fh->fail_fs->verbose)
+		return (EIO);
+
+	wtext = fail_fh->fail_fs->wtext;
+	(void)wtext->msg_printf(wtext, session,
+	    "fail_fs: %s: simulated failure after %" PRId64
+	    " %s operations", fail_fh->iface.name, nops, opkind);
+
+	/* The FreeBSD backtrace takes and returns a size_t. */
+#ifdef __FreeBSD__
+	depth = backtrace(frames, sizeof(frames) / sizeof(frames[0]));
+#else
+	depth = backtrace(frames, (int)(sizeof(frames) / sizeof(frames[0])));
+#endif
+	symbols = backtrace_symbols(frames, depth);
+	if (symbols != NULL) {
+		for (frame = 0; frame < depth; ++frame)
+			(void)wtext->msg_printf(wtext, session, " %s",
+			    symbols[frame]);
+		/* The strings live in the same allocation as the array. */
+		free(symbols);
+	}
+	return (EIO);
+}
+
+/*
+ * fail_fs_size --
+ *	Get the size of a file in bytes, by file name.
+ *
+ *	Returns 0 and sets *sizep on success, otherwise the errno from
+ *	stat(2); *sizep is untouched on failure.
+ */
+static int
+fail_fs_size(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *name, wt_off_t *sizep)
+{
+	struct stat statbuf;
+
+	(void)file_system;			/* Unused */
+	(void)session;				/* Unused */
+
+	/* stat returns -1 on failure, the error code is in errno. */
+	if (stat(name, &statbuf) != 0)
+		return (errno);
+	*sizep = statbuf.st_size;
+	return (0);
+}
+
+/*
+ * fail_fs_terminate --
+ *	Discard the file system on termination: release every file handle
+ *	still on the queue, destroy the lock and free the file system
+ *	structure. Always returns 0.
+ */
+static int
+fail_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session)
+{
+	FAIL_FILE_HANDLE *head;
+	FAIL_FILE_SYSTEM *fail_fs;
+
+	fail_fs = (FAIL_FILE_SYSTEM *)file_system;
+
+	/* Removing the head exposes the next handle, repeat until empty. */
+	for (;;) {
+		head = TAILQ_FIRST(&fail_fs->fileq);
+		if (head == NULL)
+			break;
+		fail_file_handle_remove(session, head);
+	}
+
+	fail_fs_destroy_lock(&fail_fs->lock);
+	free(fail_fs);
+
+	return (0);
+}
+
+/*
+ * wiredtiger_extension_init --
+ *	WiredTiger fail filesystem extension.
+ *
+ *	Allocates the file system, applies the "environment", "verbose",
+ *	"allow_writes" and "allow_reads" configuration values and registers
+ *	the file system with the connection. Returns 0 on success; on failure
+ *	returns a non-zero error code with everything allocated here released.
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config)
+{
+	FAIL_FILE_SYSTEM *fail_fs;
+	WT_CONFIG_ITEM k, v;
+	WT_CONFIG_PARSER *config_parser;
+	WT_EXTENSION_API *wtext;
+	WT_FILE_SYSTEM *file_system;
+	int64_t argval;
+	int ret;
+	bool lock_allocated;
+
+	config_parser = NULL;
+	lock_allocated = false;
+	ret = 0;
+	wtext = conn->get_extension_api(conn);
+	if ((fail_fs = calloc(1, sizeof(FAIL_FILE_SYSTEM))) == NULL) {
+		(void)wtext->err_printf(wtext, NULL,
+		    "fail_file_system extension_init: %s",
+		    wtext->strerror(wtext, NULL, ENOMEM));
+		return (ENOMEM);
+	}
+	fail_fs->wtext = wtext;
+	file_system = (WT_FILE_SYSTEM *)fail_fs;
+
+	/* Don't rely on zeroed memory being a valid empty queue. */
+	TAILQ_INIT(&fail_fs->fileq);
+
+	/* Get any configuration values. */
+	if ((ret = wtext->config_parser_open_arg(
+	    wtext, NULL, config, &config_parser)) != 0) {
+		(void)wtext->err_printf(wtext, NULL,
+		    "WT_EXTENSION_API.config_parser_open: config: %s",
+		    wtext->strerror(wtext, NULL, ret));
+		config_parser = NULL;
+		goto err;
+	}
+	while ((ret = config_parser->next(config_parser, &k, &v)) == 0) {
+		if (fail_fs_arg("environment", &k, &v, &argval)) {
+			fail_fs->use_environment = (argval != 0);
+			continue;
+		} else if (fail_fs_arg("verbose", &k, &v, &argval)) {
+			fail_fs->verbose = (argval != 0);
+			continue;
+		} else if (fail_fs_arg("allow_writes", &k, &v,
+		    &fail_fs->allow_writes))
+			continue;
+		else if (fail_fs_arg("allow_reads", &k, &v,
+		    &fail_fs->allow_reads))
+			continue;
+
+		/*
+		 * The parser call succeeded, so ret is zero here: set a real
+		 * error, otherwise we'd free the file system and then report
+		 * success to the caller.
+		 */
+		ret = EINVAL;
+		(void)wtext->err_printf(wtext, NULL,
+		    "WT_CONFIG_PARSER.next: unexpected configuration "
+		    "information: %.*s=%.*s: %s",
+		    (int)k.len, k.str, (int)v.len, v.str,
+		    wtext->strerror(wtext, NULL, ret));
+		goto err;
+	}
+	if (ret != WT_NOTFOUND) {
+		(void)wtext->err_printf(wtext, NULL,
+		    "WT_CONFIG_PARSER.next: config: %s",
+		    wtext->strerror(wtext, NULL, ret));
+		goto err;
+	}
+
+	/* The parser is gone after close, whether or not close succeeds. */
+	ret = config_parser->close(config_parser);
+	config_parser = NULL;
+	if (ret != 0) {
+		(void)wtext->err_printf(wtext, NULL,
+		    "WT_CONFIG_PARSER.close: config: %s",
+		    wtext->strerror(wtext, NULL, ret));
+		goto err;
+	}
+	if (fail_fs->allow_writes != 0 || fail_fs->allow_reads != 0)
+		fail_fs->fail_enabled = true;
+
+	fail_fs_allocate_lock(&fail_fs->lock);
+	lock_allocated = true;
+
+	/* Initialize the in-memory jump table. */
+	file_system->fs_directory_list = fail_fs_directory_list;
+	file_system->fs_directory_list_free = fail_fs_directory_list_free;
+	file_system->fs_exist = fail_fs_exist;
+	file_system->fs_open_file = fail_fs_open;
+	file_system->fs_remove = fail_fs_remove;
+	file_system->fs_rename = fail_fs_rename;
+	file_system->fs_size = fail_fs_size;
+	file_system->terminate = fail_fs_terminate;
+	if ((ret = conn->set_file_system(conn, file_system, NULL)) != 0) {
+		(void)wtext->err_printf(wtext, NULL,
+		    "WT_CONNECTION.set_file_system: %s",
+		    wtext->strerror(wtext, NULL, ret));
+		goto err;
+	}
+	return (0);
+
+err:	/* Release everything acquired so far, keeping the first error. */
+	if (config_parser != NULL)
+		(void)config_parser->close(config_parser);
+	if (lock_allocated)
+		fail_fs_destroy_lock(&fail_fs->lock);
+	free(fail_fs);
+	return (ret);
+}
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 54bcb7cd26c..026a008188c 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -240,8 +240,7 @@ __async_start(WT_SESSION_IMPL *session)
async = conn->async;
TAILQ_INIT(&async->formatqh);
WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
- WT_RET(__wt_cond_alloc(
- session, "async flush", false, &async->flush_cond));
+ WT_RET(__wt_cond_alloc(session, "async flush", &async->flush_cond));
WT_RET(__wt_async_op_init(session));
/*
@@ -541,7 +540,7 @@ retry:
async->flush_op.state = WT_ASYNCOP_READY;
WT_RET(__wt_async_op_enqueue(session, &async->flush_op));
while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE)
- __wt_cond_wait(session, async->flush_cond, 100000);
+ __wt_cond_wait(session, async->flush_cond, 100000, NULL);
/*
* Flush is done. Clear the flags.
*/
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index b1bc3902f7c..11f59ed14f1 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -107,7 +107,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen)
{
while (async->flush_state == WT_ASYNC_FLUSHING &&
async->flush_gen == my_gen)
- __wt_cond_wait(session, async->flush_cond, 10000);
+ __wt_cond_wait(session, async->flush_cond, 10000, NULL);
}
/*
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index d18b9b76992..5fde2237538 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -76,11 +76,11 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
}
/*
- * __cursor_valid --
+ * __wt_cursor_valid --
* Return if the cursor references an valid key/value pair.
*/
-static inline bool
-__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
+bool
+__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -330,7 +330,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, cbt->ref, false) :
__cursor_col_search(session, cbt, cbt->ref));
- valid = cbt->compare == 0 && __cursor_valid(cbt, &upd);
+ valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd);
}
if (!valid) {
WT_ERR(__cursor_func_init(cbt, true));
@@ -338,7 +338,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, false) :
__cursor_col_search(session, cbt, NULL));
- valid = cbt->compare == 0 && __cursor_valid(cbt, &upd);
+ valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd);
}
if (valid)
@@ -419,14 +419,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
* Ignore those cases, it makes things too complicated.
*/
if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1)
- valid = __cursor_valid(cbt, &upd);
+ valid = __wt_cursor_valid(cbt, &upd);
}
if (!valid) {
WT_ERR(__cursor_func_init(cbt, true));
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, true) :
__cursor_col_search(session, cbt, NULL));
- valid = __cursor_valid(cbt, &upd);
+ valid = __wt_cursor_valid(cbt, &upd);
}
/*
@@ -462,7 +462,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, true) :
__cursor_col_search(session, cbt, NULL));
- if (__cursor_valid(cbt, &upd)) {
+ if (__wt_cursor_valid(cbt, &upd)) {
exact = cbt->compare;
ret = __wt_kv_return(session, cbt, upd);
} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
@@ -537,7 +537,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
* Fail in that case, the record exists.
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
- ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
+ ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) ||
(cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
WT_ERR(WT_DUPLICATE_KEY);
@@ -552,7 +552,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
* key/value pair.
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
- cbt->compare == 0 && __cursor_valid(cbt, NULL))
+ cbt->compare == 0 && __wt_cursor_valid(cbt, NULL))
WT_ERR(WT_DUPLICATE_KEY);
ret = __cursor_row_modify(session, cbt, false);
@@ -682,12 +682,12 @@ retry: WT_RET(__cursor_func_init(cbt, true));
/*
* If we find a matching record, check whether an update would
* conflict. Do this before checking if the update is visible
- * in __cursor_valid, or we can miss conflict.
+ * in __wt_cursor_valid, or we can miss conflict.
*/
WT_ERR(__curfile_update_check(cbt));
/* Remove the record if it exists. */
- if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
+ if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) {
if (!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
/*
@@ -711,7 +711,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
/* Check whether an update would conflict. */
WT_ERR(__curfile_update_check(cbt));
- if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
ret = __cursor_row_modify(session, cbt, true);
@@ -786,7 +786,8 @@ retry: WT_RET(__cursor_func_init(cbt, true));
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
WT_ERR(__curfile_update_check(cbt));
- if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
+ if ((cbt->compare != 0 ||
+ !__wt_cursor_valid(cbt, NULL)) &&
!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
}
@@ -800,7 +801,7 @@ retry: WT_RET(__cursor_func_init(cbt, true));
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
WT_ERR(__curfile_update_check(cbt));
- if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
}
ret = __cursor_row_modify(session, cbt, false);
@@ -830,111 +831,6 @@ err: if (ret == WT_RESTART) {
}
/*
- * __wt_btcur_next_random --
- * Move to a random record in the tree. There are two algorithms, one
- * where we select a record at random from the whole tree on each
- * retrieval and one where we first select a record at random from the
- * whole tree, and then subsequently sample forward from that location.
- * The sampling approach allows us to select reasonably uniform random
- * points from unbalanced trees.
- */
-int
-__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- WT_UPDATE *upd;
- wt_off_t size;
- uint64_t skip;
-
- session = (WT_SESSION_IMPL *)cbt->iface.session;
- btree = cbt->btree;
-
- /*
- * Only supports row-store: applications can trivially select a random
- * value from a column-store, if there were any reason to do so.
- */
- if (btree->type != BTREE_ROW)
- WT_RET_MSG(session, ENOTSUP,
- "WT_CURSOR.next_random only supported by row-store tables");
-
- WT_STAT_CONN_INCR(session, cursor_next);
- WT_STAT_DATA_INCR(session, cursor_next);
-
- /*
- * If retrieving random values without sampling, or we don't have a
- * page reference, pick a roughly random leaf page in the tree.
- */
- if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
- /*
- * Skip past the sample size of the leaf pages in the tree
- * between each random key return to compensate for unbalanced
- * trees.
- *
- * Use the underlying file size divided by its block allocation
- * size as our guess of leaf pages in the file (this can be
- * entirely wrong, as it depends on how many pages are in this
- * particular checkpoint, how large the leaf and internal pages
- * really are, and other factors). Then, divide that value by
- * the configured sample size and increment the final result to
- * make sure tiny files don't leave us with a skip value of 0.
- *
- * !!!
- * Ideally, the number would be prime to avoid restart issues.
- */
- if (cbt->next_random_sample_size != 0) {
- WT_ERR(btree->bm->size(btree->bm, session, &size));
- cbt->next_random_leaf_skip = (uint64_t)
- ((size / btree->allocsize) /
- cbt->next_random_sample_size) + 1;
- }
-
- /*
- * Choose a leaf page from the tree.
- */
- WT_ERR(__cursor_func_init(cbt, true));
- WT_WITH_PAGE_INDEX(
- session, ret = __wt_row_random_descent(session, cbt));
- WT_ERR(ret);
- } else {
- /*
- * Read through the tree, skipping leaf pages. Be cautious about
- * the skip count: if the last leaf page skipped was also the
- * last leaf page in the tree, it may be set to zero on return
- * with the end-of-walk condition.
- *
- * Pages read for data sampling aren't "useful"; don't update
- * the read generation of pages already in memory, and if a page
- * is read, set its generation to a low value so it is evicted
- * quickly.
- */
- for (skip =
- cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;)
- WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
- WT_READ_NO_GEN |
- WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
- }
-
- /*
- * Select a random entry from the leaf page. If it's not valid, move to
- * the next entry, if that doesn't work, move to the previous entry.
- */
- WT_ERR(__wt_row_random_leaf(session, cbt));
- if (__cursor_valid(cbt, &upd))
- WT_ERR(__wt_kv_return(session, cbt, upd));
- else {
- if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
- ret = __wt_btcur_prev(cbt, false);
- WT_ERR(ret);
- }
- return (0);
-
-err: WT_TRET(__cursor_reset(cbt));
- return (ret);
-}
-
-/*
* __wt_btcur_compare --
* Return a comparison between two cursors.
*/
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index b62125e069d..d664da2ebd3 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -652,7 +652,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
page = ref->page;
mod = page->modify;
- WT_RET(ds->f(ds, "%p", (void *)page));
+ WT_RET(ds->f(ds, "%p", (void *)ref));
switch (page->type) {
case WT_PAGE_COL_INT:
@@ -699,8 +699,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", evict-lru"));
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
WT_RET(ds->f(ds, ", overflow-keys"));
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
- WT_RET(ds->f(ds, ", split-block"));
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
WT_RET(ds->f(ds, ", split-insert"));
if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE))
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
new file mode 100644
index 00000000000..44de511f787
--- /dev/null
+++ b/src/btree/bt_random.c
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_row_random_leaf --
+ * Return a random key from a row-store leaf page.
+ *
+ * The page examined is cbt->ref->page. If the page has on-disk entries, a
+ * random slot is stored in cbt->slot and the key is built by calling
+ * __wt_row_leaf_key with cbt->tmp; insert lists are not considered in that
+ * case. Otherwise a random item is selected from the page's "smallest"
+ * insert list and stored in cbt->ins and cbt->ins_head. Returns
+ * WT_NOTFOUND if the page has no on-disk entries and no such insert list.
+ */
+int
+__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *ins, **start, **stop;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ uint64_t samples;
+ uint32_t choice, entries, i;
+ int level;
+
+ page = cbt->ref->page;
+ start = stop = NULL; /* [-Wconditional-uninitialized] */
+ entries = 0; /* [-Wconditional-uninitialized] */
+
+ __cursor_pos_clear(cbt);
+
+ /* If the page has disk-based entries, select from them. */
+ if (page->entries != 0) {
+ cbt->compare = 0;
+ cbt->slot = __wt_random(&session->rnd) % page->entries;
+
+ /*
+ * The real row-store search function builds the key, so we
+ * have to as well.
+ */
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row + cbt->slot, cbt->tmp, false));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list.
+ *
+ * Walk down the list until we find a level with more than 50 entries,
+ * that's where we'll start rolling random numbers. The value 50 is
+ * used to ignore levels with only a few entries, that is, levels which
+ * are potentially badly skewed.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+ for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ start = &ins_head->head[level];
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+
+ if (entries > 50)
+ break;
+ }
+
+ /*
+ * If it's a tiny list and we went all the way to level 0, correct the
+ * level; entries is correctly set.
+ */
+ if (level < 0)
+ level = 0;
+
+ /*
+ * Step down the skip list levels, selecting a random chunk of the name
+ * space at each level.
+ *
+ * The samples counter accumulates the entry count seen at each level
+ * visited; it is only used at the end of the function to decide
+ * whether the page should be scheduled for eviction.
+ */
+ for (samples = entries; level > 0; samples += entries) {
+ /*
+ * There are (entries) or (entries + 1) chunks of the name space
+ * considered at each level. They are: between start and the 1st
+ * element, between the 1st and 2nd elements, and so on to the
+ * last chunk which is the name space after the stop element on
+ * the current level. This last chunk of name space may or may
+ * not be there: as we descend the levels of the skip list, this
+ * chunk may appear, depending if the next level down has
+ * entries logically after the stop point in the current level.
+ * We can't ignore those entries: because of the algorithm used
+ * to determine the depth of a skiplist, there may be a large
+ * number of entries "revealed" by descending a level.
+ *
+ * If the next level down has more items after the current stop
+ * point, there are (entries + 1) chunks to consider, else there
+ * are (entries) chunks.
+ *
+ * The stop pointer addresses a level-indexed slot (head[level]
+ * or next[level]), so (stop - 1) is the same element's slot for
+ * the level below.
+ */
+ if (*(stop - 1) == NULL)
+ choice = __wt_random(&session->rnd) % entries;
+ else
+ choice = __wt_random(&session->rnd) % (entries + 1);
+
+ if (choice == entries) {
+ /*
+ * We selected the name space after the stop element on
+ * this level. Set the start point to the current stop
+ * point, descend a level and move the stop element to
+ * the end of the list, that is, the end of the newly
+ * discovered name space, counting entries as we go.
+ */
+ start = stop;
+ --start;
+ --level;
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+ } else {
+ /*
+ * We selected another name space on the level. Move the
+ * start pointer the selected number of entries forward
+ * to the start of the selected chunk (if the selected
+ * number is 0, start won't move). Set the stop pointer
+ * to the next element in the list and drop both start
+ * and stop down a level.
+ */
+ for (i = 0; i < choice; ++i)
+ start = &(*start)->next[level];
+ stop = &(*start)->next[level];
+
+ --start;
+ --stop;
+ --level;
+
+ /* Count the entries in the selected name space. */
+ for (entries = 0,
+ ins = *start; ins != *stop; ins = ins->next[level])
+ ++entries;
+ }
+ }
+
+ /*
+ * When we reach the bottom level, entries will already be set. Select
+ * a random entry from the name space and return it.
+ *
+ * It should be impossible for the entries count to be 0 at this point,
+ * but check for it out of paranoia and to quiet static testing tools.
+ */
+ if (entries > 0)
+ entries = __wt_random(&session->rnd) % entries;
+ for (ins = *start; entries > 0; --entries)
+ ins = ins->next[0];
+
+ cbt->ins = ins;
+ cbt->ins_head = ins_head;
+ cbt->compare = 0;
+
+ /*
+ * Random lookups in newly created collections can be slow if a page
+ * consists of a large skiplist. Schedule the page for eviction if we
+ * encounter a large skiplist. This is worthwhile because applications
+ * that take a sample often take many samples, so the overhead of
+ * traversing the skip list each time accumulates to real time.
+ */
+ if (samples > 5000)
+ __wt_page_evict_soon(session, cbt->ref);
+
+ return (0);
+}
+
+/*
+ * __wt_random_descent --
+ * Find a random page in a tree for either sampling or eviction.
+ *
+ * The descent starts at the btree root and stops at the first page that
+ * is not an internal page. On a zero return *refp is set to the WT_REF
+ * reached, except when called for eviction and that WT_REF is the root,
+ * in which case *refp is not written. When called for sampling
+ * (eviction is false), WT_NOTFOUND is returned if an internal page with
+ * no in-memory or on-disk child is still being hit once the retry
+ * counter (initially 100) is exhausted.
+ */
+int
+__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t flags, i, entries, retry;
+
+ btree = S2BT(session);
+ current = NULL;
+ retry = 100;
+
+ /* Eviction should not be tapped to do eviction. */
+ if (eviction)
+ flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN |
+ WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
+ else
+ flags = WT_READ_RESTART_OK;
+
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root. This block is only entered by jumping to the
+ * restart label, the enclosing test is always false.
+ */
+ WT_RET(__wt_page_release(session, current, flags));
+ }
+
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (;;) {
+ page = current->page;
+ if (!WT_PAGE_IS_INTERNAL(page))
+ break;
+
+ WT_INTL_INDEX_GET(session, page, pindex);
+ entries = pindex->entries;
+
+ /* Eviction just wants any random child. */
+ if (eviction) {
+ descent = pindex->index[
+ __wt_random(&session->rnd) % entries];
+ goto descend;
+ }
+
+ /*
+ * There may be empty pages in the tree, and they're useless to
+ * us. If we don't find a non-empty page in "entries" random
+ * guesses, take the first non-empty page in the tree. If the
+ * search page contains nothing other than empty pages, restart
+ * from the root some number of times before giving up.
+ *
+ * Random sampling is looking for a key/value pair on a random
+ * leaf page, and so will accept any page that contains a valid
+ * key/value pair, so on-disk is fine, but deleted is not.
+ */
+ descent = NULL;
+ for (i = 0; i < entries; ++i) {
+ descent =
+ pindex->index[__wt_random(&session->rnd) % entries];
+ if (descent->state == WT_REF_MEM ||
+ descent->state == WT_REF_DISK)
+ break;
+ }
+ if (i == entries)
+ for (i = 0; i < entries; ++i) {
+ descent = pindex->index[i];
+ if (descent->state == WT_REF_MEM ||
+ descent->state == WT_REF_DISK)
+ break;
+ }
+ if (i == entries || descent == NULL) {
+ if (--retry > 0)
+ goto restart;
+
+ WT_RET(__wt_page_release(session, current, flags));
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search at the root.
+ *
+ * On other error, simply return, the swap call ensures we're
+ * holding nothing on failure.
+ *
+ * When called for eviction, a WT_NOTFOUND or WT_RESTART return
+ * from the swap ends the descent at the current WT_REF instead
+ * of restarting or returning the error.
+ */
+descend: if ((ret =
+ __wt_page_swap(session, current, descent, flags)) == 0) {
+ current = descent;
+ continue;
+ }
+ if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
+ break;
+ if (ret == WT_RESTART)
+ goto restart;
+ return (ret);
+ }
+
+ /*
+ * There is no point starting with the root page: the walk will exit
+ * immediately. In that case we aren't holding a hazard pointer so
+ * there is nothing to release.
+ */
+ if (!eviction || !__wt_ref_is_root(current))
+ *refp = current;
+ return (0);
+}
+
+/*
+ * __wt_btcur_next_random --
+ * Move to a random record in the tree. There are two algorithms, one
+ * where we select a record at random from the whole tree on each
+ * retrieval and one where we first select a record at random from the
+ * whole tree, and then subsequently sample forward from that location.
+ * The sampling approach allows us to select reasonably uniform random
+ * points from unbalanced trees.
+ *
+ * Returns ENOTSUP if the tree is not a row-store. On any error taken
+ * through the err label the cursor is reset before returning.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ wt_off_t size;
+ uint64_t n, skip;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * Only supports row-store: applications can trivially select a random
+ * value from a column-store, if there were any reason to do so.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_RET_MSG(session, ENOTSUP,
+ "WT_CURSOR.next_random only supported by row-store tables");
+
+ WT_STAT_CONN_INCR(session, cursor_next);
+ WT_STAT_DATA_INCR(session, cursor_next);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Under some conditions we end up using the underlying cursor.next to
+ * walk through the object. Since there are multiple calls, we can hit
+ * the cursor-order checks, turn them off.
+ */
+ __wt_cursor_key_order_reset(cbt);
+#endif
+
+ /*
+ * If we don't have a current position in the tree, or if retrieving
+ * random values without sampling, pick a roughly random leaf page in
+ * the tree and return an entry from it.
+ */
+ if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
+ WT_ERR(__cursor_func_init(cbt, true));
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_random_descent(session, &cbt->ref, false));
+ if (ret == 0)
+ goto random_page_entry;
+
+ /*
+ * Random descent may return not-found: the tree might be empty
+ * or have so many deleted items we didn't find any valid pages.
+ * We can't return WT_NOTFOUND to the application unless a tree
+ * is really empty, fallback to skipping through tree pages.
+ */
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+ /*
+ * Cursor through the tree, skipping past the sample size of the leaf
+ * pages in the tree between each random key return to compensate for
+ * unbalanced trees.
+ *
+ * If the random descent attempt failed, we don't have a configured
+ * sample size, use 100 for no particular reason.
+ */
+ if (cbt->next_random_sample_size == 0)
+ cbt->next_random_sample_size = 100;
+
+ /*
+ * If the random descent attempt failed, or it's our first skip attempt,
+ * we haven't yet set the pages to skip, do it now.
+ *
+ * Use the underlying file size divided by its block allocation size as
+ * our guess of leaf pages in the file (this can be entirely wrong, as
+ * it depends on how many pages are in this particular checkpoint, how
+ * large the leaf and internal pages really are, and other factors).
+ * Then, divide that value by the configured sample size and increment
+ * the final result to make sure tiny files don't leave us with a skip
+ * value of 0.
+ *
+ * !!!
+ * Ideally, the number would be prime to avoid restart issues.
+ */
+ if (cbt->next_random_leaf_skip == 0) {
+ WT_ERR(btree->bm->size(btree->bm, session, &size));
+ cbt->next_random_leaf_skip = (uint64_t)
+ ((size / btree->allocsize) /
+ cbt->next_random_sample_size) + 1;
+ }
+
+ /*
+ * Be paranoid about loop termination: first, if the last leaf page
+ * skipped was also the last leaf page in the tree, skip may be set to
+ * zero on return along with the NULL WT_REF end-of-walk condition.
+ * Second, if a tree has no valid pages at all (the condition after
+ * initial creation), we might make no progress at all, or finally, if
+ * a tree has only deleted pages, we'll make progress, but never get a
+ * useful WT_REF. And, of course, the tree can switch from one of these
+ * states to another without warning. Decrement skip regardless of what
+ * is happening in the search, guarantee we eventually quit.
+ *
+ * The local n holds the value of skip from before each walk call: if
+ * the walk left skip unchanged, quit when it is already zero, otherwise
+ * decrement it by hand.
+ *
+ * Pages read for data sampling aren't "useful"; don't update the read
+ * generation of pages already in memory, and if a page is read, set
+ * its generation to a low value so it is evicted quickly.
+ */
+ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
+ n = skip;
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
+ WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ if (n == skip) {
+ if (skip == 0)
+ break;
+ --skip;
+ }
+ }
+
+ /*
+ * We can't return WT_NOTFOUND to the application unless a tree is
+ * really empty, fallback to a random entry from the first page in the
+ * tree that has anything at all.
+ */
+ if (cbt->ref == NULL)
+ WT_ERR(__wt_btcur_next(cbt, false));
+
+random_page_entry:
+ /*
+ * Select a random entry from the leaf page. If it's not valid, move to
+ * the next entry, if that doesn't work, move to the previous entry.
+ */
+ WT_ERR(__wt_row_random_leaf(session, cbt));
+ if (__wt_cursor_valid(cbt, &upd))
+ WT_ERR(__wt_kv_return(session, cbt, upd));
+ else {
+ if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
+ ret = __wt_btcur_prev(cbt, false);
+ WT_ERR(ret);
+ }
+ return (0);
+
+err: WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 6b0b8a08c02..45550ff627f 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -54,6 +54,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session)
}
/*
+ * __wt_split_obsolete --
+ * Check if it is safe to free / evict based on split generation.
+ *
+ * Returns true when split_gen is strictly less than the value returned
+ * by __split_oldest_gen for this session, false otherwise.
+ */
+bool
+__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen)
+{
+ return (split_gen < __split_oldest_gen(session));
+}
+
+/*
* __split_stash_add --
* Add a new entry into the session's split stash list.
*/
@@ -187,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
#ifdef HAVE_DIAGNOSTIC
/*
* __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * Verify the key order on an internal page after a split.
*/
static void
__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -239,6 +249,46 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
}
+
+/*
+ * __split_verify_root --
+ * Verify a root page involved in a split.
+ *
+ * Checks the internal key order of the page passed in, then of every
+ * child WT_REF whose page can be acquired from cache. Children for which
+ * __wt_page_in returns WT_NOTFOUND are skipped. Any other failure from
+ * acquiring or releasing a child is turned into a panic.
+ */
+static int
+__split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ /* The split is complete and live, verify all of the pages involved. */
+ __split_verify_intl_key_order(session, page);
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * An eviction thread might be attempting to evict the page
+ * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based
+ * page (the WT_REF may be WT_REF_READING), or it may be in
+ * some other state. Acquire a hazard pointer for any
+ * in-memory pages so we know the state of the page.
+ *
+ * Ignore pages not in-memory (deleted, on-disk, being read),
+ * there's no in-memory structure to check.
+ */
+ if ((ret = __wt_page_in(session,
+ ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
+ continue;
+ WT_ERR(ret);
+
+ __split_verify_intl_key_order(session, ref->page);
+
+ WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT));
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error during page split");
+}
#endif
/*
@@ -390,12 +440,12 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_ref_step1 --
+ * __split_ref_prepare --
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_step1(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
+__split_ref_prepare(WT_SESSION_IMPL *session,
+ WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
@@ -418,30 +468,25 @@ __split_ref_step1(
child = ref->page;
/*
- * Block eviction and splits in newly created pages.
+ * Block eviction in newly created pages.
*
* Once the split is live, newly created internal pages might be
* evicted and their WT_REF structures freed. If that happened
* before all threads exit the index of the page that previously
* "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page's modify
- * structure has a field with a transaction ID that's checked
- * before any internal page is evicted. Unfortunately, we don't
- * know the correct value until we update the original page's
- * index (we need a transaction ID from after that update), but
- * the act of updating the original page's index is what allows
- * the eviction to happen.
+ * ensure that doesn't happen, the newly created page contains
+ * the current split generation and can't be evicted until
+ * all readers have left the old generation.
*
- * Split blocking was because historic versions of the split
- * code didn't update the WT_REF.home field until after the
- * split was live, so the WT_REF.home fields being updated could
- * split again before the update, there's a race between splits
- * as to which would update them first. The current code updates
- * the WT_REF.home fields before going live (in this function),
- * this shouldn't be an issue, but for now splits remain turned
- * off.
+ * Historically, we also blocked splits in newly created pages
+ * because we didn't update the WT_REF.home field until after
+ * the split was live, so the WT_REF.home fields being updated
+ * could split again before the update, there's a race between
+ * splits as to which would update them first. The current code
+ * updates the WT_REF.home fields before going live (in this
+ * function), this isn't an issue.
*/
- F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+ child->pg_intl_split_gen = split_gen;
/*
* We use a page flag to prevent the child from splitting from
@@ -465,64 +510,6 @@ __split_ref_step1(
}
/*
- * __split_ref_step2 --
- * Allow the newly created children to be evicted or split.
- */
-static int
-__split_ref_step2(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
-{
- WT_DECL_RET;
- WT_PAGE *child;
- WT_REF *ref;
- uint32_t i;
-
- /*
- * The split has gone live, enable eviction and splits on the newly
- * created internal pages.
- */
- WT_WRITE_BARRIER();
-
- for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
- ref = pindex->index[i];
-
- /*
- * We don't hold hazard pointers on created pages, they cannot
- * be evicted because the page-modify transaction value set as
- * they were created prevents eviction. (See above, we reset
- * that value as part of fixing up the page.) But, an eviction
- * thread might be attempting to evict the page (the WT_REF may
- * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
- * may be WT_REF_READING), or it may be in some other state.
- * Acquire a hazard pointer for any in-memory pages so we know
- * the state of the page. Ignore pages not in-memory (deleted,
- * on-disk, being read), there's no in-memory structure to fix.
- */
- if ((ret = __wt_page_in(session,
- ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
- continue;
- WT_ERR(ret);
-
- child = ref->page;
-
- /* The child can now be evicted or split. */
- F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, child));
-#endif
-
- WT_ERR(__wt_hazard_clear(session, ref));
- }
-
- return (0);
-
-err: /* Something really bad just happened. */
- WT_PANIC_RET(session, ret, "fatal error resolving a split");
-}
-
-/*
* __split_root --
* Split the root page in-memory, deepening the tree.
*/
@@ -653,8 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the root page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ root->pg_intl_split_gen = split_gen;
+
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, false);
+ __split_ref_prepare(session, alloc_index, split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -662,20 +653,17 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
+ alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, root));
+ ret = __split_verify_root(session, root));
+ WT_ERR(ret);
#endif
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, false));
- /* The split is complete and correct, ignore benign errors. */
+ /* The split is complete and verified, ignore benign errors. */
complete = WT_ERR_IGNORE;
- /* We've installed the allocated page-index, ensure error handling. */
- alloc_index = NULL;
-
/*
* We can't free the previous root's index, there may be threads using
* it. Add to the session's discard list, to be freed once we know no
@@ -686,7 +674,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* fails, we don't roll back that change, because threads may already
* be using the new index.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
root_decr += size;
@@ -838,6 +825,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the parent page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ parent->pg_intl_split_gen = split_gen;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -846,11 +837,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_INTL_INDEX_SET(parent, alloc_index);
alloc_index = NULL;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
/*
* If discarding the page's original WT_REF field, reset it to split.
* Threads cursoring through the tree were blocked because that WT_REF
@@ -869,16 +855,25 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
__wt_free(session, ref->page_del);
}
+ /*
+ * Set the discarded WT_REF state to split, ensuring we don't
+ * race with any discard of the WT_REF deleted fields.
+ */
WT_PUBLISH(ref->state, WT_REF_SPLIT);
+
+ /*
+ * Push out the change: not required for correctness, but stops
+ * threads spinning on incorrect page references.
+ */
+ WT_FULL_BARRIER();
}
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+#endif
- /* The split is complete and correct, ignore benign errors. */
+ /* The split is complete and verified, ignore benign errors. */
complete = WT_ERR_IGNORE;
/*
@@ -908,7 +903,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*
* Acquire a new split generation.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
@@ -1160,14 +1154,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ page->pg_intl_split_gen = split_gen;
+
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, true);
+ __split_ref_prepare(session, alloc_index, split_gen, true);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /* Confirm the page's index hasn't moved, then update it. */
+ /*
+ * Confirm the page's index hasn't moved, then update it, which makes
+ * the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
@@ -1178,19 +1179,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__split_verify_intl_key_order(session, page));
#endif
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, true));
-
- /* The split is complete and correct, ignore benign errors. */
+ /* The split is complete and verified, ignore benign errors. */
complete = WT_ERR_IGNORE;
/*
- * Push out the changes: not required for correctness, but no reason
- * to wait.
- */
- WT_FULL_BARRIER();
-
- /*
* We don't care about the page-index we allocated, all we needed was
* the array of WT_REF structures, which has now been split into the
* parent page.
@@ -1207,7 +1199,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* back that change, because threads may already be using the new parent
* page.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
page_decr += size;
@@ -1284,10 +1275,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
for (;;) {
parent = ref->home;
- /* Skip pages that aren't ready to split. */
- if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
- return (EBUSY);
-
if (trylock)
WT_RET(__wt_try_writelock(session, &parent->page_lock));
else
@@ -2086,8 +2073,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
if ((ret = __split_insert(session, ref)) != 0) {
@@ -2178,8 +2164,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_PAGE *parent;
bool hazard;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
@@ -2207,8 +2192,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);
WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
ret = __split_parent(session, ref, NULL, 0, 0, false, true);
@@ -2229,8 +2213,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
page = ref->page;
- __wt_verbose(
- session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref->page);
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref);
/*
* This isn't a split: a reconciliation failed because we couldn't write
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 049700952ee..ddaa2e5f70b 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* Take a copy of any held page and clear the return value. Remember
* the hazard pointer we're currently holding.
*
- * We may be passed a pointer to btree->evict_page that we are clearing
- * here. We check when discarding pages that we're not discarding that
- * page, so this clear must be done before the page is released.
+ * Clear the returned value, it makes future error handling easier.
*/
couple = couple_orig = ref = *refp;
*refp = NULL;
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index aa299a161da..9c3d467340e 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -623,215 +623,3 @@ leaf_match: cbt->compare = 0;
err: WT_TRET(__wt_page_release(session, current, 0));
return (ret);
}
-
-/*
- * __wt_row_random_leaf --
- * Return a random key from a row-store leaf page.
- */
-int
-__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
-{
- WT_INSERT *ins, **start, **stop;
- WT_INSERT_HEAD *ins_head;
- WT_PAGE *page;
- uint64_t samples;
- uint32_t choice, entries, i;
- int level;
-
- page = cbt->ref->page;
- start = stop = NULL; /* [-Wconditional-uninitialized] */
- entries = 0; /* [-Wconditional-uninitialized] */
-
- __cursor_pos_clear(cbt);
-
- /* If the page has disk-based entries, select from them. */
- if (page->entries != 0) {
- cbt->compare = 0;
- cbt->slot = __wt_random(&session->rnd) % page->entries;
-
- /*
- * The real row-store search function builds the key, so we
- * have to as well.
- */
- return (__wt_row_leaf_key(session,
- page, page->pg_row + cbt->slot, cbt->tmp, false));
- }
-
- /*
- * If the tree is new (and not empty), it might have a large insert
- * list.
- *
- * Walk down the list until we find a level with at least 50 entries,
- * that's where we'll start rolling random numbers. The value 50 is
- * used to ignore levels with only a few entries, that is, levels which
- * are potentially badly skewed.
- */
- F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
- return (WT_NOTFOUND);
- for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
- start = &ins_head->head[level];
- for (entries = 0, stop = start;
- *stop != NULL; stop = &(*stop)->next[level])
- ++entries;
-
- if (entries > 50)
- break;
- }
-
- /*
- * If it's a tiny list and we went all the way to level 0, correct the
- * level; entries is correctly set.
- */
- if (level < 0)
- level = 0;
-
- /*
- * Step down the skip list levels, selecting a random chunk of the name
- * space at each level.
- */
- for (samples = entries; level > 0; samples += entries) {
- /*
- * There are (entries) or (entries + 1) chunks of the name space
- * considered at each level. They are: between start and the 1st
- * element, between the 1st and 2nd elements, and so on to the
- * last chunk which is the name space after the stop element on
- * the current level. This last chunk of name space may or may
- * not be there: as we descend the levels of the skip list, this
- * chunk may appear, depending if the next level down has
- * entries logically after the stop point in the current level.
- * We can't ignore those entries: because of the algorithm used
- * to determine the depth of a skiplist, there may be a large
- * number of entries "revealed" by descending a level.
- *
- * If the next level down has more items after the current stop
- * point, there are (entries + 1) chunks to consider, else there
- * are (entries) chunks.
- */
- if (*(stop - 1) == NULL)
- choice = __wt_random(&session->rnd) % entries;
- else
- choice = __wt_random(&session->rnd) % (entries + 1);
-
- if (choice == entries) {
- /*
- * We selected the name space after the stop element on
- * this level. Set the start point to the current stop
- * point, descend a level and move the stop element to
- * the end of the list, that is, the end of the newly
- * discovered name space, counting entries as we go.
- */
- start = stop;
- --start;
- --level;
- for (entries = 0, stop = start;
- *stop != NULL; stop = &(*stop)->next[level])
- ++entries;
- } else {
- /*
- * We selected another name space on the level. Move the
- * start pointer the selected number of entries forward
- * to the start of the selected chunk (if the selected
- * number is 0, start won't move). Set the stop pointer
- * to the next element in the list and drop both start
- * and stop down a level.
- */
- for (i = 0; i < choice; ++i)
- start = &(*start)->next[level];
- stop = &(*start)->next[level];
-
- --start;
- --stop;
- --level;
-
- /* Count the entries in the selected name space. */
- for (entries = 0,
- ins = *start; ins != *stop; ins = ins->next[level])
- ++entries;
- }
- }
-
- /*
- * When we reach the bottom level, entries will already be set. Select
- * a random entry from the name space and return it.
- *
- * It should be impossible for the entries count to be 0 at this point,
- * but check for it out of paranoia and to quiet static testing tools.
- */
- if (entries > 0)
- entries = __wt_random(&session->rnd) % entries;
- for (ins = *start; entries > 0; --entries)
- ins = ins->next[0];
-
- cbt->ins = ins;
- cbt->ins_head = ins_head;
- cbt->compare = 0;
-
- /*
- * Random lookups in newly created collections can be slow if a page
- * consists of a large skiplist. Schedule the page for eviction if we
- * encounter a large skiplist. This worthwhile because applications
- * that take a sample often take many samples, so the overhead of
- * traversing the skip list each time accumulates to real time.
- */
- if (samples > 5000)
- __wt_page_evict_soon(session, cbt->ref);
-
- return (0);
-}
-
-/*
- * __wt_row_random_descent --
- * Find a random leaf page in a row-store tree.
- */
-int
-__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
- WT_REF *current, *descent;
-
- btree = S2BT(session);
- current = NULL;
-
- if (0) {
-restart: /*
- * Discard the currently held page and restart the search from
- * the root.
- */
- WT_RET(__wt_page_release(session, current, 0));
- }
-
- /* Search the internal pages of the tree. */
- current = &btree->root;
- for (;;) {
- page = current->page;
- if (page->type != WT_PAGE_ROW_INT)
- break;
-
- WT_INTL_INDEX_GET(session, page, pindex);
- descent = pindex->index[
- __wt_random(&session->rnd) % pindex->entries];
-
- /*
- * Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search at the root.
- *
- * On other error, simply return, the swap call ensures we're
- * holding nothing on failure.
- */
- if ((ret = __wt_page_swap(
- session, current, descent, WT_READ_RESTART_OK)) == 0) {
- current = descent;
- continue;
- }
- if (ret == WT_RESTART)
- goto restart;
- return (ret);
- }
-
- cbt->ref = current;
- return (0);
-}
diff --git a/src/checksum/power8/crc32_wrapper.c b/src/checksum/power8/crc32_wrapper.c
index ddfa2bdaeb8..a9be9ced1c6 100644
--- a/src/checksum/power8/crc32_wrapper.c
+++ b/src/checksum/power8/crc32_wrapper.c
@@ -1,4 +1,6 @@
#if defined(__powerpc64__)
+#include "wt_internal.h"
+
#define CRC_TABLE
#include "crc32_constants.h"
@@ -68,8 +70,6 @@ out:
}
#endif
-#include "wt_internal.h"
-
/*
* __wt_checksum_hw --
* WiredTiger: return a checksum for a chunk of memory.
diff --git a/src/checksum/zseries/crc32-s390x.c b/src/checksum/zseries/crc32-s390x.c
index f77d6768d42..28b46594220 100644
--- a/src/checksum/zseries/crc32-s390x.c
+++ b/src/checksum/zseries/crc32-s390x.c
@@ -6,8 +6,20 @@
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*
*/
+#include "wt_internal.h"
+
#include <sys/types.h>
#include <endian.h>
+
+#if defined(HAVE_CRC32_HARDWARE)
+
+#include <sys/auxv.h>
+
+/* RHEL 7 has kernel support, but does not define this constant in the lib c headers. */
+#ifndef HWCAP_S390_VX
+#define HWCAP_S390_VX 2048
+#endif
+
#include "crc32-s390x.h"
#include "slicing-consts.h"
@@ -69,8 +81,6 @@ unsigned int __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t l
/* Main CRC-32 functions */
DEFINE_CRC32_VX(__wt_crc32c_le_vx, __wt_crc32c_le_vgfm_16, __wt_crc32c_le)
-#include "wt_internal.h"
-
/*
* __wt_checksum_hw --
* WiredTiger: return a checksum for a chunk of memory.
@@ -81,6 +91,8 @@ __wt_checksum_hw(const void *chunk, size_t len)
return (~__wt_crc32c_le_vx(0xffffffff, chunk, len));
}
+#endif
+
/*
* __wt_checksum_init --
* WiredTiger: detect CRC hardware and set the checksum function.
@@ -89,8 +101,14 @@ void
__wt_checksum_init(void)
{
#if defined(HAVE_CRC32_HARDWARE)
- __wt_process.checksum = __wt_checksum_hw;
-#else
+ unsigned long caps = getauxval(AT_HWCAP);
+
+ if (caps & HWCAP_S390_VX)
+ __wt_process.checksum = __wt_checksum_hw;
+ else
+ __wt_process.checksum = __wt_checksum_sw;
+
+#else /* !HAVE_CRC32_HARDWARE */
__wt_process.checksum = __wt_checksum_sw;
#endif
}
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 6a93c1d05e2..b11a8d63fdb 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -147,12 +147,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -750,12 +750,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -837,12 +837,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -919,12 +919,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -1001,12 +1001,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
- "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\","
- "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\","
- "\"read\",\"rebalance\",\"reconcile\",\"recovery\","
- "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\","
- "\"temporary\",\"thread_group\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
+ "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index f691a76b1f2..124250a7a7d 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1798,6 +1798,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "checkpoint", WT_VERB_CHECKPOINT },
{ "compact", WT_VERB_COMPACT },
{ "evict", WT_VERB_EVICT },
+ { "evict_stuck", WT_VERB_EVICT_STUCK },
{ "evictserver", WT_VERB_EVICTSERVER },
{ "fileops", WT_VERB_FILEOPS },
{ "handleops", WT_VERB_HANDLEOPS },
@@ -1987,6 +1988,16 @@ __conn_set_file_system(
CONNECTION_API_CALL(conn, session, set_file_system, config, cfg);
WT_UNUSED(cfg);
+ /*
+ * You can only configure a file system once, and attempting to do it
+ * again probably means the extension argument didn't have early-load
+ * set and we've already configured the default file system.
+ */
+ if (conn->file_system != NULL)
+ WT_ERR_MSG(session, EPERM,
+ "filesystem already configured; custom filesystems should "
+ "enable \"early_load\" configuration");
+
conn->file_system = file_system;
err: API_END_RET(session, ret);
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 2b0e5081f04..28dd06332e0 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -187,8 +187,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET_MSG(session, EINVAL,
"eviction target must be lower than the eviction trigger");
- WT_RET(__wt_cond_auto_alloc(session, "cache eviction server",
- false, 10000, WT_MILLION, &cache->evict_cond));
+ WT_RET(__wt_cond_auto_alloc(session,
+ "cache eviction server", 10000, WT_MILLION, &cache->evict_cond));
WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass"));
WT_RET(__wt_spin_init(session,
&cache->evict_queue_lock, "cache eviction queue"));
@@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
cache->bytes_dirty_intl + cache->bytes_dirty_leaf,
cache->pages_dirty_intl + cache->pages_dirty_leaf);
- WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond));
+ WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
__wt_spin_destroy(session, &cache->evict_pass_lock);
__wt_spin_destroy(session, &cache->evict_queue_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 79c2fc23da5..49b766f4602 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -32,7 +32,7 @@
*/
#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3
#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6
-#define WT_CACHE_POOL_READ_MULTIPLIER 1
+#define WT_CACHE_POOL_READ_MULTIPLIER 1
static void __cache_pool_adjust(
WT_SESSION_IMPL *, uint64_t, uint64_t, bool, bool *);
@@ -104,8 +104,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
TAILQ_INIT(&cp->cache_pool_qh);
WT_ERR(__wt_spin_init(
session, &cp->cache_pool_lock, "cache shared pool"));
- WT_ERR(__wt_cond_alloc(session,
- "cache pool server", false, &cp->cache_pool_cond));
+ WT_ERR(__wt_cond_alloc(
+ session, "cache pool server", &cp->cache_pool_cond));
__wt_process.cache_pool = cp;
__wt_verbose(session,
@@ -733,7 +733,7 @@ __wt_cache_pool_server(void *arg)
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
if (cp->currently_used <= cp->size)
__wt_cond_wait(
- session, cp->cache_pool_cond, WT_MILLION);
+ session, cp->cache_pool_cond, WT_MILLION, NULL);
/*
* Re-check pool run flag - since we want to avoid getting the
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index faeef4e71a2..7797ed4421c 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -63,6 +63,16 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
}
/*
+ * __ckpt_server_run_chk --
+ * Check to decide if the checkpoint server should continue running.
+ */
+static bool
+__ckpt_server_run_chk(WT_SESSION_IMPL *session)
+{
+ return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT));
+}
+
+/*
* __ckpt_server --
* The checkpoint server thread.
*/
@@ -78,14 +88,18 @@ __ckpt_server(void *arg)
conn = S2C(session);
wt_session = (WT_SESSION *)session;
- while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) {
+ for (;;) {
/*
* Wait...
* NOTE: If the user only configured logsize, then usecs
* will be 0 and this wait won't return until signalled.
*/
- __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs);
+ __wt_cond_wait(session,
+ conn->ckpt_cond, conn->ckpt_usecs, __ckpt_server_run_chk);
+
+ /* Check if we're quitting or being reconfigured. */
+ if (!__ckpt_server_run_chk(session))
+ break;
/*
* Checkpoint the database if the connection is marked dirty.
@@ -113,7 +127,8 @@ __ckpt_server(void *arg)
* it so we don't do another checkpoint
* immediately.
*/
- __wt_cond_wait(session, conn->ckpt_cond, 1);
+ __wt_cond_wait(
+ session, conn->ckpt_cond, 1, NULL);
}
} else
WT_STAT_CONN_INCR(session, txn_checkpoint_skipped);
@@ -152,8 +167,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn)
"checkpoint-server", true, session_flags, &conn->ckpt_session));
session = conn->ckpt_session;
- WT_RET(__wt_cond_alloc(
- session, "checkpoint server", false, &conn->ckpt_cond));
+ WT_RET(__wt_cond_alloc(session, "checkpoint server", &conn->ckpt_cond));
/*
* Start the thread.
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index b2f4bb04ce4..866b8633f71 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -25,21 +25,19 @@ __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle)
}
/*
- * __conn_dhandle_alloc --
+ * __wt_conn_dhandle_alloc --
* Allocate a new data handle and return it linked into the connection's
* list.
*/
-static int
-__conn_dhandle_alloc(WT_SESSION_IMPL *session,
- const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep)
+int
+__wt_conn_dhandle_alloc(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
uint64_t bucket;
- *dhandlep = NULL;
-
WT_RET(__wt_calloc_one(session, &dhandle));
__wt_rwlock_init(session, &dhandle->rwlock);
@@ -75,7 +73,7 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session,
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_DHANDLE_INSERT(S2C(session), dhandle, bucket);
- *dhandlep = dhandle;
+ session->dhandle = dhandle;
return (0);
err: __conn_dhandle_destroy(session, dhandle);
@@ -122,10 +120,7 @@ __wt_conn_dhandle_find(
}
}
- WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle));
-
- session->dhandle = dhandle;
- return (0);
+ return (WT_NOTFOUND);
}
/*
@@ -419,12 +414,11 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri,
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
uint64_t bucket;
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
-
/*
* If we're given a URI, then we walk only the hash list for that
* name. If we don't have a URI we walk the entire dhandle list.
@@ -432,29 +426,42 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri,
if (uri != NULL) {
bucket =
__wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
+
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle,
+ &conn->dhhash[bucket], hashq));
+ if (dhandle == NULL)
+ return (0);
+
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
dhandle->checkpoint != NULL ||
strcmp(uri, dhandle->name) != 0)
continue;
- WT_RET(__conn_btree_apply_internal(
- session, dhandle, file_func, name_func, cfg));
+ WT_ERR(__conn_btree_apply_internal(session,
+ dhandle, file_func, name_func, cfg));
}
} else {
- TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ return (0);
+
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
dhandle->checkpoint != NULL ||
!WT_PREFIX_MATCH(dhandle->name, "file:") ||
WT_IS_METADATA(dhandle))
continue;
- WT_RET(__conn_btree_apply_internal(
- session, dhandle, file_func, name_func, cfg));
+ WT_ERR(__conn_btree_apply_internal(session,
+ dhandle, file_func, name_func, cfg));
}
}
- return (0);
+err: WT_DHANDLE_RELEASE(dhandle);
+ return (ret);
}
/*
@@ -473,7 +480,8 @@ __wt_conn_dhandle_close_all(
conn = S2C(session);
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
WT_ASSERT(session, session->dhandle == NULL);
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
@@ -534,7 +542,8 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final)
dhandle = session->dhandle;
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
WT_ASSERT(session, dhandle != conn->cache->evict_file_next);
/* Check if the handle was reacquired by a session while we waited. */
@@ -583,7 +592,7 @@ __wt_conn_dhandle_discard_single(
}
/* Try to remove the handle, protected by the data handle lock. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __conn_dhandle_remove(session, final));
if (set_pass_intr)
(void)__wt_atomic_subv32(&S2C(session)->cache->pass_intr, 1);
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 3f7fc9bb2a7..287e9ca7b99 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -53,18 +53,18 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
/* Spinlocks. */
WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint);
- WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list);
WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor"));
WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table"));
WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata);
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema);
- WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table);
WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file"));
/* Read-write locks */
+ __wt_rwlock_init(session, &conn->dhandle_lock);
__wt_rwlock_init(session, &conn->hot_backup_lock);
+ __wt_rwlock_init(session, &conn->table_lock);
WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
for (i = 0; i < WT_PAGE_LOCKS; ++i)
@@ -79,7 +79,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(
session, &conn->lsm_manager.switch_lock, "LSM switch queue lock"));
WT_RET(__wt_cond_alloc(
- session, "LSM worker cond", false, &conn->lsm_manager.work_cond));
+ session, "LSM worker cond", &conn->lsm_manager.work_cond));
/*
* Generation numbers.
@@ -109,16 +109,15 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
* __wt_connection_destroy --
* Destroy the connection's underlying WT_CONNECTION_IMPL structure.
*/
-int
+void
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
- WT_DECL_RET;
WT_SESSION_IMPL *session;
u_int i;
/* Check there's something to destroy. */
if (conn == NULL)
- return (0);
+ return;
session = conn->default_session;
@@ -135,7 +134,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->api_lock);
__wt_spin_destroy(session, &conn->block_lock);
__wt_spin_destroy(session, &conn->checkpoint_lock);
- __wt_spin_destroy(session, &conn->dhandle_lock);
+ __wt_rwlock_destroy(session, &conn->dhandle_lock);
__wt_spin_destroy(session, &conn->encryptor_lock);
__wt_spin_destroy(session, &conn->fh_lock);
__wt_rwlock_destroy(session, &conn->hot_backup_lock);
@@ -143,17 +142,12 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->metadata_lock);
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
- __wt_spin_destroy(session, &conn->table_lock);
+ __wt_rwlock_destroy(session, &conn->table_lock);
__wt_spin_destroy(session, &conn->turtle_lock);
for (i = 0; i < WT_PAGE_LOCKS; ++i)
__wt_spin_destroy(session, &conn->page_lock[i]);
__wt_free(session, conn->page_lock);
- /* Destroy the file-system configuration. */
- if (conn->file_system != NULL && conn->file_system->terminate != NULL)
- WT_TRET(conn->file_system->terminate(
- conn->file_system, (WT_SESSION *)session));
-
/* Free allocated memory. */
__wt_free(session, conn->cfg);
__wt_free(session, conn->home);
@@ -162,5 +156,4 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_stat_connection_discard(session, conn);
__wt_free(NULL, conn);
- return (ret);
}
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 8f8f8614ba8..c6dd795389d 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -174,7 +174,7 @@ __logmgr_config(
WT_RET(__logmgr_sync_cfg(session, cfg));
if (conn->log_cond != NULL)
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
return (0);
}
@@ -341,7 +341,7 @@ __wt_log_truncate_files(
conn = S2C(session);
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
return (0);
- if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ if (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN) &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
WT_RET_MSG(session, EINVAL,
"Attempt to archive manually while a server is running");
@@ -505,8 +505,7 @@ __log_file_server(void *arg)
locked = false;
__wt_spin_unlock(session, &log->log_sync_lock);
} else {
- __wt_cond_auto_signal(
- session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
/*
* We do not want to wait potentially a second
* to process this. Yield to give the wrlsn
@@ -517,8 +516,9 @@ __log_file_server(void *arg)
continue;
}
}
+
/* Wait until the next event. */
- __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10);
+ __wt_cond_wait(session, conn->log_file_cond, 100000, NULL);
}
if (0) {
@@ -730,12 +730,8 @@ __log_wrlsn_server(void *arg)
if (yield++ < WT_THOUSAND)
__wt_yield();
else
- /*
- * Send in false because if we did any work we would
- * not be on this path.
- */
__wt_cond_auto_wait(
- session, conn->log_wrlsn_cond, did_work);
+ session, conn->log_wrlsn_cond, did_work, NULL);
}
/*
* On close we need to do this one more time because there could
@@ -840,10 +836,9 @@ __log_server(void *arg)
}
/* Wait until the next event. */
-
__wt_epoch(session, &start);
- __wt_cond_auto_wait_signal(session,
- conn->log_cond, did_work, &signalled);
+ __wt_cond_auto_wait_signal(
+ session, conn->log_cond, did_work, NULL, &signalled);
__wt_epoch(session, &now);
timediff = WT_TIMEDIFF_MS(now, start);
}
@@ -904,10 +899,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_INIT_LSN(&log->write_lsn);
WT_INIT_LSN(&log->write_start_lsn);
log->fileid = 0;
- WT_RET(__wt_cond_alloc(
- session, "log sync", false, &log->log_sync_cond));
- WT_RET(__wt_cond_alloc(
- session, "log write", false, &log->log_write_cond));
+ WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond));
+ WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond));
WT_RET(__wt_log_open(session));
WT_RET(__wt_log_slot_init(session));
@@ -930,6 +923,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
return (0);
+ F_SET(conn, WT_CONN_LOG_SERVER_RUN);
+
/*
* Start the log close thread. It is not configurable.
* If logging is enabled, this thread runs.
@@ -937,8 +932,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
session_flags = WT_SESSION_NO_DATA_HANDLES;
WT_RET(__wt_open_internal_session(conn,
"log-close-server", false, session_flags, &conn->log_file_session));
- WT_RET(__wt_cond_alloc(conn->log_file_session,
- "log close server", false, &conn->log_file_cond));
+ WT_RET(__wt_cond_alloc(
+ conn->log_file_session, "log close server", &conn->log_file_cond));
/*
* Start the log file close thread.
@@ -954,8 +949,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server",
false, session_flags, &conn->log_wrlsn_session));
WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session,
- "log write lsn server", false, 10000, WT_MILLION,
- &conn->log_wrlsn_cond));
+ "log write lsn server", 10000, WT_MILLION, &conn->log_wrlsn_cond));
WT_RET(__wt_thread_create(conn->log_wrlsn_session,
&conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
conn->log_wrlsn_tid_set = true;
@@ -969,13 +963,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
if (conn->log_session != NULL) {
WT_ASSERT(session, conn->log_cond != NULL);
WT_ASSERT(session, conn->log_tid_set == true);
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
} else {
/* The log server gets its own session. */
WT_RET(__wt_open_internal_session(conn,
"log-server", false, session_flags, &conn->log_session));
WT_RET(__wt_cond_auto_alloc(conn->log_session,
- "log server", false, 50000, WT_MILLION, &conn->log_cond));
+ "log server", 50000, WT_MILLION, &conn->log_cond));
/*
* Start the thread.
@@ -1001,6 +995,8 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn = S2C(session);
+ F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
+
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
/*
* We always set up the log_path so printlog can work without
@@ -1011,7 +1007,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
return (0);
}
if (conn->log_tid_set) {
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
WT_TRET(__wt_thread_join(session, conn->log_tid));
conn->log_tid_set = false;
}
@@ -1026,7 +1022,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_file_session = NULL;
}
if (conn->log_wrlsn_tid_set) {
- __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
conn->log_wrlsn_tid_set = false;
}
@@ -1047,9 +1043,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
}
/* Destroy the condition variables now that all threads are stopped */
- WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
- WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index d4ace127bb2..5b20377d437 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
* Tell internal server threads to run: this must be set before opening
* any sessions.
*/
- F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN);
+ F_SET(conn, WT_CONN_SERVER_RUN);
/* WT_SESSION_IMPL array. */
WT_RET(__wt_calloc(session,
@@ -100,8 +100,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
__wt_yield();
}
- /* Clear any pending async ops. */
+ /*
+ * Clear any pending async operations and shut down the async worker
+ * threads and system before closing LSM.
+ */
WT_TRET(__wt_async_flush(session));
+ WT_TRET(__wt_async_destroy(session));
/*
* Shut down server threads other than the eviction server, which is
@@ -110,14 +114,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* exit before files are closed.
*/
F_CLR(conn, WT_CONN_SERVER_RUN);
- WT_TRET(__wt_async_destroy(session));
WT_TRET(__wt_lsm_manager_destroy(session));
- WT_TRET(__wt_sweep_destroy(session));
F_SET(conn, WT_CONN_CLOSING);
-
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, true));
+ WT_TRET(__wt_sweep_destroy(session));
+
+ /* The eviction server is shut down last. */
WT_TRET(__wt_evict_destroy(session));
/* Shut down the lookaside table, after all eviction is complete. */
@@ -126,7 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
- /* Shut down metadata tracking, required before creating tables. */
+ /* Shut down metadata tracking. */
WT_TRET(__wt_meta_track_destroy(session));
/*
@@ -140,7 +144,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
WT_TRET(__wt_txn_checkpoint_log(
session, true, WT_TXN_LOG_CKPT_STOP, NULL));
- F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
WT_TRET(__wt_logmgr_destroy(session));
/* Free memory for collators, compressors, data sources. */
@@ -159,15 +162,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* Discard transaction state. */
__wt_txn_global_destroy(session);
- /* Close extensions, first calling any unload entry point. */
- while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
- TAILQ_REMOVE(&conn->dlhqh, dlh, q);
-
- if (dlh->terminate != NULL)
- WT_TRET(dlh->terminate(wt_conn));
- WT_TRET(__wt_dlclose(session, dlh));
- }
-
/* Close the lock file, opening up the database to other connections. */
if (conn->lock_fh != NULL)
WT_TRET(__wt_close(session, &conn->lock_fh));
@@ -199,8 +193,22 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
__wt_free(session, s->hazard);
}
+ /* Destroy the file-system configuration. */
+ if (conn->file_system != NULL && conn->file_system->terminate != NULL)
+ WT_TRET(conn->file_system->terminate(
+ conn->file_system, (WT_SESSION *)session));
+
+ /* Close extensions, first calling any unload entry point. */
+ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
+ TAILQ_REMOVE(&conn->dlhqh, dlh, q);
+
+ if (dlh->terminate != NULL)
+ WT_TRET(dlh->terminate(wt_conn));
+ WT_TRET(__wt_dlclose(session, dlh));
+ }
+
/* Destroy the handle. */
- WT_TRET(__wt_connection_destroy(conn));
+ __wt_connection_destroy(conn);
return (ret);
}
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 3bcdfd7ecb1..d89392b66c6 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -409,7 +409,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
struct timespec ts;
struct tm *tm, _tm;
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
WT_FSTREAM *log_stream;
conn = S2C(session);
@@ -446,12 +445,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
* Lock the schema and walk the list of open handles, dumping
* any that match the list of object sources.
*/
- if (conn->stat_sources != NULL) {
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_conn_btree_apply(
+ if (conn->stat_sources != NULL)
+ WT_RET(__wt_conn_btree_apply(
session, NULL, __statlog_apply, NULL, NULL));
- WT_RET(ret);
- }
/*
* Walk the list of open LSM trees, dumping any that match the
@@ -485,8 +481,7 @@ __statlog_on_close(WT_SESSION_IMPL *session)
if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE))
return (0);
- if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+ if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
WT_RET_MSG(session, EINVAL,
"Attempt to log statistics while a server is running");
@@ -498,6 +493,16 @@ err: __wt_scr_free(session, &tmp);
}
/*
+ * __statlog_server_run_chk --
+ * Check to decide if the statistics log server should continue running.
+ */
+static bool
+__statlog_server_run_chk(WT_SESSION_IMPL *session)
+{
+ return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS));
+}
+
+/*
* __statlog_server --
* The statistics server thread.
*/
@@ -525,10 +530,14 @@ __statlog_server(void *arg)
WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128));
- while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) {
+ for (;;) {
/* Wait until the next event. */
- __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs);
+ __wt_cond_wait(session, conn->stat_cond,
+ conn->stat_usecs, __statlog_server_run_chk);
+
+ /* Check if we're quitting or being reconfigured. */
+ if (!__statlog_server_run_chk(session))
+ break;
if (WT_STAT_ENABLED(session))
WT_ERR(__statlog_log_one(session, &path, &tmp));
@@ -563,7 +572,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn)
session = conn->stat_session;
WT_RET(__wt_cond_alloc(
- session, "statistics log server", false, &conn->stat_cond));
+ session, "statistics log server", &conn->stat_cond));
/*
* Start the thread.
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index 7d5cb7d7c72..8c186c63939 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -233,7 +233,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
if (!WT_DHANDLE_CAN_DISCARD(dhandle))
continue;
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __sweep_remove_one(session, dhandle));
if (ret == 0)
WT_STAT_CONN_INCR(session, dh_sweep_remove);
@@ -246,6 +246,16 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
}
/*
+ * __sweep_server_run_chk --
+ * Check to decide if the handle sweep server should continue running.
+ */
+static bool
+__sweep_server_run_chk(WT_SESSION_IMPL *session)
+{
+ return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP));
+}
+
+/*
* __sweep_server --
* The handle sweep server thread.
*/
@@ -266,11 +276,15 @@ __sweep_server(void *arg)
/*
* Sweep for dead and excess handles.
*/
- while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
- F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
+ for (;;) {
/* Wait until the next event. */
- __wt_cond_wait(session,
- conn->sweep_cond, conn->sweep_interval * WT_MILLION);
+ __wt_cond_wait(session, conn->sweep_cond,
+ conn->sweep_interval * WT_MILLION, __sweep_server_run_chk);
+
+ /* Check if we're quitting or being reconfigured. */
+ if (!__sweep_server_run_chk(session))
+ break;
+
__wt_seconds(session, &now);
WT_STAT_CONN_INCR(session, dh_sweeps);
@@ -390,7 +404,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
session = conn->sweep_session;
WT_RET(__wt_cond_alloc(
- session, "handle sweep server", false, &conn->sweep_cond));
+ session, "handle sweep server", &conn->sweep_cond));
WT_RET(__wt_thread_create(
session, &conn->sweep_tid, __sweep_server, session));
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 08b15e6ca5e..61ced8d11e7 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -346,13 +346,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
static int
__backup_all(WT_SESSION_IMPL *session)
{
- WT_DECL_RET;
-
/* Build a list of the file objects that need to be copied. */
- WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL));
-
- return (ret);
+ return (__wt_meta_apply_all(
+ session, NULL, __backup_list_uri_append, NULL));
}
/*
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 4786b0524bc..6fc01c0421f 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -240,7 +240,17 @@ __curindex_search(WT_CURSOR *cursor)
found_key = child->key;
if (found_key.size < cursor->key.size)
WT_ERR(WT_NOTFOUND);
- found_key.size = cursor->key.size;
+
+ /*
+ * Custom collators expect to see complete keys: pass an item containing
+ * all the visible fields so it unpacks correctly.
+ */
+ if (cindex->index->collator != NULL &&
+ !F_ISSET(cursor, WT_CURSTD_RAW_SEARCH))
+ WT_ERR(__wt_struct_repack(session, child->key_format,
+ cindex->iface.key_format, &child->key, &found_key));
+ else
+ found_key.size = cursor->key.size;
WT_ERR(__wt_compare(
session, cindex->index->collator, &cursor->key, &found_key, &cmp));
@@ -307,8 +317,18 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact)
* so we flip the sign of the result to match what callers expect.
*/
found_key = child->key;
- if (found_key.size > cursor->key.size)
- found_key.size = cursor->key.size;
+ if (found_key.size > cursor->key.size) {
+ /*
+ * Custom collators expect to see complete keys: pass an item
+ * containing all the visible fields so it unpacks correctly.
+ */
+ if (cindex->index->collator != NULL)
+ WT_ERR(__wt_struct_repack(session,
+ cindex->child->key_format, cindex->iface.key_format,
+ &child->key, &found_key));
+ else
+ found_key.size = cursor->key.size;
+ }
WT_ERR(__wt_compare(
session, cindex->index->collator, &cursor->key, &found_key, exact));
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index 7ace6d49cf0..99a9e373354 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -633,6 +633,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config)
int
__wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
{
+ WT_DECL_RET;
WT_ITEM key;
/*
@@ -662,9 +663,11 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
* cursors cannot reference application memory after cursor operations
* and that requirement will save the day.
*/
- WT_RET(cursor->search(cursor));
+ F_SET(cursor, WT_CURSTD_RAW_SEARCH);
+ ret = cursor->search(cursor);
+ F_CLR(cursor, WT_CURSTD_RAW_SEARCH);
- return (0);
+ return (ret);
}
/*
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 76f7fc5865f..7e8cd153d2d 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -769,7 +769,7 @@ __curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table)
return (0);
/* If the table is incomplete, wait on the table lock and recheck. */
- WT_WITH_TABLE_LOCK(session, complete = table->cg_complete);
+ WT_WITH_TABLE_READ_LOCK(session, complete = table->cg_complete);
if (!complete)
WT_RET_MSG(session, EINVAL,
"'%s' not available until all column groups are created",
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index a0a3212be6d..b6434e3d161 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -20,9 +20,4 @@ cursor configured using \c next_random_sample_size divides the object
into \c next_random_sample_size pieces, and each subsequent retrieval
returns a record from the next one of those pieces.
-For example, setting \c next_random_sample_percent to \c 10 would cause
-the cursor to sequentially return records from each tenth part of the
-object. Setting \c next_random_sample_percent to \c 1000 would cause the
-cursor to sequentially return records from each .1% of the object.
-
*/
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 4a356f7da61..f463e6bc615 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -7,6 +7,12 @@
The WiredTiger Utility can now \c truncate an object. Removing all contents
from the specified object.
</dd>
+<dt>Handle list lock statistics</dt>
+<dd>
+In the 2.9.1 release we added statistics tracking handle list lock timing. We
+have switched that lock from a spin lock to a read-write lock, and consequently
+changed the statistics tracking lock-related wait time.
+</dd>
</dl>
@section version_291 Upgrading to Version 2.9.1
diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox
index 83aadf8a776..2eac0fef3f4 100644
--- a/src/docs/wtperf.dox
+++ b/src/docs/wtperf.dox
@@ -195,14 +195,14 @@ use pareto distribution for random numbers. Zero to disable, otherwise a percen
number of operations to group into each transaction in the populate phase, zero for auto-commit
@par populate_threads (unsigned int, default=1)
number of populate threads, 1 for bulk load
+@par pre_load_data (boolean, default=false)
+Scan all data prior to starting the workload phase to warm the cache
@par random_range (unsigned int, default=0)
if non zero choose a value from within this range as the key for insert operations
@par random_value (boolean, default=false)
generate random content for the value
@par range_partition (boolean, default=false)
partition data by range (vs hash)
-@par read_range (unsigned int, default=0)
-scan a range of keys after each search
@par readonly (boolean, default=false)
reopen the connection between populate and workload phases in readonly mode. Requires reopen_connection turned on (default). Requires that read be the only workload specified
@par reopen_connection (boolean, default=true)
@@ -228,7 +228,7 @@ number of tables to run operations over. Keys are divided evenly over the table
@par table_count_idle (unsigned int, default=0)
number of tables to create, that won't be populated. Default 0.
@par threads (string, default="")
-workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn'
+workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn'
@par transaction_config (string, default="")
WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero
@par table_name (string, default="test")
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 9b969de9a9e..42fe4d4608e 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -24,40 +24,40 @@ static int __evict_walk_file(
(S2C(s)->evict_threads.current_threads > 1)
/*
- * __evict_lock_dhandle --
- * Try to get the dhandle lock, with yield and sleep back off.
+ * __evict_lock_handle_list --
+ * Try to get the handle list lock, with yield and sleep back off.
* Keep timing statistics overall.
*/
static int
-__evict_lock_dhandle(WT_SESSION_IMPL *session)
+__evict_lock_handle_list(WT_SESSION_IMPL *session)
{
struct timespec enter, leave;
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_SPINLOCK *dh_lock;
- int64_t **stats;
+ WT_RWLOCK *dh_lock;
u_int spins;
bool dh_stats;
conn = S2C(session);
cache = conn->cache;
dh_lock = &conn->dhandle_lock;
- stats = (int64_t **)conn->stats;
- dh_stats = WT_STAT_ENABLED(session) && dh_lock->stat_count_off != -1;
/*
- * Maintain lock acquisition timing statistics as if this were a
- * regular lock acquisition.
+ * Setup tracking of handle lock acquisition wait time if statistics
+ * are enabled.
*/
+ dh_stats = WT_STAT_ENABLED(session);
+
if (dh_stats)
__wt_epoch(session, &enter);
+
/*
* Use a custom lock acquisition back off loop so the eviction server
* notices any interrupt quickly.
*/
for (spins = 0;
- (ret = __wt_spin_trylock_track(session, dh_lock)) == EBUSY &&
+ (ret = __wt_try_readlock(session, dh_lock)) == EBUSY &&
cache->pass_intr == 0; spins++) {
if (spins < WT_THOUSAND)
__wt_yield();
@@ -70,8 +70,9 @@ __evict_lock_dhandle(WT_SESSION_IMPL *session)
WT_RET(ret);
if (dh_stats) {
__wt_epoch(session, &leave);
- stats[session->stat_bucket][dh_lock->stat_int_usecs_off] +=
- (int64_t)WT_TIMEDIFF_US(leave, enter);
+ WT_STAT_CONN_INCRV(
+ session, lock_handle_list_wait_eviction,
+ (int64_t)WT_TIMEDIFF_US(leave, enter));
}
return (0);
}
@@ -197,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
}
__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
}
- WT_ASSERT(session,
- !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
__wt_spin_unlock(session, &cache->evict_queue_lock);
}
@@ -267,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session)
}
#endif
- __wt_cond_auto_signal(session, cache->evict_cond);
+ __wt_cond_signal(session, cache->evict_cond);
}
/*
@@ -280,12 +280,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- bool did_work;
+ bool did_work, was_intr;
conn = S2C(session);
cache = conn->cache;
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
* Ensure the cache stuck timer is initialized when starting eviction.
*/
@@ -308,12 +308,28 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
ret = __evict_server(session, &did_work);
F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS);
F_CLR(session, WT_SESSION_LOCKED_PASS);
+ was_intr = cache->pass_intr != 0;
__wt_spin_unlock(session, &cache->evict_pass_lock);
WT_ERR(ret);
+
+ /*
+ * If the eviction server was interrupted, wait until
+ * requests have been processed: the system may
+ * otherwise be busy so don't go to sleep.
+ */
+ if (was_intr) {
+ while (cache->pass_intr != 0 &&
+ F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
+ F_ISSET(thread, WT_THREAD_RUN))
+ __wt_yield();
+ continue;
+ }
+
__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping");
+
/* Don't rely on signals: check periodically. */
__wt_cond_auto_wait(
- session, cache->evict_cond, did_work);
+ session, cache->evict_cond, did_work, NULL);
__wt_verbose(session, WT_VERB_EVICTSERVER, "waking");
} else
WT_ERR(__evict_lru_pages(session, false));
@@ -353,12 +369,12 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error");
static int
__evict_server(WT_SESSION_IMPL *session, bool *did_work)
{
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+ struct timespec now;
+#endif
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
-#ifdef HAVE_DIAGNOSTIC
- struct timespec now;
-#endif
uint64_t orig_pages_evicted;
conn = S2C(session);
@@ -370,7 +386,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
/* Evict pages from the cache as needed. */
WT_RET(__evict_pass(session));
- if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) ||
+ cache->pass_intr != 0)
return (0);
/*
@@ -378,28 +395,31 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
* otherwise we can block applications evicting large pages.
*/
if (!__wt_cache_stuck(session)) {
-
/*
- * If we gave up acquiring the lock, that indicates a
- * session is waiting for us to clear walks. Do that
- * as part of a normal pass (without the handle list
+ * Try to get the handle list lock: if we give up, that
+ * indicates a session is waiting for us to clear walks. Do
+ * that as part of a normal pass (without the handle list
* lock) to avoid deadlock.
*/
- if ((ret = __evict_lock_dhandle(session)) == EBUSY)
+ if ((ret = __evict_lock_handle_list(session)) == EBUSY)
return (0);
WT_RET(ret);
ret = __evict_clear_all_walks(session);
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
WT_RET(ret);
cache->pages_evicted = 0;
} else if (cache->pages_evicted != cache->pages_evict) {
cache->pages_evicted = cache->pages_evict;
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
__wt_epoch(session, &cache->stuck_ts);
} else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) {
/*
- * After being stuck for 5 minutes, give up.
+ * If we're stuck for 5 minutes in diagnostic mode, or the
+ * verbose evict_stuck flag is configured, log the cache
+ * and transaction state.
+ *
+ * If we're stuck for 5 minutes in diagnostic mode, give up.
*
* We don't do this check for in-memory workloads because
* application threads are not blocked by the cache being full.
@@ -408,11 +428,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
*/
__wt_epoch(session, &now);
if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) {
- ret = ETIMEDOUT;
- __wt_err(session, ret,
+#if defined(HAVE_DIAGNOSTIC)
+ __wt_err(session, ETIMEDOUT,
"Cache stuck for too long, giving up");
- WT_TRET(__wt_dump_stuck_info(session, NULL));
+ ret = ETIMEDOUT;
+ WT_TRET(__wt_verbose_dump_txn(session));
+ WT_TRET(__wt_verbose_dump_cache(session));
return (ret);
+#elif defined(HAVE_VERBOSE)
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
+ WT_RET(__wt_verbose_dump_txn(session));
+ WT_RET(__wt_verbose_dump_cache(session));
+
+ /* Reset the timer. */
+ __wt_epoch(session, &cache->stuck_ts);
+ }
+#endif
}
#endif
}
@@ -697,8 +728,8 @@ __evict_pass(WT_SESSION_IMPL *session)
*/
WT_STAT_CONN_INCR(session,
cache_eviction_server_slept);
- __wt_cond_wait(
- session, cache->evict_cond, WT_THOUSAND);
+ __wt_cond_wait(session,
+ cache->evict_cond, WT_THOUSAND, NULL);
continue;
}
@@ -725,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* Clear a single walk point.
*/
static int
-__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
+__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -742,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
if ((ref = btree->evict_ref) == NULL)
return (0);
- if (count_stat)
- WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
+ WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
/*
- * Clear evict_ref first, in case releasing it forces eviction (we
- * assert we never try to evict the current eviction walk point).
+ * Clear evict_ref before releasing it in case that forces eviction (we
+ * assert that we never try to evict the current eviction walk point).
*/
btree->evict_ref = NULL;
+
WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
(ret = __wt_page_release(cache->walk_session,
ref, WT_READ_NO_EVICT)));
@@ -772,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__evict_clear_walk(session, true)));
+ WT_TRET(__evict_clear_walk(session)));
return (ret);
}
@@ -817,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
/* Clear any existing LRU eviction walk for the file. */
WT_WITH_PASS_LOCK(session,
- ret = __evict_clear_walk(session, true));
+ ret = __evict_clear_walk(session));
(void)__wt_atomic_subv32(&cache->pass_intr, 1);
WT_ERR(ret);
@@ -1087,7 +1118,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
/* If a worker thread found the queue empty, pause. */
if (ret == WT_NOTFOUND && !is_server &&
F_ISSET(S2C(session), WT_CONN_EVICTION_RUN))
- __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000);
+ __wt_cond_wait(
+ session, conn->evict_threads.wait_cond, 10000, NULL);
return (ret == WT_NOTFOUND ? 0 : ret);
}
@@ -1304,7 +1336,7 @@ retry: while (slot < max_entries) {
* reference count to keep it alive while we sweep.
*/
if (!dhandle_locked) {
- WT_ERR(__evict_lock_dhandle(session));
+ WT_ERR(__evict_lock_handle_list(session));
dhandle_locked = true;
}
@@ -1383,7 +1415,7 @@ retry: while (slot < max_entries) {
(void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
incr = true;
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
dhandle_locked = false;
/*
@@ -1430,7 +1462,7 @@ retry: while (slot < max_entries) {
}
err: if (dhandle_locked) {
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
dhandle_locked = false;
}
@@ -1526,6 +1558,19 @@ __evict_walk_file(WT_SESSION_IMPL *session,
start = queue->evict_queue + *slotp;
remaining_slots = max_entries - *slotp;
total_slots = max_entries - queue->evict_entries;
+ btree_inuse = cache_inuse = 0;
+ target_pages_clean = target_pages_dirty = 0;
+
+ /*
+ * The number of times we should fill the queue by the end of
+ * considering all trees.
+ */
+#define QUEUE_FILLS_PER_PASS 10
+
+ /*
+ * The minimum number of pages we should consider per tree.
+ */
+#define MIN_PAGES_PER_TREE 10
/*
* The target number of pages for this tree is proportional to the
@@ -1534,13 +1579,12 @@ __evict_walk_file(WT_SESSION_IMPL *session,
* cache (and only have to walk it once).
*/
if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
- btree_inuse = __wt_btree_bytes_inuse(session);
+ btree_inuse = __wt_btree_bytes_evictable(session);
cache_inuse = __wt_cache_bytes_inuse(cache);
bytes_per_slot = 1 + cache_inuse / total_slots;
target_pages_clean = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
- } else
- target_pages_clean = 0;
+ }
if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
btree_inuse = __wt_btree_dirty_leaf_inuse(session);
@@ -1548,35 +1592,58 @@ __evict_walk_file(WT_SESSION_IMPL *session,
bytes_per_slot = 1 + cache_inuse / total_slots;
target_pages_dirty = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
- } else
- target_pages_dirty = 0;
+ }
- target_pages = WT_MAX(target_pages_clean, target_pages_dirty);
+ /*
+ * Weight the number of target pages by the number of times we want to
+ * fill the queue per pass through all the trees. Note that we don't
+ * build this into the calculation above because we don't want to favor
+ * small trees, so round to a whole number of slots (zero for small
+ * trees) before multiplying.
+ */
+ target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
+ QUEUE_FILLS_PER_PASS;
+ /*
+ * Randomly walk trees with a small fraction of the cache in case there
+ * are so many trees that none of them use enough of the cache to be
+ * allocated slots.
+ *
+ * The chance of walking a tree is equal to the chance that a random
+ * byte in cache belongs to the tree, weighted by how many times we
+ * want to fill queues during a pass through all the trees in cache.
+ */
if (target_pages == 0) {
- /*
- * Randomly walk trees with a tiny fraction of the cache in
- * case there are so many trees that none of them use enough of
- * the cache to be allocated slots. Walk small trees 1% of the
- * time.
- */
- if (__wt_random(&session->rnd) > UINT32_MAX / 100)
+ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
+ btree_inuse = __wt_btree_bytes_evictable(session);
+ cache_inuse = __wt_cache_bytes_inuse(cache);
+ } else {
+ btree_inuse = __wt_btree_dirty_leaf_inuse(session);
+ cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
+ }
+ if (btree_inuse == 0 || cache_inuse == 0)
+ return (0);
+ if (__wt_random64(&session->rnd) % cache_inuse >
+ btree_inuse * QUEUE_FILLS_PER_PASS)
return (0);
- target_pages = 10;
}
+ /*
+ * There is some cost associated with walking a tree. If we're going
+ * to visit this tree, always look for a minimum number of pages.
+ */
+ if (target_pages < MIN_PAGES_PER_TREE)
+ target_pages = MIN_PAGES_PER_TREE;
+
+ /*
+ * If the tree is dead or we're near the end of the queue, fill the
+ * remaining slots.
+ */
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
target_pages > remaining_slots)
target_pages = remaining_slots;
end = start + target_pages;
- walk_flags =
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
-
- /* Randomize the walk direction. */
- if (btree->evict_walk_reverse)
- FLD_SET(walk_flags, WT_READ_PREV);
-
/*
* Examine at least a reasonable number of pages before deciding
* whether to give up. When we are only looking for dirty pages,
@@ -1588,8 +1655,41 @@ __evict_walk_file(WT_SESSION_IMPL *session,
min_pages *= 10;
/*
- * Get some more eviction candidate pages.
- *
+ * Choose a random point in the tree if looking for candidates in a
+ * tree with no starting point set. This is mostly aimed at ensuring
+ * eviction fairly visits all pages in trees with a lot of in-cache
+ * content.
+ */
+ if (btree->evict_ref == NULL) {
+ /* Ensure internal page indexes remain valid for our walk. */
+ WT_WITH_PAGE_INDEX(session, ret =
+ __wt_random_descent(session, &btree->evict_ref, true));
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * Reverse the direction of the walk each time we start at a
+ * random point so both ends of the tree are equally likely to
+ * be visited.
+ */
+ btree->evict_walk_reverse = !btree->evict_walk_reverse;
+ }
+
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ if (btree->evict_walk_reverse)
+ FLD_SET(walk_flags, WT_READ_PREV);
+
+ /*
+ * Get some more eviction candidate pages, starting at the last saved
+ * point. Clear the saved point immediately: we assert when discarding
+ * pages that we're not discarding an eviction point, so this clear must
+ * be complete before the page is released.
+ */
+ ref = btree->evict_ref;
+ btree->evict_ref = NULL;
+
+ /*
* !!! Take care terminating this loop.
*
* Don't make an extra call to __wt_tree_walk after we hit the end of a
@@ -1602,7 +1702,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
for (evict = start, pages_queued = pages_seen = refs_walked = 0;
evict < end && (ret == 0 || ret == WT_NOTFOUND);
ret = __wt_tree_walk_count(
- session, &btree->evict_ref, &refs_walked, walk_flags)) {
+ session, &ref, &refs_walked, walk_flags)) {
/*
* Check whether we're finding a good ratio of candidates vs
* pages seen. Some workloads create "deserts" in trees where
@@ -1616,7 +1716,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (give_up)
break;
- if ((ref = btree->evict_ref) == NULL) {
+ if (ref == NULL) {
if (++restarts == 2)
break;
WT_STAT_CONN_INCR(
@@ -1706,7 +1806,7 @@ fast: /* If the page can't be evicted, give up. */
++pages_queued;
if (WT_PAGE_IS_INTERNAL(page))
- ++internal_pages;
+ ++internal_pages;
__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" WT_SIZET_FMT,
@@ -1719,12 +1819,10 @@ fast: /* If the page can't be evicted, give up. */
session, cache_eviction_pages_queued, (u_int)(evict - start));
/*
- * If we didn't find any candidates in the file, reverse the direction
- * of the walk and skip it next time.
+ * If we couldn't find the number of pages we were looking for, skip
+ * the tree next time.
*/
- if (give_up)
- btree->evict_walk_reverse = !btree->evict_walk_reverse;
- if (pages_queued == 0 && !urgent_queued)
+ if (pages_queued < target_pages / 2 && !urgent_queued)
btree->evict_walk_period = WT_MIN(
WT_MAX(1, 2 * btree->evict_walk_period), 100);
else if (pages_queued == target_pages)
@@ -1733,6 +1831,8 @@ fast: /* If the page can't be evicted, give up. */
btree->evict_walk_period /= 2;
/*
+ * Give up the walk occasionally.
+ *
* If we happen to end up on the root page or a page requiring urgent
* eviction, clear it. We have to track hazard pointers, and the root
* page complicates that calculation.
@@ -1744,16 +1844,20 @@ fast: /* If the page can't be evicted, give up. */
* If we land on a page requiring forced eviction, move on to the next
* page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL) {
- /* Give up the walk occasionally. */
+ if (ref != NULL) {
if (__wt_ref_is_root(ref) || evict == start || give_up ||
ref->page->read_gen == WT_READGEN_OLDEST ||
- ref->page->memory_footprint >= btree->splitmempage)
- WT_RET(__evict_clear_walk(session, restarts == 0));
- else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ ref->page->memory_footprint >= btree->splitmempage) {
+ if (restarts == 0)
+ WT_STAT_CONN_INCR(
+ session, cache_eviction_walks_abandoned);
+ WT_RET(__wt_page_release(cache->walk_session,
+ ref, WT_READ_NO_EVICT));
+ ref = NULL;
+ } else if (ref->page->read_gen == WT_READGEN_OLDEST)
WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
- session, &btree->evict_ref,
- &refs_walked, walk_flags));
+ session, &ref, &refs_walked, walk_flags));
+ btree->evict_ref = ref;
}
WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
@@ -2087,8 +2191,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
break;
case WT_NOTFOUND:
/* Allow the queue to re-populate before retrying. */
- __wt_cond_wait(
- session, conn->evict_threads.wait_cond, 10000);
+ __wt_cond_wait(session,
+ conn->evict_threads.wait_cond, 10000, NULL);
cache->app_waits++;
break;
default:
@@ -2184,226 +2288,140 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session)
S2BT(session)->evict_priority = 0;
}
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
- * __dump_txn_state --
- * Output debugging information about the global transaction state.
+ * __verbose_dump_cache_single --
+ * Output diagnostic information about a single file in the cache (the session's current data handle) and add its byte counts to the caller's totals.
*/
static int
-__dump_txn_state(WT_SESSION_IMPL *session, FILE *fp)
+__verbose_dump_cache_single(WT_SESSION_IMPL *session,
+ uint64_t *total_bytesp, uint64_t *total_dirty_bytesp) /* in/out totals: added to, never reset here */
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN *txn;
- WT_TXN_STATE *s;
- const char *iso_tag;
- uint64_t id;
- uint32_t i, session_cnt;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
-
- /* Note: odd string concatenation avoids spelling errors. */
- if (fprintf(fp, "==========\n" "transaction state dump\n") < 0)
- return (EIO);
-
- if (fprintf(fp,
- "current ID: %" PRIu64 "\n"
- "last running ID: %" PRIu64 "\n"
- "oldest ID: %" PRIu64 "\n"
- "oldest named snapshot ID: %" PRIu64 "\n",
- txn_global->current, txn_global->last_running,
- txn_global->oldest_id, txn_global->nsnap_oldest_id) < 0)
- return (EIO);
-
- if (fprintf(fp,
- "checkpoint running? %s\n"
- "checkpoint generation: %" PRIu64 "\n"
- "checkpoint pinned ID: %" PRIu64 "\n"
- "checkpoint txn ID: %" PRIu64 "\n"
- "session count: %" PRIu32 "\n",
- txn_global->checkpoint_running ? "yes" : "no",
- txn_global->checkpoint_gen,
- txn_global->checkpoint_pinned,
- txn_global->checkpoint_txnid,
- session_cnt) < 0)
- return (EIO);
-
- if (fprintf(fp, "Dumping transaction state of active sessions\n") < 0)
- return (EIO);
-
- /*
- * Walk each session transaction state and dump information. Accessing
- * the content of session handles is not thread safe, so some
- * information may change while traversing if other threads are active
- * at the same time, which is OK since this is diagnostic code.
- */
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip sessions with no active transaction */
- if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
- continue;
+ WT_DATA_HANDLE *dhandle;
+ WT_PAGE *page;
+ WT_REF *next_walk;
+ size_t size;
+ uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
+ uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
+ uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
+ uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
- txn = &conn->sessions[i].txn;
- iso_tag = "INVALID";
- switch (txn->isolation) {
- case WT_ISO_READ_COMMITTED:
- iso_tag = "WT_ISO_READ_COMMITTED";
- break;
- case WT_ISO_READ_UNCOMMITTED:
- iso_tag = "WT_ISO_READ_UNCOMMITTED";
- break;
- case WT_ISO_SNAPSHOT:
- iso_tag = "WT_ISO_SNAPSHOT";
- break;
+ intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; /* intl_* count internal pages, leaf_* count every other page */
+ intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
+ leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
+ leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
+
+ next_walk = NULL;
+ while (__wt_tree_walk(session, &next_walk, /* a non-zero walk result only ends the loop; it is not returned */
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
+ next_walk != NULL) {
+ page = next_walk->page;
+ size = page->memory_footprint;
+
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ ++intl_pages;
+ intl_bytes += size;
+ intl_bytes_max = WT_MAX(intl_bytes_max, size);
+ if (__wt_page_is_modified(page)) { /* modified pages are counted again in the dirty totals */
+ ++intl_dirty_pages;
+ intl_dirty_bytes += size;
+ intl_dirty_bytes_max =
+ WT_MAX(intl_dirty_bytes_max, size);
+ }
+ } else {
+ ++leaf_pages;
+ leaf_bytes += size;
+ leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
+ if (__wt_page_is_modified(page)) {
+ ++leaf_dirty_pages;
+ leaf_dirty_bytes += size;
+ leaf_dirty_bytes_max =
+ WT_MAX(leaf_dirty_bytes_max, size);
+ }
}
-
- if (fprintf(fp,
- "ID: %6" PRIu64
- ", mod count: %u"
- ", pinned ID: %" PRIu64
- ", snap min: %" PRIu64
- ", snap max: %" PRIu64
- ", metadata pinned ID: %" PRIu64
- ", flags: 0x%08" PRIx32
- ", name: %s"
- ", isolation: %s" "\n",
- id,
- txn->mod_count,
- s->pinned_id,
- txn->snap_min,
- txn->snap_max,
- s->metadata_pinned,
- txn->flags,
- conn->sessions[i].name == NULL ?
- "EMPTY" : conn->sessions[i].name,
- iso_tag) < 0)
- return (EIO);
}
+ dhandle = session->dhandle; /* dereferenced below without a NULL check: the caller must have set it */
+ if (dhandle->checkpoint == NULL)
+ WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name));
+ else
+ WT_RET(__wt_msg(session, "%s(checkpoint=%s):",
+ dhandle->name, dhandle->checkpoint));
+ if (intl_pages != 0) /* the line is omitted when no internal pages were seen */
+ WT_RET(__wt_msg(session,
+ "internal: "
+ "%" PRIu64 " pages, "
+ "%" PRIu64 "MB, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
+ "%" PRIu64 "MB max page, "
+ "%" PRIu64 "MB max dirty page",
+ intl_pages,
+ intl_bytes / WT_MEGABYTE, /* byte counts are scaled to MB for display */
+ intl_pages - intl_dirty_pages,
+ intl_dirty_pages,
+ (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE,
+ intl_dirty_bytes / WT_MEGABYTE,
+ intl_bytes_max / WT_MEGABYTE,
+ intl_dirty_bytes_max / WT_MEGABYTE));
+ if (leaf_pages != 0) /* likewise omitted when no non-internal pages were seen */
+ WT_RET(__wt_msg(session,
+ "leaf: "
+ "%" PRIu64 " pages, "
+ "%" PRIu64 "MB, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
+ "%" PRIu64 "MB max page, "
+ "%" PRIu64 "MB max dirty page",
+ leaf_pages,
+ leaf_bytes / WT_MEGABYTE,
+ leaf_pages - leaf_dirty_pages,
+ leaf_dirty_pages,
+ (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE,
+ leaf_dirty_bytes / WT_MEGABYTE,
+ leaf_bytes_max / WT_MEGABYTE,
+ leaf_dirty_bytes_max / WT_MEGABYTE));
+
+ *total_bytesp += intl_bytes + leaf_bytes; /* accumulate: initializing the totals is left to the caller */
+ *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes;
+
return (0);
}
/*
- * __dump_cache --
- * Output debugging information about the size of the files in cache.
+ * __wt_verbose_dump_cache --
+ * Output diagnostic information about the cache.
*/
-static int
-__dump_cache(WT_SESSION_IMPL *session, FILE *fp)
+int
+__wt_verbose_dump_cache(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle, *saved_dhandle;
- WT_PAGE *page;
- WT_REF *next_walk;
- uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
- uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
- uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
- uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
uint64_t total_bytes, total_dirty_bytes;
- size_t size;
conn = S2C(session);
total_bytes = total_dirty_bytes = 0;
- /* Note: odd string concatenation avoids spelling errors. */
- if (fprintf(fp, "==========\n" "cache dump\n") < 0)
- return (EIO);
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "cache dump"));
- saved_dhandle = session->dhandle;
- TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ break;
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
- intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
- leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
- leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
-
- next_walk = NULL;
- session->dhandle = dhandle;
- while (__wt_tree_walk(session, &next_walk,
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
- next_walk != NULL) {
- page = next_walk->page;
- size = page->memory_footprint;
-
- if (WT_PAGE_IS_INTERNAL(page)) {
- ++intl_pages;
- intl_bytes += size;
- intl_bytes_max = WT_MAX(intl_bytes_max, size);
- if (__wt_page_is_modified(page)) {
- ++intl_dirty_pages;
- intl_dirty_bytes += size;
- intl_dirty_bytes_max =
- WT_MAX(intl_dirty_bytes_max, size);
- }
- } else {
- ++leaf_pages;
- leaf_bytes += size;
- leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
- if (__wt_page_is_modified(page)) {
- ++leaf_dirty_pages;
- leaf_dirty_bytes += size;
- leaf_dirty_bytes_max =
- WT_MAX(leaf_dirty_bytes_max, size);
- }
- }
- }
- session->dhandle = NULL;
-
- if (dhandle->checkpoint == NULL) {
- if (fprintf(fp,
- "%s(<live>): \n", dhandle->name) < 0)
- return (EIO);
- } else {
- if (fprintf(fp, "%s(checkpoint=%s): \n",
- dhandle->name, dhandle->checkpoint) < 0)
- return (EIO);
- }
- if (intl_pages != 0) {
- if (fprintf(fp,
- "\t" "internal: "
- "%" PRIu64 " pages, "
- "%" PRIu64 "MB, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
- "%" PRIu64 "MB max page, "
- "%" PRIu64 "MB max dirty page\n",
- intl_pages,
- intl_bytes >> 20,
- intl_pages - intl_dirty_pages,
- intl_dirty_pages,
- (intl_bytes - intl_dirty_bytes) >> 20,
- intl_dirty_bytes >> 20,
- intl_bytes_max >> 20,
- intl_dirty_bytes_max >> 20) < 0)
- return (EIO);
- }
- if (leaf_pages != 0) {
- if (fprintf(fp,
- "\t" "leaf: "
- "%" PRIu64 " pages, "
- "%" PRIu64 "MB, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
- "%" PRIu64 "MB max page, "
- "%" PRIu64 "MB max dirty page\n",
- leaf_pages,
- leaf_bytes >> 20,
- leaf_pages - leaf_dirty_pages,
- leaf_dirty_pages,
- (leaf_bytes - leaf_dirty_bytes) >> 20,
- leaf_dirty_bytes >> 20,
- leaf_bytes_max >> 20,
- leaf_dirty_bytes_max >> 20) < 0)
- return (EIO);
- }
-
- total_bytes += intl_bytes + leaf_bytes;
- total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes;
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __verbose_dump_cache_single(
+ session, &total_bytes, &total_dirty_bytes));
+ if (ret != 0)
+ break;
}
- session->dhandle = saved_dhandle;
+ WT_RET(ret);
/*
* Apply the overhead percentage so our total bytes are comparable with
@@ -2411,39 +2429,16 @@ __dump_cache(WT_SESSION_IMPL *session, FILE *fp)
*/
total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes);
- if (fprintf(fp,
+ WT_RET(__wt_msg(session,
"cache dump: "
- "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n"
- "total dirty bytes: %" PRIu64 "MB\n",
- total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20,
- total_dirty_bytes >> 20) < 0)
- return (EIO);
- if (fprintf(fp, "==========\n") < 0)
- return (EIO);
+ "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB",
+ total_bytes / WT_MEGABYTE,
+ __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE));
+ WT_RET(__wt_msg(session,
+ "total dirty bytes: %" PRIu64 "MB",
+ total_dirty_bytes / WT_MEGABYTE));
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
return (0);
}
-
-/*
- * __wt_dump_stuck_info --
- * Dump debugging information to a file (default stderr) about the state
- * of WiredTiger when we have determined that the cache is stuck full.
- */
-int
-__wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile)
-{
- FILE *fp;
- WT_DECL_RET;
-
- if (ofile == NULL)
- fp = stderr;
- else if ((fp = fopen(ofile, "w")) == NULL)
- return (EIO);
-
- WT_ERR(__dump_txn_state(session, fp));
- WT_ERR(__dump_cache(session, fp));
-err: if (ofile != NULL && fclose(fp) != 0)
- return (EIO);
- return (ret);
-}
#endif
diff --git a/src/evict/evict_stat.c b/src/evict/evict_stat.c
index 2dd3b1e83a0..7c2d5722a63 100644
--- a/src/evict/evict_stat.c
+++ b/src/evict/evict_stat.c
@@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session)
WT_STAT_DATA_SET(session,
cache_state_root_size, btree->root.page->memory_footprint);
- WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session));
+ __evict_stat_walk(session);
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 43c1a309d52..39ca223aebf 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -483,6 +483,7 @@ struct __wt_page {
*/
struct {
WT_REF *parent_ref; /* Parent reference */
+ uint64_t split_gen; /* Generation of last split */
struct __wt_page_index {
uint32_t entries;
@@ -492,6 +493,8 @@ struct __wt_page {
} intl;
#undef pg_intl_parent_ref
#define pg_intl_parent_ref u.intl.parent_ref
+#undef pg_intl_split_gen
+#define pg_intl_split_gen u.intl.split_gen
/*
* Macros to copy/set the index because the name is obscured to ensure
@@ -593,9 +596,8 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
+#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
+#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/include/btree.i b/src/include/btree.i
index 09fa8df8c56..315efa86fa6 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -71,6 +71,30 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session)
}
/*
+ * __wt_btree_bytes_evictable --
+ * Return the number of bytes that can be evicted (i.e. bytes apart from
+ * the pinned root page), after __wt_cache_bytes_plus_overhead is applied; 0 when the tree holds no more than its root page.
+ */
+static inline uint64_t
+__wt_btree_bytes_evictable(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_PAGE *root_page;
+ uint64_t bytes_inmem, bytes_root;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+ root_page = btree->root.page;
+
+ bytes_inmem = btree->bytes_inmem; /* copied to a local: compared and subtracted below */
+ bytes_root = root_page == NULL ? 0 : root_page->memory_footprint; /* no root page: nothing to subtract */
+
+ return (bytes_inmem <= bytes_root ? 0 : /* keeps the unsigned subtraction below from wrapping */
+ __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root));
+}
+
+/*
* __wt_btree_dirty_inuse --
* Return the number of dirty bytes in use.
*/
@@ -1324,8 +1348,8 @@ __wt_page_can_evict(
* discards its WT_REF array, and a thread traversing the original
* parent page index might see a freed WT_REF.
*/
- if (WT_PAGE_IS_INTERNAL(page) &&
- F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
+ if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete(
+ session, page->pg_intl_split_gen))
return (false);
/*
diff --git a/src/include/cache.h b/src/include/cache.h
index 70f6169200d..abd5a1901f7 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -83,7 +83,7 @@ struct __wt_cache {
uint64_t worker_evicts; /* Pages evicted by worker threads */
uint64_t evict_max_page_size; /* Largest page seen at eviction */
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
struct timespec stuck_ts; /* Stuck timestamp */
#endif
diff --git a/src/include/cache.i b/src/include/cache.i
index 17ab39e97d2..d71978ccf35 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -364,7 +364,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp)
* block eviction), we don't want to highjack the thread for eviction.
*/
if (F_ISSET(session, WT_SESSION_NO_EVICTION |
- WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
+ WT_SESSION_LOCKED_HANDLE_LIST_WRITE | WT_SESSION_LOCKED_SCHEMA))
return (0);
/* In memory configurations don't block when the cache is full. */
diff --git a/src/include/connection.h b/src/include/connection.h
index 64ac4271db1..ce483d3291a 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -123,12 +123,16 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \
+ WT_ASSERT(session, /* NOTE(review): session is not a macro argument; it must be in scope where the macro is used */ \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* checks the session's handle-list write flag before the lists are changed */ \
TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \
TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \
++conn->dhandle_count; \
} while (0)
#define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \
TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \
TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \
--conn->dhandle_count; \
@@ -163,13 +167,13 @@ struct __wt_connection_impl {
WT_SPINLOCK api_lock; /* Connection API spinlock */
WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */
- WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */
WT_SPINLOCK fh_lock; /* File handle queue spinlock */
WT_SPINLOCK metadata_lock; /* Metadata update spinlock */
WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */
WT_SPINLOCK schema_lock; /* Schema operation spinlock */
- WT_SPINLOCK table_lock; /* Table creation spinlock */
+ WT_RWLOCK table_lock; /* Table list lock */
WT_SPINLOCK turtle_lock; /* Turtle file spinlock */
+ WT_RWLOCK dhandle_lock; /* Data handle list lock */
/*
* We distribute the btree page locks across a set of spin locks. Don't
diff --git a/src/include/dhandle.h b/src/include/dhandle.h
index dcc788f0839..4f318e7bccf 100644
--- a/src/include/dhandle.h
+++ b/src/include/dhandle.h
@@ -37,6 +37,24 @@
#define WT_SESSION_META_DHANDLE(s) \
(((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle)
+#define WT_DHANDLE_ACQUIRE(dhandle) /* take a reference on a data handle */ \
+ (void)__wt_atomic_add32(&dhandle->session_ref, 1) /* adds one to session_ref; the result is discarded */
+
+#define WT_DHANDLE_RELEASE(dhandle) /* drop a reference on a data handle */ \
+ (void)__wt_atomic_sub32(&dhandle->session_ref, 1) /* subtracts one from session_ref; the result is discarded */
+
+#define WT_DHANDLE_NEXT(session, dhandle, head, field) do { /* step dhandle, in place, to the next list entry */ \
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\
+ if (dhandle == NULL) /* a NULL dhandle starts at the head of the list */ \
+ dhandle = TAILQ_FIRST(head); \
+ else { \
+ WT_DHANDLE_RELEASE(dhandle); /* drop the reference on the handle being left */ \
+ dhandle = TAILQ_NEXT(dhandle, field); \
+ } \
+ if (dhandle != NULL) \
+ WT_DHANDLE_ACQUIRE(dhandle); /* NOTE(review): the handle left in dhandle stays referenced until the next step; code that stops iterating before dhandle is NULL still holds that reference */ \
+} while (0)
+
/*
* WT_DATA_HANDLE --
* A handle for a generic named data source.
diff --git a/src/include/extern.h b/src/include/extern.h
index 566eb386c29..19ad9a880df 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -98,6 +98,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_A
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -105,7 +106,6 @@ extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((w
extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -150,6 +150,9 @@ extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
@@ -160,6 +163,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -192,8 +196,6 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_las_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -254,6 +256,7 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg) WT_GCC_FUNC_DECL_ATTRIBUT
extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -262,7 +265,7 @@ extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *ur
extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -352,7 +355,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int
extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -403,7 +406,7 @@ extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bo
extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -613,11 +616,9 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_
extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_auto_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -689,6 +690,7 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUT
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern uint64_t __wt_random64(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
@@ -741,6 +743,7 @@ extern void __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATT
extern void __wt_txn_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h
index 5acb7b0ed27..fed7835ada1 100644
--- a/src/include/extern_posix.h
+++ b/src/include/extern_posix.h
@@ -12,8 +12,8 @@ extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapp
extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/extern_win.h b/src/include/extern_win.h
index 11b45f11304..0bfc821c7a6 100644
--- a/src/include/extern_win.h
+++ b/src/include/extern_win.h
@@ -10,8 +10,8 @@ extern int __wt_os_win(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((war
extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/include/flags.h b/src/include/flags.h
index 2f0c207078a..c1fff920e3b 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -53,22 +53,24 @@
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_INTERNAL 0x00000002
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004
-#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000008
-#define WT_SESSION_LOCKED_METADATA 0x00000010
-#define WT_SESSION_LOCKED_PASS 0x00000020
-#define WT_SESSION_LOCKED_SCHEMA 0x00000040
-#define WT_SESSION_LOCKED_SLOT 0x00000080
-#define WT_SESSION_LOCKED_TABLE 0x00000100
-#define WT_SESSION_LOCKED_TURTLE 0x00000200
-#define WT_SESSION_LOGGING_INMEM 0x00000400
-#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800
-#define WT_SESSION_NO_CACHE 0x00001000
-#define WT_SESSION_NO_DATA_HANDLES 0x00002000
-#define WT_SESSION_NO_EVICTION 0x00004000
-#define WT_SESSION_NO_LOGGING 0x00008000
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000
-#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000
-#define WT_SESSION_SERVER_ASYNC 0x00040000
+#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008
+#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010
+#define WT_SESSION_LOCKED_METADATA 0x00000020
+#define WT_SESSION_LOCKED_PASS 0x00000040
+#define WT_SESSION_LOCKED_SCHEMA 0x00000080
+#define WT_SESSION_LOCKED_SLOT 0x00000100
+#define WT_SESSION_LOCKED_TABLE_READ 0x00000200
+#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400
+#define WT_SESSION_LOCKED_TURTLE 0x00000800
+#define WT_SESSION_LOGGING_INMEM 0x00001000
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000
+#define WT_SESSION_NO_CACHE 0x00004000
+#define WT_SESSION_NO_DATA_HANDLES 0x00008000
+#define WT_SESSION_NO_EVICTION 0x00010000
+#define WT_SESSION_NO_LOGGING 0x00020000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000
+#define WT_SESSION_SERVER_ASYNC 0x00100000
#define WT_STAT_CLEAR 0x00000001
#define WT_STAT_JSON 0x00000002
#define WT_STAT_ON_CLOSE 0x00000004
@@ -90,28 +92,29 @@
#define WT_VERB_COMPACT 0x00000008
#define WT_VERB_EVICT 0x00000010
#define WT_VERB_EVICTSERVER 0x00000020
-#define WT_VERB_FILEOPS 0x00000040
-#define WT_VERB_HANDLEOPS 0x00000080
-#define WT_VERB_LOG 0x00000100
-#define WT_VERB_LSM 0x00000200
-#define WT_VERB_LSM_MANAGER 0x00000400
-#define WT_VERB_METADATA 0x00000800
-#define WT_VERB_MUTEX 0x00001000
-#define WT_VERB_OVERFLOW 0x00002000
-#define WT_VERB_READ 0x00004000
-#define WT_VERB_REBALANCE 0x00008000
-#define WT_VERB_RECONCILE 0x00010000
-#define WT_VERB_RECOVERY 0x00020000
-#define WT_VERB_RECOVERY_PROGRESS 0x00040000
-#define WT_VERB_SALVAGE 0x00080000
-#define WT_VERB_SHARED_CACHE 0x00100000
-#define WT_VERB_SPLIT 0x00200000
-#define WT_VERB_TEMPORARY 0x00400000
-#define WT_VERB_THREAD_GROUP 0x00800000
-#define WT_VERB_TRANSACTION 0x01000000
-#define WT_VERB_VERIFY 0x02000000
-#define WT_VERB_VERSION 0x04000000
-#define WT_VERB_WRITE 0x08000000
+#define WT_VERB_EVICT_STUCK 0x00000040
+#define WT_VERB_FILEOPS 0x00000080
+#define WT_VERB_HANDLEOPS 0x00000100
+#define WT_VERB_LOG 0x00000200
+#define WT_VERB_LSM 0x00000400
+#define WT_VERB_LSM_MANAGER 0x00000800
+#define WT_VERB_METADATA 0x00001000
+#define WT_VERB_MUTEX 0x00002000
+#define WT_VERB_OVERFLOW 0x00004000
+#define WT_VERB_READ 0x00008000
+#define WT_VERB_REBALANCE 0x00010000
+#define WT_VERB_RECONCILE 0x00020000
+#define WT_VERB_RECOVERY 0x00040000
+#define WT_VERB_RECOVERY_PROGRESS 0x00080000
+#define WT_VERB_SALVAGE 0x00100000
+#define WT_VERB_SHARED_CACHE 0x00200000
+#define WT_VERB_SPLIT 0x00400000
+#define WT_VERB_TEMPORARY 0x00800000
+#define WT_VERB_THREAD_GROUP 0x01000000
+#define WT_VERB_TRANSACTION 0x02000000
+#define WT_VERB_VERIFY 0x04000000
+#define WT_VERB_VERSION 0x08000000
+#define WT_VERB_WRITE 0x10000000
#define WT_VISIBILITY_ERR 0x00000080
/*
* flags section: END
diff --git a/src/include/log.h b/src/include/log.h
index d9fea892c68..a6be3582b4d 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -163,7 +163,7 @@ struct __wt_logslot {
WT_CACHE_LINE_PAD_BEGIN
volatile int64_t slot_state; /* Slot state */
int64_t slot_unbuffered; /* Unbuffered data in this slot */
- int32_t slot_error; /* Error value */
+ int slot_error; /* Error value */
wt_off_t slot_start_offset; /* Starting file offset */
wt_off_t slot_last_offset; /* Last record offset */
WT_LSN slot_release_lsn; /* Slot release LSN */
@@ -254,6 +254,7 @@ struct __wt_log {
#define WT_SLOT_POOL 128
WT_LOGSLOT *active_slot; /* Active slot */
WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
+ int32_t pool_index; /* Index into slot pool */
size_t slot_buf_size; /* Buffer size for slots */
#ifdef HAVE_DIAGNOSTIC
uint64_t write_calls; /* Calls to log_write */
diff --git a/src/include/misc.i b/src/include/misc.i
index f36be32d6a2..d5692a3f9cf 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -11,11 +11,12 @@
* Wait on a mutex, optionally timing out.
*/
static inline void
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait(WT_SESSION_IMPL *session,
+ WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *))
{
bool notused;
- __wt_cond_wait_signal(session, cond, usecs, &notused);
+ __wt_cond_wait_signal(session, cond, usecs, run_func, &notused);
}
/*
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 727a690bb1c..06b8c4a3304 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -21,8 +21,8 @@ struct __wt_condvar {
int waiters; /* Numbers of waiters, or
-1 if signalled with no waiters. */
/*
- * The following fields are only used for automatically adjusting
- * condition variables. They could be in a separate structure.
+ * The following fields are used for automatically adjusting condition
+ * variable wait times.
*/
uint64_t min_wait; /* Minimum wait duration */
uint64_t max_wait; /* Maximum wait duration */
diff --git a/src/include/packing.i b/src/include/packing.i
index 17ca261bcfc..8ba3dd536ac 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -168,10 +168,15 @@ next: if (pack->cur == pack->end)
(int)(pack->end - pack->orig), pack->orig);
return (0);
case 'u':
- case 'U':
/* Special case for items with a size prefix. */
pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u';
return (0);
+ case 'U':
+ /*
+ * Don't change the type. 'U' is used internally, so this type
+ * was already changed to explicitly include the size.
+ */
+ return (0);
case 'b':
case 'h':
case 'i':
diff --git a/src/include/schema.h b/src/include/schema.h
index bb116e5cf2f..9a6e1e54e80 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -78,6 +78,14 @@ struct __wt_table {
*/
#define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1)
+/*
+ * Helpers for the locked state of the handle list and table locks.
+ *
+ * Each mask is the bitwise OR of the read and write flags for one lock, so
+ * a single test against the mask covers the lock being held in either mode
+ * (assumes F_ISSET is true when any bit of the mask is set -- confirm).
+ */
+#define WT_SESSION_LOCKED_HANDLE_LIST \
+ (WT_SESSION_LOCKED_HANDLE_LIST_READ | \
+ WT_SESSION_LOCKED_HANDLE_LIST_WRITE)
+#define WT_SESSION_LOCKED_TABLE \
+ (WT_SESSION_LOCKED_TABLE_READ | \
+ WT_SESSION_LOCKED_TABLE_WRITE)
+
/*
* WT_WITH_LOCK_WAIT --
* Wait for a lock, perform an operation, drop the lock.
@@ -85,7 +93,7 @@ struct __wt_table {
#define WT_WITH_LOCK_WAIT(session, lock, flag, op) do { \
if (F_ISSET(session, (flag))) { \
op; \
- } else { \
+ } else { \
__wt_spin_lock_track(session, lock); \
F_SET(session, (flag)); \
op; \
@@ -122,16 +130,46 @@ struct __wt_table {
&S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op)
/*
- * WT_WITH_HANDLE_LIST_LOCK --
- * Acquire the data handle list lock, perform an operation, drop the lock.
+ * WT_WITH_HANDLE_LIST_READ_LOCK --
+ * Acquire the data handle list lock in shared mode, perform an operation,
+ * drop the lock. The handle list lock is a read-write lock so the
+ * implementation is different to the other lock macros.
*
* Note: always waits because some operations need the handle list lock to
* discard handles, and we only expect it to be held across short
* operations.
*/
-#define WT_WITH_HANDLE_LIST_LOCK(session, op) \
- WT_WITH_LOCK_WAIT(session, \
- &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op)
+#define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \
+ op; \
+ } else { \
+ __wt_readlock(session, &S2C(session)->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ __wt_readunlock(session, &S2C(session)->dhandle_lock); \
+ } \
+} while (0)
+
+/*
+ * WT_WITH_HANDLE_LIST_WRITE_LOCK --
+ * Acquire the data handle list lock in exclusive mode, perform an
+ * operation, drop the lock. The handle list lock is a read-write lock so
+ * the implementation is different to the other lock macros.
+ *
+ * If the session's WT_SESSION_LOCKED_HANDLE_LIST_WRITE flag is already set,
+ * the operation is run without touching the lock. Otherwise the macro
+ * asserts the session does not have the read flag set (presumably because
+ * a held read lock cannot be upgraded in place -- confirm), takes the write
+ * lock, and keeps the write flag set only for the duration of the
+ * operation.
+ */
+#define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \
+ op; \
+ } else { \
+ WT_ASSERT(session, \
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\
+ __wt_writelock(session, &S2C(session)->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ __wt_writeunlock(session, &S2C(session)->dhandle_lock); \
+ } \
+} while (0)
/*
* WT_WITH_METADATA_LOCK --
@@ -165,22 +203,58 @@ struct __wt_table {
} while (0)
/*
- * WT_WITH_TABLE_LOCK, WT_WITH_TABLE_LOCK_NOWAIT --
+ * WT_WITH_TABLE_READ_LOCK, WT_WITH_TABLE_WRITE_LOCK,
+ * WT_WITH_TABLE_WRITE_LOCK_NOWAIT --
* Acquire the table lock, perform an operation, drop the lock.
+ * The table lock is a read-write lock so the implementation is different
+ * to most other lock macros.
+ *
+ * Note: readlock always waits because some operations need the table lock
+ * to discard handles, and we only expect it to be held across short
+ * operations.
*/
-#define WT_WITH_TABLE_LOCK(session, op) do { \
- WT_ASSERT(session, \
- F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \
- !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
- WT_WITH_LOCK_WAIT(session, \
- &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \
+#define WT_WITH_TABLE_READ_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE)) { \
+ op; \
+ } else { \
+ WT_ASSERT(session, \
+ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
+ __wt_readlock(session, &S2C(session)->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \
+ __wt_readunlock(session, &S2C(session)->table_lock); \
+ } \
+} while (0)
+
+/*
+ * WT_WITH_TABLE_WRITE_LOCK --
+ * Acquire the table lock in exclusive mode, perform an operation, drop
+ * the lock.
+ *
+ * If the session's WT_SESSION_LOCKED_TABLE_WRITE flag is already set, the
+ * operation is run without touching the lock. Otherwise the macro asserts
+ * the session holds neither the table read flag nor any of the
+ * WT_SESSION_LOCKED_HANDLE_LIST flags, takes the write lock, and keeps the
+ * write flag set only for the duration of the operation.
+ */
+#define WT_WITH_TABLE_WRITE_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \
+ op; \
+ } else { \
+ WT_ASSERT(session, \
+ !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \
+ WT_SESSION_LOCKED_HANDLE_LIST)); \
+ __wt_writelock(session, &S2C(session)->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ __wt_writeunlock(session, &S2C(session)->table_lock); \
+ } \
} while (0)
-#define WT_WITH_TABLE_LOCK_NOWAIT(session, ret, op) do { \
+#define WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, op) do { \
WT_ASSERT(session, \
- F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \
- !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
- WT_WITH_LOCK_NOWAIT(session, ret, \
- &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE) || \
+ !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \
+ WT_SESSION_LOCKED_HANDLE_LIST)); \
+ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \
+ op; \
+ } else if ((ret = __wt_try_writelock(session, \
+ &S2C(session)->table_lock)) == 0) { \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ op; \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ __wt_writeunlock(session, &S2C(session)->table_lock); \
+ } \
} while (0)
/*
@@ -192,19 +266,31 @@ struct __wt_table {
WT_CONNECTION_IMPL *__conn = S2C(session); \
bool __checkpoint_locked = \
F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \
- bool __handle_locked = \
- F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \
- bool __table_locked = \
- F_ISSET(session, WT_SESSION_LOCKED_TABLE); \
+ bool __handle_read_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ bool __handle_write_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ bool __table_read_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ); \
+ bool __table_write_locked = \
+ F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
bool __schema_locked = \
F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \
- if (__handle_locked) { \
- F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); \
- __wt_spin_unlock(session, &__conn->dhandle_lock); \
+ if (__handle_read_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
+ __wt_readunlock(session, &__conn->dhandle_lock); \
} \
- if (__table_locked) { \
- F_CLR(session, WT_SESSION_LOCKED_TABLE); \
- __wt_spin_unlock(session, &__conn->table_lock); \
+ if (__handle_write_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
+ __wt_writeunlock(session, &__conn->dhandle_lock); \
+ } \
+ if (__table_read_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \
+ __wt_readunlock(session, &__conn->table_lock); \
+ } \
+ if (__table_write_locked) { \
+ F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ __wt_writeunlock(session, &__conn->table_lock); \
} \
if (__schema_locked) { \
F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \
@@ -223,12 +309,20 @@ struct __wt_table {
__wt_spin_lock(session, &__conn->schema_lock); \
F_SET(session, WT_SESSION_LOCKED_SCHEMA); \
} \
- if (__table_locked) { \
- __wt_spin_lock(session, &__conn->table_lock); \
- F_SET(session, WT_SESSION_LOCKED_TABLE); \
+ if (__table_read_locked) { \
+ __wt_readlock(session, &__conn->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \
+ } \
+ if (__table_write_locked) { \
+ __wt_writelock(session, &__conn->table_lock); \
+ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \
+ } \
+ if (__handle_read_locked) { \
+ __wt_readlock(session, &__conn->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \
} \
- if (__handle_locked) { \
- __wt_spin_lock(session, &__conn->dhandle_lock); \
- F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \
+ if (__handle_write_locked) { \
+ __wt_writelock(session, &__conn->dhandle_lock); \
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \
} \
} while (0)
diff --git a/src/include/session.h b/src/include/session.h
index 7dd523aea26..085f871a34f 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -52,8 +52,6 @@ struct __wt_session_impl {
const char *lastop; /* Last operation */
uint32_t id; /* UID, offset in session array */
- WT_CONDVAR *cond; /* Condition variable */
-
WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
WT_DATA_HANDLE *dhandle; /* Current data handle */
diff --git a/src/include/stat.h b/src/include/stat.h
index fd3e3290d95..8b2e78a4ed5 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -392,9 +392,7 @@ struct __wt_connection_stats {
int64_t lock_checkpoint_count;
int64_t lock_checkpoint_wait_application;
int64_t lock_checkpoint_wait_internal;
- int64_t lock_handle_list_count;
- int64_t lock_handle_list_wait_application;
- int64_t lock_handle_list_wait_internal;
+ int64_t lock_handle_list_wait_eviction;
int64_t lock_metadata_count;
int64_t lock_metadata_wait_application;
int64_t lock_metadata_wait_internal;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 90989cc679d..c148e759299 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -576,8 +576,9 @@ struct __wt_cursor {
#define WT_CURSTD_OPEN 0x00200
#define WT_CURSTD_OVERWRITE 0x00400
#define WT_CURSTD_RAW 0x00800
-#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. */
-#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */
+#define WT_CURSTD_RAW_SEARCH 0x01000
+#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. */
#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
uint32_t flags;
#endif
@@ -1982,12 +1983,13 @@ struct __wt_connection {
* as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a
* list\, with values chosen from the following options: \c "api"\, \c
* "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
- * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\,
- * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
- * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\,
- * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\,
- * \c "version"\, \c "write"; default empty.}
+ * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c
+ * "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c
+ * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c
+ * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c
+ * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default
+ * empty.}
* @configend
* @errors
*/
@@ -2361,7 +2363,7 @@ struct __wt_connection {
* @config{exclusive, fail if the database already exists\, generally used with
* the \c create option., a boolean flag; default \c false.}
* @config{extensions, list of shared library extensions to load (using dlopen).
- * Any values specified to an library extension are passed to
+ * Any values specified to a library extension are passed to
* WT_CONNECTION::load_extension as the \c config parameter (for example\,
* <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings;
* default empty.}
@@ -2513,12 +2515,13 @@ struct __wt_connection {
* WiredTiger is configured with --enable-verbose. Options are given as a
* list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
* values chosen from the following options: \c "api"\, \c "block"\, \c
- * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
- * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c
- * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c
- * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c
- * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evict_stuck"\, \c
+ * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, \c
+ * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c
+ * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c
+ * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write";
+ * default empty.}
* @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to
* files. Ignored on non-Windows systems. Options are given as a list\, such
* as <code>"write_through=[data]"</code>. Configuring \c write_through requires
@@ -4593,240 +4596,236 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133
/*! lock: checkpoint lock internal thread wait time (usecs) */
#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134
-/*! lock: handle-list lock acquisitions */
-#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1135
-/*! lock: handle-list lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1136
-/*! lock: handle-list lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1137
+/*! lock: handle-list lock eviction thread wait time (usecs) */
+#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1135
/*! lock: metadata lock acquisitions */
-#define WT_STAT_CONN_LOCK_METADATA_COUNT 1138
+#define WT_STAT_CONN_LOCK_METADATA_COUNT 1136
/*! lock: metadata lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1139
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1137
/*! lock: metadata lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1140
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1138
/*! lock: schema lock acquisitions */
-#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1141
+#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1139
/*! lock: schema lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1142
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1140
/*! lock: schema lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1143
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1141
/*! lock: table lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_COUNT 1144
+#define WT_STAT_CONN_LOCK_TABLE_COUNT 1142
/*!
* lock: table lock application thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1145
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1143
/*!
* lock: table lock internal thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1146
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1144
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1147
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1145
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1148
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1146
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1149
+#define WT_STAT_CONN_LOG_SLOT_RACES 1147
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1148
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1151
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1149
/*! log: consolidated slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1152
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1150
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1153
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1151
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1154
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1152
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1155
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1153
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1156
+#define WT_STAT_CONN_LOG_FLUSH 1154
/*! log: log force write operations */
-#define WT_STAT_CONN_LOG_FORCE_WRITE 1157
+#define WT_STAT_CONN_LOG_FORCE_WRITE 1155
/*! log: log force write operations skipped */
-#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1158
+#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1156
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1159
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1157
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1160
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1158
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1161
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1159
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1162
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1160
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1163
+#define WT_STAT_CONN_LOG_SCANS 1161
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1164
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1162
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1165
+#define WT_STAT_CONN_LOG_WRITE_LSN 1163
/*! log: log server thread write LSN walk skipped */
-#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1166
+#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1164
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1167
+#define WT_STAT_CONN_LOG_SYNC 1165
/*! log: log sync time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DURATION 1168
+#define WT_STAT_CONN_LOG_SYNC_DURATION 1166
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1169
+#define WT_STAT_CONN_LOG_SYNC_DIR 1167
/*! log: log sync_dir time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1170
+#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1168
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1171
+#define WT_STAT_CONN_LOG_WRITES 1169
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1172
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1170
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1173
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1171
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1174
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1172
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1175
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1173
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1176
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1174
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1177
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1175
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1178
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1176
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1179
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1177
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1180
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1178
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1181
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1179
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1182
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1180
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1183
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1181
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1184
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1182
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1185
+#define WT_STAT_CONN_REC_PAGES 1183
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1186
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1184
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1187
+#define WT_STAT_CONN_REC_PAGE_DELETE 1185
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1188
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1186
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1189
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1187
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1190
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1188
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1191
+#define WT_STAT_CONN_SESSION_OPEN 1189
/*! session: table alter failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1192
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1190
/*! session: table alter successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1193
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1191
/*! session: table alter unchanged and skipped */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1194
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1192
/*! session: table compact failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1195
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1193
/*! session: table compact successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1196
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1194
/*! session: table create failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1197
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1195
/*! session: table create successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1198
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1196
/*! session: table drop failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1199
+#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1197
/*! session: table drop successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1200
+#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1198
/*! session: table rebalance failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1201
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1199
/*! session: table rebalance successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1202
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1200
/*! session: table rename failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1203
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1201
/*! session: table rename successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1204
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1202
/*! session: table salvage failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1205
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1203
/*! session: table salvage successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1206
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1204
/*! session: table truncate failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1207
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1205
/*! session: table truncate successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1208
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1206
/*! session: table verify failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1209
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1207
/*! session: table verify successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1210
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1208
/*! thread-state: active filesystem fsync calls */
-#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1211
+#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1209
/*! thread-state: active filesystem read calls */
-#define WT_STAT_CONN_THREAD_READ_ACTIVE 1212
+#define WT_STAT_CONN_THREAD_READ_ACTIVE 1210
/*! thread-state: active filesystem write calls */
-#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1213
+#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1211
/*! thread-yield: application thread time evicting (usecs) */
-#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1214
+#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1212
/*! thread-yield: application thread time waiting for cache (usecs) */
-#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1215
+#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1213
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1216
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1214
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1217
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1215
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1218
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1216
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1219
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1217
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1220
+#define WT_STAT_CONN_PAGE_SLEEP 1218
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1221
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1219
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1222
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1220
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1223
+#define WT_STAT_CONN_TXN_BEGIN 1221
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1224
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1222
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1225
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1223
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1226
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1224
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1227
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1225
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1228
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1226
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1229
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1227
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1230
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1228
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1231
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1229
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1232
+#define WT_STAT_CONN_TXN_CHECKPOINT 1230
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1233
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1231
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1234
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1232
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1235
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1233
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1236
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1234
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1237
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1235
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1238
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1236
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1239
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1237
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1240
+#define WT_STAT_CONN_TXN_SYNC 1238
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1241
+#define WT_STAT_CONN_TXN_COMMIT 1239
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1242
+#define WT_STAT_CONN_TXN_ROLLBACK 1240
/*!
* @}
diff --git a/src/log/log.c b/src/log/log.c
index da500a74e87..d6caa55f8c7 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -43,11 +43,11 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
*/
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
- __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
if (++yield_count < WT_THOUSAND)
__wt_yield();
else
- __wt_cond_wait(session, log->log_write_cond, 200);
+ __wt_cond_wait(session, log->log_write_cond, 200, NULL);
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_lock(session, &log->log_slot_lock);
}
@@ -62,6 +62,8 @@ static int
__log_fs_write(WT_SESSION_IMPL *session,
WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf)
{
+ WT_DECL_RET;
+
/*
* If we're writing into a new log file, we have to wait for all
* writes to the previous log file to complete otherwise there could
@@ -71,7 +73,10 @@ __log_fs_write(WT_SESSION_IMPL *session,
__log_wait_for_earlier_slot(session, slot);
WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
}
- return (__wt_write(session, slot->slot_fh, offset, len, buf));
+ if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
+ WT_PANIC_MSG(session, ret,
+ "%s: fatal log failure", slot->slot_fh->name);
+ return (ret);
}
/*
@@ -89,7 +94,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
log = conn->log;
log->ckpt_lsn = *ckp_lsn;
if (conn->log_cond != NULL)
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
}
/*
@@ -170,7 +175,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
*/
while (log->sync_lsn.l.file < min_lsn->l.file) {
__wt_cond_signal(session, S2C(session)->log_file_cond);
- __wt_cond_wait(session, log->log_sync_cond, 10000);
+ __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
}
__wt_spin_lock(session, &log->log_sync_lock);
WT_ASSERT(session, log->log_dir_fh != NULL);
@@ -915,7 +920,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
else {
WT_STAT_CONN_INCR(session, log_prealloc_missed);
if (conn->log_cond != NULL)
- __wt_cond_auto_signal(
+ __wt_cond_signal(
session, conn->log_cond);
}
}
@@ -1490,7 +1495,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
*/
if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file ||
__wt_spin_trylock(session, &log->log_sync_lock) != 0) {
- __wt_cond_wait(session, log->log_sync_cond, 10000);
+ __wt_cond_wait(
+ session, log->log_sync_cond, 10000, NULL);
continue;
}
locked = true;
@@ -2126,7 +2132,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_STAT_CONN_INCR(session, log_writes);
- __wt_log_slot_join(session, rdup_len, flags, &myslot);
+ /*
+ * The only time joining a slot should ever return an error is if it
+ * detects a panic.
+ */
+ WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot));
/*
* If the addition of this record crosses the buffer boundary,
* switch in a new slot.
@@ -2160,7 +2170,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
* XXX I've seen times when conditions are NULL.
*/
if (conn->log_cond != NULL) {
- __wt_cond_auto_signal(session, conn->log_cond);
+ __wt_cond_signal(session, conn->log_cond);
__wt_yield();
} else
WT_ERR(__wt_log_force_write(session, 1, NULL));
@@ -2169,12 +2179,14 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
/* Wait for our writes to reach the OS */
while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
- __wt_cond_wait(session, log->log_write_cond, 10000);
+ __wt_cond_wait(
+ session, log->log_write_cond, 10000, NULL);
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
- __wt_cond_wait(session, log->log_sync_cond, 10000);
+ __wt_cond_wait(
+ session, log->log_sync_cond, 10000, NULL);
}
/*
@@ -2199,12 +2211,12 @@ err:
/*
* If one of the sync flags is set, assert the proper LSN has moved to
- * match.
+ * match on success.
*/
- WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) ||
+ WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) ||
__wt_log_cmp(&log->write_lsn, &lsn) >= 0);
- WT_ASSERT(session,
- !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
+ WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) ||
+ __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
return (ret);
}
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index a29a34e5652..542f010ea53 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -8,6 +8,49 @@
#include "wt_internal.h"
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __log_slot_dump --
+ * Dump the entire slot state.
+ *
+ * Diagnostic-only helper (compiled under HAVE_DIAGNOSTIC). Walks every
+ * entry of log->slot_pool (WT_SLOT_POOL entries) and reports, through
+ * __wt_errx, one group of messages per slot: the slot state and flags
+ * in hex, the start, end and release LSNs as file/offset pairs, the
+ * start and last offsets, and the unbuffered size and slot error.
+ *
+ * While walking, the index of the slot whose slot_release_lsn compares
+ * lowest (per __wt_log_cmp) is tracked, starting from slot 0, and is
+ * reported last as the "Earliest slot".
+ *
+ * The function takes no lock itself and returns nothing. It is called
+ * on the timeout paths of the slot close and new-slot loops, just
+ * before __wt_abort.
+ * NOTE(review): other threads may presumably still be updating slots
+ * while this runs, so the output is best treated as an approximate
+ * snapshot -- confirm against the callers' locking.
+ */
+static void
+__log_slot_dump(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int earliest, i;
+
+ conn = S2C(session);
+ log = conn->log;
+ earliest = 0;
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (__wt_log_cmp(&slot->slot_release_lsn,
+ &log->slot_pool[earliest].slot_release_lsn) < 0)
+ earliest = i;
+ __wt_errx(session, "Slot %d:", i);
+ __wt_errx(session, " State: %" PRIx64 " Flags: %" PRIx32,
+ slot->slot_state, slot->flags);
+ __wt_errx(session, " Start LSN: %" PRIu32 "/%" PRIu32,
+ slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset);
+ __wt_errx(session, " End LSN: %" PRIu32 "/%" PRIu32,
+ slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset);
+ __wt_errx(session, " Release LSN: %" PRIu32 "/%" PRIu32,
+ slot->slot_release_lsn.l.file,
+ slot->slot_release_lsn.l.offset);
+ __wt_errx(session, " Offset: start: %" PRIuMAX
+ " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset,
+ (uintmax_t)slot->slot_last_offset);
+ __wt_errx(session, " Unbuffered: %" PRId64
+ " error: %" PRId32, slot->slot_unbuffered,
+ slot->slot_error);
+ }
+ __wt_errx(session, "Earliest slot: %d", earliest);
+
+}
+#endif
+
/*
* __wt_log_slot_activate --
* Initialize a slot to become active.
@@ -21,7 +64,6 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
conn = S2C(session);
log = conn->log;
- slot->slot_state = 0;
/*
* !!! slot_release_lsn must be set outside this function because
* this function may be called after a log file switch and the
@@ -30,12 +72,19 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* set for closing the file handle on a log file switch. The flags
* are reset when the slot is freed. See log_slot_free.
*/
+ slot->slot_unbuffered = 0;
slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
slot->slot_start_offset = log->alloc_lsn.l.offset;
slot->slot_last_offset = log->alloc_lsn.l.offset;
slot->slot_fh = log->log_fh;
slot->slot_error = 0;
- slot->slot_unbuffered = 0;
+ WT_DIAGNOSTIC_YIELD;
+ /*
+ * Set the slot state last. Other threads may have a stale pointer
+ * to this slot and could try to alter the state and other fields once
+ * they see the state cleared.
+ */
+ WT_PUBLISH(slot->slot_state, 0);
}
/*
@@ -50,6 +99,10 @@ __log_slot_close(
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
int64_t end_offset, new_state, old_state;
+#ifdef HAVE_DIAGNOSTIC
+ struct timespec begin, now;
+ int count;
+#endif
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
WT_ASSERT(session, releasep != NULL);
@@ -101,9 +154,33 @@ retry:
* that value. If the state is unbuffered, wait for the unbuffered
* size to be set.
*/
- while (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state) &&
- slot->slot_unbuffered == 0)
- __wt_yield();
+#ifdef HAVE_DIAGNOSTIC
+ count = 0;
+ __wt_epoch(session, &begin);
+#endif
+ if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) {
+ while (slot->slot_unbuffered == 0) {
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
+ __wt_yield();
+#ifdef HAVE_DIAGNOSTIC
+ ++count;
+ if (count > WT_MILLION) {
+ __wt_epoch(session, &now);
+ if (WT_TIMEDIFF_SEC(now, begin) > 10) {
+ __wt_errx(session, "SLOT_CLOSE: Slot %"
+ PRIu32 " Timeout unbuffered, state 0x%"
+ PRIx64 " unbuffered %" PRIu64,
+ (uint32_t)(slot - &log->slot_pool[0]),
+ slot->slot_state,
+ slot->slot_unbuffered);
+ __log_slot_dump(session);
+ __wt_abort(session);
+ }
+ count = 0;
+ }
+#endif
+ }
+ }
end_offset =
WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
@@ -218,7 +295,11 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
WT_LOGSLOT *slot;
- int32_t i;
+ int32_t i, pool_i;
+#ifdef HAVE_DIAGNOSTIC
+ struct timespec begin, now;
+ int count;
+#endif
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
@@ -232,16 +313,22 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
WT_LOG_SLOT_OPEN(slot->slot_state))
return (0);
+#ifdef HAVE_DIAGNOSTIC
+ count = 0;
+ __wt_epoch(session, &begin);
+#endif
/*
* Keep trying until we can find a free slot.
*/
for (;;) {
/*
- * For now just restart at 0. We could use log->pool_index
- * if that is inefficient.
+ * Rotate among the slots to lessen collisions.
*/
- for (i = 0; i < WT_SLOT_POOL; i++) {
- slot = &log->slot_pool[i];
+ for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL;
+ i++, pool_i++) {
+ if (pool_i >= WT_SLOT_POOL)
+ pool_i = 0;
+ slot = &log->slot_pool[pool_i];
if (slot->slot_state == WT_LOG_SLOT_FREE) {
/*
* Acquire our starting position in the
@@ -256,14 +343,28 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
WT_STAT_CONN_INCR(session,
log_slot_transitions);
log->active_slot = slot;
+ log->pool_index = pool_i;
return (0);
}
}
/*
* If we didn't find any free slots signal the worker thread.
*/
- __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
__wt_yield();
+#ifdef HAVE_DIAGNOSTIC
+ ++count;
+ if (count > WT_MILLION) {
+ __wt_epoch(session, &now);
+ if (WT_TIMEDIFF_SEC(now, begin) > 10) {
+ __wt_errx(session,
+ "SLOT_NEW: Timeout free slot");
+ __log_slot_dump(session);
+ __wt_abort(session);
+ }
+ count = 0;
+ }
+#endif
}
/* NOTREACHED */
}
@@ -311,10 +412,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
/*
* We cannot initialize the release LSN in the activate function
* because that function can be called after a log file switch.
+ * The release LSN is usually the same as the slot_start_lsn except
+ * around a log file switch.
*/
slot->slot_release_lsn = log->alloc_lsn;
__wt_log_slot_activate(session, slot);
log->active_slot = slot;
+ log->pool_index = 0;
if (0) {
err: while (--i >= 0)
@@ -361,7 +465,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session)
* __wt_log_slot_join --
* Join a consolidated logging slot.
*/
-void
+int
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
uint32_t flags, WT_MYSLOT *myslot)
{
@@ -370,53 +474,63 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
WT_LOGSLOT *slot;
int64_t flag_state, new_state, old_state, released;
int32_t join_offset, new_join;
-#ifdef HAVE_DIAGNOSTIC
- bool unbuf_force;
-#endif
+ bool unbuffered, yld;
conn = S2C(session);
log = conn->log;
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ WT_ASSERT(session, mysize != 0);
/*
* There should almost always be a slot open.
*/
+ unbuffered = false;
#ifdef HAVE_DIAGNOSTIC
- unbuf_force = (++log->write_calls % WT_THOUSAND) == 0;
+ yld = (++log->write_calls % 7) == 0;
+ if ((log->write_calls % WT_THOUSAND) == 0 ||
+ mysize > WT_LOG_SLOT_BUF_MAX) {
+#else
+ yld = false;
+ if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
+ unbuffered = true;
+ F_SET(myslot, WT_MYSLOT_UNBUFFERED);
+ }
for (;;) {
WT_BARRIER();
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
slot = log->active_slot;
old_state = slot->slot_state;
- /*
- * Try to join our size into the existing size and
- * atomically write it back into the state.
- */
- flag_state = WT_LOG_SLOT_FLAGS(old_state);
- released = WT_LOG_SLOT_RELEASED(old_state);
- join_offset = WT_LOG_SLOT_JOINED(old_state);
-#ifdef HAVE_DIAGNOSTIC
- if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) {
-#else
- if (mysize > WT_LOG_SLOT_BUF_MAX) {
-#endif
- new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
- F_SET(myslot, WT_MYSLOT_UNBUFFERED);
- myslot->slot = slot;
- } else
- new_join = join_offset + (int32_t)mysize;
- new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
- (int64_t)new_join, (int64_t)released, (int64_t)flag_state);
-
- /*
- * Check if the slot is open for joining and we are able to
- * swap in our size into the state.
- */
- if (WT_LOG_SLOT_OPEN(old_state) &&
- __wt_atomic_casiv64(
- &slot->slot_state, old_state, new_state))
- break;
+ if (WT_LOG_SLOT_OPEN(old_state)) {
+ /*
+ * Try to join our size into the existing size and
+ * atomically write it back into the state.
+ */
+ flag_state = WT_LOG_SLOT_FLAGS(old_state);
+ released = WT_LOG_SLOT_RELEASED(old_state);
+ join_offset = WT_LOG_SLOT_JOINED(old_state);
+ if (unbuffered)
+ new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
+ else
+ new_join = join_offset + (int32_t)mysize;
+ new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
+ (int64_t)new_join, (int64_t)released,
+ (int64_t)flag_state);
+
+ /*
+ * Braces used due to potential empty body warning.
+ */
+ if (yld) {
+ WT_DIAGNOSTIC_YIELD;
+ }
+ /*
+ * Attempt to swap our size into the state.
+ */
+ if (__wt_atomic_casiv64(
+ &slot->slot_state, old_state, new_state))
+ break;
+ }
/*
* The slot is no longer open or we lost the race to
* update it. Yield and try again.
@@ -428,8 +542,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
* We joined this slot. Fill in our information to return to
* the caller.
*/
- if (mysize != 0)
- WT_STAT_CONN_INCR(session, log_slot_joins);
+ WT_STAT_CONN_INCR(session, log_slot_joins);
if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
F_SET(slot, WT_SLOT_SYNC_DIR);
if (LF_ISSET(WT_LOG_FLUSH))
@@ -444,6 +557,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
myslot->slot = slot;
myslot->offset = join_offset;
myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
+ return (0);
}
/*
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index a2511f48e2b..60afbc99ade 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -1692,8 +1692,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
bulk = cval.val != 0;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree));
+ ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree);
+
/*
* Check whether the exclusive open for a bulk load succeeded, and
* if it did ensure that it's safe to bulk load into the tree.
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index cbd83a5cd30..6dc06146179 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -387,8 +387,8 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
__wt_sleep(0, 10000);
if (TAILQ_EMPTY(&conn->lsmqh))
continue;
- __wt_spin_lock(session, &conn->dhandle_lock);
- F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST);
+ __wt_readlock(session, &conn->dhandle_lock);
+ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ);
dhandle_locked = true;
TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
if (!lsm_tree->active)
@@ -448,14 +448,14 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
session, WT_LSM_WORK_MERGE, 0, lsm_tree));
}
}
- __wt_spin_unlock(session, &conn->dhandle_lock);
- F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST);
+ __wt_readunlock(session, &conn->dhandle_lock);
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ);
dhandle_locked = false;
}
err: if (dhandle_locked) {
- __wt_spin_unlock(session, &conn->dhandle_lock);
- F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST);
+ __wt_readunlock(session, &conn->dhandle_lock);
+ F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ);
}
return (ret);
}
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index 150de968722..21e8991be94 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -33,9 +33,7 @@ __curstat_lsm_init(
"checkpoint=" WT_CHECKPOINT, NULL, NULL };
locked = false;
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree));
WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
/* Propagate all, fast and/or clear to the cursors we open. */
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 71a981a6284..a9275976023 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -38,7 +38,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
/* We may be destroying an lsm_tree before it was added. */
if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
WT_ASSERT(session, final ||
- F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
}
@@ -321,9 +321,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
metadata = NULL;
/* If the tree can be opened, it already exists. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
- if (ret == 0) {
+ if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) {
__wt_lsm_tree_release(session, lsm_tree);
return (exclusive ? EEXIST : 0);
}
@@ -339,7 +337,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
* error: the returned handle is NULL on error, and the metadata
* tracking macros handle cleaning up on failure.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __lsm_tree_open(session, uri, true, &lsm_tree));
if (ret == 0)
__wt_lsm_tree_release(session, lsm_tree);
@@ -404,6 +402,9 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
}
*treep = lsm_tree;
+
+ WT_ASSERT(session, lsm_tree->excl_session ==
+ (exclusive ? session : NULL));
return (0);
}
@@ -456,7 +457,8 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
conn = S2C(session);
lsm_tree = NULL;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
/* Start the LSM manager thread if it isn't running. */
if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1))
@@ -520,14 +522,21 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session,
{
WT_DECL_RET;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
-
- ret = __lsm_tree_find(session, uri, exclusive, treep);
+ /*
+ * Dropping and re-acquiring the lock is safe here, since the tree open
+ * call checks to see if another thread beat it to opening the tree
+ * before proceeding.
+ */
+ if (exclusive)
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
+ ret = __lsm_tree_find(session, uri, exclusive, treep));
+ else
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ ret = __lsm_tree_find(session, uri, exclusive, treep));
if (ret == WT_NOTFOUND)
- ret = __lsm_tree_open(session, uri, exclusive, treep);
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
+ ret = __lsm_tree_open(session, uri, exclusive, treep));
- WT_ASSERT(session, ret != 0 ||
- (*treep)->excl_session == (exclusive ? session : NULL));
return (ret);
}
@@ -857,9 +866,7 @@ __wt_lsm_tree_alter(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree));
/* Prevent any new opens. */
__wt_lsm_tree_writelock(session, lsm_tree);
@@ -899,9 +906,7 @@ __wt_lsm_tree_drop(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
WT_ASSERT(session, !lsm_tree->active);
/* Prevent any new opens. */
@@ -934,7 +939,7 @@ __wt_lsm_tree_drop(
WT_ASSERT(session, !lsm_tree->active);
err: if (locked)
__wt_lsm_tree_writeunlock(session, lsm_tree);
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __lsm_tree_discard(session, lsm_tree, false));
WT_TRET(tret);
return (ret);
@@ -960,9 +965,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree));
/* Prevent any new opens. */
__wt_lsm_tree_writelock(session, lsm_tree);
@@ -1007,7 +1010,7 @@ err: if (locked)
* Discard this LSM tree structure. The first operation on the renamed
* tree will create a new one.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __lsm_tree_discard(session, lsm_tree, false));
WT_TRET(tret);
return (ret);
@@ -1032,9 +1035,7 @@ __wt_lsm_tree_truncate(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
/* Prevent any new opens. */
__wt_lsm_tree_writelock(session, lsm_tree);
@@ -1068,7 +1069,7 @@ err: if (locked)
* the last good version of the metadata will be used, resulting
* in a valid (not truncated) tree.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
tret = __lsm_tree_discard(session, lsm_tree, false));
WT_TRET(tret);
}
@@ -1157,9 +1158,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
/* Tell __wt_schema_worker not to look inside the LSM tree. */
*skipp = true;
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, name, false, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree));
if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
WT_ERR_MSG(session, EINVAL,
@@ -1356,9 +1355,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
locked = false;
exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
- WT_RET(ret);
+ WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
/*
* We mark that we're busy using the tree to coordinate
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index d9c185a3f58..4349acf7b55 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -276,7 +276,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
!F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
!chunk->evicted) {
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __lsm_discard_handle(session, chunk->uri, NULL));
if (ret == 0)
chunk->evicted = 1;
@@ -517,7 +517,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
*
* This will fail with EBUSY if the file is still in use.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT));
WT_RET(ret);
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index b0d0758775d..ffa00c0a5e7 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -154,7 +154,7 @@ __lsm_worker(void *arg)
/* Don't busy wait if there was any work to do. */
if (!progress) {
- __wt_cond_wait(session, cookie->work_cond, 10000);
+ __wt_cond_wait(session, cookie->work_cond, 10000, NULL);
continue;
}
}
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index be8b1abda31..a5ee78f9e3e 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -13,8 +13,7 @@
* Allocate and initialize a condition variable.
*/
int
-__wt_cond_alloc(WT_SESSION_IMPL *session,
- const char *name, bool is_signalled, WT_CONDVAR **condp)
+__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
WT_DECL_RET;
@@ -27,7 +26,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
WT_ERR(pthread_cond_init(&cond->cond, NULL));
cond->name = name;
- cond->waiters = is_signalled ? -1 : 0;
+ cond->waiters = 0;
*condp = cond;
return (0);
@@ -42,8 +41,8 @@ err: __wt_free(session, cond);
* out period expires, let the caller know.
*/
void
-__wt_cond_wait_signal(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled)
+__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
+ uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled)
{
struct timespec ts;
WT_DECL_RET;
@@ -62,6 +61,23 @@ __wt_cond_wait_signal(
WT_ERR(pthread_mutex_lock(&cond->mtx));
locked = true;
+ /*
+ * It's possible to race with threads waking us up. That's not a problem
+ * if there are multiple wakeups because the next wakeup will get us, or
+ * if we're only pausing for a short period. It's a problem if there's
+ * only a single wakeup, our waker is likely waiting for us to exit.
+ * After acquiring the mutex (so we're guaranteed to be awakened by any
+ * future wakeup call), optionally check if we're OK to keep running.
+ * This won't ensure our caller won't just loop and call us again, but
+ * at least it's not our fault.
+ *
+ * Assert we're not waiting longer than a second if not checking the
+ * run status.
+ */
+ WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION);
+ if (run_func != NULL && !run_func(session))
+ goto skipping;
+
if (usecs > 0) {
__wt_epoch(session, &ts);
ts.tv_sec += (time_t)
@@ -81,7 +97,7 @@ __wt_cond_wait_signal(
ret == ETIME ||
#endif
ret == ETIMEDOUT) {
- *signalled = false;
+skipping: *signalled = false;
ret = 0;
}
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 79c62ccd7f2..0001c6c2322 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -13,8 +13,7 @@
* Allocate and initialize a condition variable.
*/
int
-__wt_cond_alloc(WT_SESSION_IMPL *session,
- const char *name, bool is_signalled, WT_CONDVAR **condp)
+__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
@@ -26,7 +25,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
InitializeConditionVariable(&cond->cond);
cond->name = name;
- cond->waiters = is_signalled ? -1 : 0;
+ cond->waiters = 0;
*condp = cond;
return (0);
@@ -38,8 +37,8 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
* out period expires, let the caller know.
*/
void
-__wt_cond_wait_signal(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled)
+__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
+ uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled)
{
BOOL sleepret;
DWORD milliseconds, windows_error;
@@ -59,8 +58,26 @@ __wt_cond_wait_signal(
EnterCriticalSection(&cond->mtx);
locked = true;
+ /*
+ * It's possible to race with threads waking us up. That's not a problem
+ * if there are multiple wakeups because the next wakeup will get us, or
+ * if we're only pausing for a short period. It's a problem if there's
+ * only a single wakeup, our waker is likely waiting for us to exit.
+ * After acquiring the mutex (so we're guaranteed to be awakened by any
+ * future wakeup call), optionally check if we're OK to keep running.
+ * This won't ensure our caller won't just loop and call us again, but
+ * at least it's not our fault.
+ *
+ * Assert we're not waiting longer than a second if not checking the
+ * run status.
+ */
+ WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION);
+
+ if (run_func != NULL && !run_func(session))
+ goto skipping;
+
if (usecs > 0) {
- milliseconds64 = usecs / 1000;
+ milliseconds64 = usecs / WT_THOUSAND;
/*
* Check for 32-bit unsigned integer overflow
@@ -90,7 +107,7 @@ __wt_cond_wait_signal(
if (sleepret == 0) {
windows_error = __wt_getlasterror();
if (windows_error == ERROR_TIMEOUT) {
- *signalled = false;
+skipping: *signalled = false;
sleepret = 1;
}
}
@@ -117,17 +134,17 @@ void
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
WT_DECL_RET;
- bool locked;
-
- locked = false;
__wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name);
/*
- * Our callers are often setting flags to cause a thread to exit. Add
- * a barrier to ensure the flags are seen by the threads.
+ * Our callers often set flags to cause a thread to exit. Add a barrier
+ * to ensure exit flags are seen by the sleeping threads, otherwise we
+ * can wake up a thread, it immediately goes back to sleep, and we'll
+ * hang. Use a full barrier (we may not write before waiting on thread
+ * join).
*/
- WT_WRITE_BARRIER();
+ WT_FULL_BARRIER();
/*
* Fast path if we are in (or can enter), a state where the next waiter
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index c1a4f257648..49801e4e5f9 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -30,7 +30,7 @@ __drop_file(
WT_RET(__wt_schema_backup_check(session, filename));
/* Close all btree handles associated with this file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, uri, force));
WT_RET(ret);
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index ea7374b7554..74ef5135a4a 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -25,7 +25,7 @@ __schema_add_table(WT_SESSION_IMPL *session,
/* Make sure the metadata is open before getting other locks. */
WT_RET(__wt_metadata_cursor(session, NULL));
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_READ_LOCK(session,
ret = __wt_schema_open_table(
session, name, namelen, ok_incomplete, &table));
WT_RET(ret);
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index f512482c162..a374f4c2831 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -33,7 +33,7 @@ __rename_file(
WT_RET(__wt_schema_backup_check(session, filename));
WT_RET(__wt_schema_backup_check(session, newfile));
/* Close any btree handles in the file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __wt_conn_dhandle_close_all(session, uri, false));
WT_ERR(ret);
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index fb7f8cec074..e5f71b5d56f 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
* any open file handles, including checkpoints.
*/
if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) {
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
ret = __wt_conn_dhandle_close_all(
session, uri, false));
WT_ERR(ret);
diff --git a/src/session/session_api.c b/src/session/session_api.c
index fcbfa8809b3..d282c5d0c32 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -162,7 +162,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config)
cfg[1] = NULL;
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_alter(session, uri, cfg))));
err: if (ret != 0)
@@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config)
/* Release common session resources. */
WT_TRET(__wt_session_release_resources(session));
- /* Destroy the thread's mutex. */
- WT_TRET(__wt_cond_destroy(session, &session->cond));
-
/* The API lock protects opening and closing of sessions. */
__wt_spin_lock(session, &conn->api_lock);
@@ -521,7 +518,7 @@ __wt_session_create(
WT_DECL_RET;
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_create(session, uri, config)));
return (ret);
}
@@ -769,7 +766,7 @@ __session_rename(WT_SESSION *wt_session,
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_rename(session, uri, newuri, cfg))));
err: if (ret != 0)
@@ -858,21 +855,22 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
if (lock_wait)
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session, ret =
+ WT_WITH_TABLE_WRITE_LOCK(session, ret =
__wt_schema_drop(session, uri, cfg))));
else
WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret,
WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret,
- WT_WITH_TABLE_LOCK_NOWAIT(session, ret, ret =
+ WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret,
+ ret =
__wt_schema_drop(session, uri, cfg))));
} else {
if (lock_wait)
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_WRITE_LOCK(session,
ret = __wt_schema_drop(session, uri, cfg)));
else
WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret,
- WT_WITH_TABLE_LOCK_NOWAIT(session, ret,
+ WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret,
ret = __wt_schema_drop(session, uri, cfg)));
}
@@ -1489,6 +1487,20 @@ err: API_END_RET(session, ret);
}
/*
+ * __transaction_sync_run_chk --
+ *	Report whether a transaction sync call should keep waiting: true while
+ *	WT_CONN_LOG_SERVER_RUN is set in the connection's flags, false
+ *	otherwise.
+ */
+static bool
+__transaction_sync_run_chk(WT_SESSION_IMPL *session)
+{
+	return (FLD_ISSET(S2C(session)->flags, WT_CONN_LOG_SERVER_RUN));
+}
+
+/*
* __session_transaction_sync --
* WT_SESSION->transaction_sync method.
*/
@@ -1502,7 +1514,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
WT_SESSION_IMPL *session;
WT_TXN *txn;
struct timespec now, start;
- uint64_t timeout_ms, waited_ms;
+ uint64_t remaining_usec, timeout_ms, waited_ms;
bool forever;
session = (WT_SESSION_IMPL *)wt_session;
@@ -1555,22 +1567,20 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
__wt_epoch(session, &start);
/*
* Keep checking the LSNs until we find it is stable or we reach
- * our timeout.
+ * our timeout, or there's some other reason to quit.
*/
while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
+ if (!__transaction_sync_run_chk(session))
+ WT_ERR(ETIMEDOUT);
+
__wt_cond_signal(session, conn->log_file_cond);
__wt_epoch(session, &now);
waited_ms = WT_TIMEDIFF_MS(now, start);
- if (forever || waited_ms < timeout_ms)
- /*
- * Note, we will wait an increasing amount of time
- * each iteration, likely doubling. Also note that
- * the function timeout value is in usecs (we are
- * computing the wait time in msecs and passing that
- * in, unchanged, as the usecs to wait).
- */
- __wt_cond_wait(session, log->log_sync_cond, waited_ms);
- else
+ if (forever || waited_ms < timeout_ms) {
+ remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND;
+ __wt_cond_wait(session, log->log_sync_cond,
+ remaining_usec, __transaction_sync_run_chk);
+ } else
WT_ERR(ETIMEDOUT);
}
@@ -1825,8 +1835,6 @@ __open_session(WT_CONNECTION_IMPL *conn,
session_ret->name = NULL;
session_ret->id = i;
- WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond));
-
if (WT_SESSION_FIRST_USE(session_ret))
__wt_random_init(&session_ret->rnd);
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index f1251794b89..ee9bddbfc19 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -44,8 +44,7 @@ __session_discard_dhandle(
TAILQ_REMOVE(&session->dhandles, dhandle_cache, q);
TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq);
- (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1);
-
+ WT_DHANDLE_RELEASE(dhandle_cache->dhandle);
__wt_overwrite_and_free(session, dhandle_cache);
}
@@ -412,17 +411,27 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
/*
* __session_find_shared_dhandle --
* Search for a data handle in the connection and add it to a session's
- * cache. Since the data handle isn't locked, this must be called holding
- * the handle list lock, and we must increment the handle's reference
- * count before releasing it.
+ * cache. We must increment the handle's reference count while holding
+ * the handle list lock.
+ *
+ * The connection's handle list is searched under the read lock first. Any
+ * result other than WT_NOTFOUND is returned as is; on WT_NOTFOUND the
+ * handle is allocated under the write lock instead.
+ *
+ * NOTE(review): the read lock is released before the write lock is taken,
+ * so another thread could create the handle in between; presumably
+ * __wt_conn_dhandle_alloc re-checks for an existing handle -- confirm.
*/
static int
__session_find_shared_dhandle(
WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
- WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint));
- (void)__wt_atomic_add32(&session->dhandle->session_ref, 1);
- return (0);
+ WT_DECL_RET;
+
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0)
+ WT_DHANDLE_ACQUIRE(session->dhandle));
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
+ if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0)
+ WT_DHANDLE_ACQUIRE(session->dhandle));
+
+ return (ret);
}
/*
@@ -450,16 +459,16 @@ __session_get_dhandle(
* We didn't find a match in the session cache, search the shared
* handle list and cache the handle we find.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __session_find_shared_dhandle(session, uri, checkpoint));
- WT_RET(ret);
+ WT_RET(__session_find_shared_dhandle(session, uri, checkpoint));
/*
* Fixup the reference count on failure (we incremented the reference
* count while holding the handle-list lock).
*/
- if ((ret = __session_add_dhandle(session)) != 0)
- (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1);
+ if ((ret = __session_add_dhandle(session)) != 0) {
+ WT_DHANDLE_RELEASE(session->dhandle);
+ session->dhandle = NULL;
+ }
return (ret);
}
@@ -505,17 +514,15 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
* reopen handles in the meantime. A combination of the schema
* and handle list locks are used to enforce this.
*/
- if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) ||
- !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) {
+ if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) {
dhandle->excl_session = NULL;
dhandle->excl_ref = 0;
F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
__wt_writeunlock(session, &dhandle->rwlock);
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_session_get_btree(
- session, uri, checkpoint, cfg, flags)));
+ ret = __wt_session_get_btree(
+ session, uri, checkpoint, cfg, flags));
return (ret);
}
diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c
index a3ae67f5baa..600e5eab0ff 100644
--- a/src/support/cond_auto.c
+++ b/src/support/cond_auto.c
@@ -1,29 +1,9 @@
/*-
- * Public Domain 2014-2016 MongoDB, Inc.
- * Public Domain 2008-2014 WiredTiger, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
*
- * This is free and unencumbered software released into the public domain.
- *
- * Anyone is free to copy, modify, publish, use, compile, sell, or
- * distribute this software, either in source code form or as a compiled
- * binary, for any purpose, commercial or non-commercial, and by any
- * means.
- *
- * In jurisdictions that recognize copyright laws, the author or authors
- * of this software dedicate any and all copyright interest in the
- * software to the public domain. We make this dedication for the benefit
- * of the public at large and to the detriment of our heirs and
- * successors. We intend this dedication to be an overt act of
- * relinquishment in perpetuity of all present and future rights to this
- * software under copyright law.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
+ * See the file LICENSE for redistribution information.
*/
#include "wt_internal.h"
@@ -38,13 +18,12 @@
* Allocate and initialize an automatically adjusting condition variable.
*/
int
-__wt_cond_auto_alloc(
- WT_SESSION_IMPL *session, const char *name,
- bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp)
+__wt_cond_auto_alloc(WT_SESSION_IMPL *session,
+ const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
- WT_RET(__wt_cond_alloc(session, name, is_signalled, condp));
+ WT_RET(__wt_cond_alloc(session, name, condp));
cond = *condp;
cond->min_wait = min;
@@ -55,33 +34,19 @@ __wt_cond_auto_alloc(
}
/*
- * __wt_cond_auto_signal --
- * Signal a condition variable.
- */
-void
-__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
-{
-
- WT_ASSERT(session, cond->min_wait != 0);
- __wt_cond_signal(session, cond);
-}
-
-/*
* __wt_cond_auto_wait_signal --
* Wait on a mutex, optionally timing out. If we get it before the time
* out period expires, let the caller know.
- * TODO: Can this version of the API be removed, now that we have the
- * auto adjusting condition variables?
*/
void
-__wt_cond_auto_wait_signal(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled)
+__wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
+ bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled)
{
uint64_t delta;
/*
* Catch cases where this function is called with a condition variable
- * that was initialized non-auto.
+ * that wasn't initialized to do automatic adjustments.
*/
WT_ASSERT(session, cond->min_wait != 0);
@@ -94,7 +59,8 @@ __wt_cond_auto_wait_signal(
cond->max_wait, cond->prev_wait + delta);
}
- __wt_cond_wait_signal(session, cond, cond->prev_wait, signalled);
+ __wt_cond_wait_signal(
+ session, cond, cond->prev_wait, run_func, signalled);
if (progress || *signalled)
WT_STAT_CONN_INCR(session, cond_auto_wait_reset);
@@ -108,24 +74,10 @@ __wt_cond_auto_wait_signal(
* out period expires, let the caller know.
*/
void
-__wt_cond_auto_wait(
- WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress)
+__wt_cond_auto_wait(WT_SESSION_IMPL *session,
+ WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *))
{
- bool signalled;
-
- /*
- * Call the signal version so the wait period is reset if the
- * condition is woken explicitly.
- */
- __wt_cond_auto_wait_signal(session, cond, progress, &signalled);
-}
+ bool notused;
-/*
- * __wt_cond_auto_destroy --
- * Destroy a condition variable.
- */
-int
-__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
-{
- return (__wt_cond_destroy(session, condp));
+ __wt_cond_auto_wait_signal(session, cond, progress, run_func, &notused);
}
diff --git a/src/support/rand.c b/src/support/rand.c
index a5b229b9abc..4fae43edc8e 100644
--- a/src/support/rand.c
+++ b/src/support/rand.c
@@ -120,3 +120,15 @@ __wt_random(WT_RAND_STATE volatile * rnd_state)
return ((z << 16) + (w & 65535));
}
+
+/*
+ * __wt_random64 --
+ *	Return a 64-bit pseudo-random number.
+ *
+ *	The value is assembled from two successive 32-bit draws from
+ *	__wt_random: the first draw supplies the upper 32 bits and the second
+ *	draw the lower 32 bits.
+ */
+uint64_t
+__wt_random64(WT_RAND_STATE volatile * rnd_state)
+    WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+	uint64_t high, low;
+
+	/*
+	 * Make the two draws in separate statements. C leaves the evaluation
+	 * order of the operands of '+' unspecified, so calling the generator
+	 * twice inside one expression lets the compiler choose which draw
+	 * lands in which half, and the same seed could then produce different
+	 * values on different builds.
+	 */
+	high = (uint64_t)__wt_random(rnd_state);
+	low = __wt_random(rnd_state);
+
+	return ((high << 32) + low);
+}
diff --git a/src/support/stat.c b/src/support/stat.c
index 167d17137ce..fd38e1b79ee 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -759,9 +759,7 @@ static const char * const __stats_connection_desc[] = {
"lock: checkpoint lock acquisitions",
"lock: checkpoint lock application thread wait time (usecs)",
"lock: checkpoint lock internal thread wait time (usecs)",
- "lock: handle-list lock acquisitions",
- "lock: handle-list lock application thread wait time (usecs)",
- "lock: handle-list lock internal thread wait time (usecs)",
+ "lock: handle-list lock eviction thread wait time (usecs)",
"lock: metadata lock acquisitions",
"lock: metadata lock application thread wait time (usecs)",
"lock: metadata lock internal thread wait time (usecs)",
@@ -1044,9 +1042,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->lock_checkpoint_count = 0;
stats->lock_checkpoint_wait_application = 0;
stats->lock_checkpoint_wait_internal = 0;
- stats->lock_handle_list_count = 0;
- stats->lock_handle_list_wait_application = 0;
- stats->lock_handle_list_wait_internal = 0;
+ stats->lock_handle_list_wait_eviction = 0;
stats->lock_metadata_count = 0;
stats->lock_metadata_wait_application = 0;
stats->lock_metadata_wait_internal = 0;
@@ -1351,12 +1347,8 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, lock_checkpoint_wait_application);
to->lock_checkpoint_wait_internal +=
WT_STAT_READ(from, lock_checkpoint_wait_internal);
- to->lock_handle_list_count +=
- WT_STAT_READ(from, lock_handle_list_count);
- to->lock_handle_list_wait_application +=
- WT_STAT_READ(from, lock_handle_list_wait_application);
- to->lock_handle_list_wait_internal +=
- WT_STAT_READ(from, lock_handle_list_wait_internal);
+ to->lock_handle_list_wait_eviction +=
+ WT_STAT_READ(from, lock_handle_list_wait_eviction);
to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count);
to->lock_metadata_wait_application +=
WT_STAT_READ(from, lock_metadata_wait_application);
diff --git a/src/support/thread_group.c b/src/support/thread_group.c
index beb143e63e2..2b4b7ad4e61 100644
--- a/src/support/thread_group.c
+++ b/src/support/thread_group.c
@@ -259,7 +259,7 @@ __wt_thread_group_create(
__wt_rwlock_init(session, &group->lock);
WT_ERR(__wt_cond_alloc(
- session, "Thread group cond", false, &group->wait_cond));
+ session, "thread group cond", &group->wait_cond));
cond_alloced = true;
__wt_writelock(session, &group->lock);
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 660d37b17d5..e5e59c2b901 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -803,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
__wt_free(session, txn_global->states);
}
+
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+/*
+ * __wt_verbose_dump_txn --
+ * Output diagnostic information about the global transaction state.
+ *
+ * Compiled only when HAVE_DIAGNOSTIC or HAVE_VERBOSE is defined. Writes,
+ * through __wt_msg, the global transaction IDs and checkpoint fields, the
+ * session count, and then one line per session slot that has a transaction
+ * ID or a pinned ID set. Returns 0 once everything has been written; each
+ * __wt_msg call is wrapped in WT_RET (presumably returning early on
+ * failure -- confirm against the macro definition).
+ */
+int
+__wt_verbose_dump_txn(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN *txn;
+ WT_TXN_STATE *s;
+ const char *iso_tag;
+ uint64_t id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "transaction state dump"));
+
+ WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
+ WT_RET(__wt_msg(session,
+ "last running ID: %" PRIu64, txn_global->last_running));
+ WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
+ WT_RET(__wt_msg(session,
+ "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
+
+ WT_RET(__wt_msg(session, "checkpoint running? %s",
+ txn_global->checkpoint_running ? "yes" : "no"));
+ WT_RET(__wt_msg(session,
+ "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen));
+ WT_RET(__wt_msg(session,
+ "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned));
+ WT_RET(__wt_msg(session,
+ "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid));
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
+
+ WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
+
+ /*
+ * Walk each session transaction state and dump information. Accessing
+ * the content of session handles is not thread safe, so some
+ * information may change while traversing if other threads are active
+ * at the same time, which is OK since this is diagnostic code.
+ *
+ * Slot i of txn_global->states is paired with conn->sessions[i]; the
+ * walk covers the session_cnt slots counted above.
+ */
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /* Skip slots with neither a transaction ID nor a pinned ID set. */
+ if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
+ continue;
+
+ txn = &conn->sessions[i].txn;
+ iso_tag = "INVALID"; /* kept if no case below matches */
+ switch (txn->isolation) {
+ case WT_ISO_READ_COMMITTED:
+ iso_tag = "WT_ISO_READ_COMMITTED";
+ break;
+ case WT_ISO_READ_UNCOMMITTED:
+ iso_tag = "WT_ISO_READ_UNCOMMITTED";
+ break;
+ case WT_ISO_SNAPSHOT:
+ iso_tag = "WT_ISO_SNAPSHOT";
+ break;
+ }
+
+ WT_RET(__wt_msg(session,
+ "ID: %6" PRIu64
+ ", mod count: %u"
+ ", pinned ID: %" PRIu64
+ ", snap min: %" PRIu64
+ ", snap max: %" PRIu64
+ ", metadata pinned ID: %" PRIu64
+ ", flags: 0x%08" PRIx32
+ ", name: %s"
+ ", isolation: %s",
+ id,
+ txn->mod_count,
+ s->pinned_id,
+ txn->snap_min,
+ txn->snap_max,
+ s->metadata_pinned,
+ txn->flags,
+ conn->sessions[i].name == NULL ?
+ "EMPTY" : conn->sessions[i].name, /* unnamed sessions print as EMPTY */
+ iso_tag));
+ }
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+
+ return (0);
+}
+#endif
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 3b19162fd3d..3261c8089f4 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -525,6 +525,17 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session,
}
/*
+ * __checkpoint_fail_reset --
+ *	Reset the session's current btree after a failed checkpoint: flag the
+ *	tree as modified and clear its ckpt pointer.
+ */
+static void
+__checkpoint_fail_reset(WT_SESSION_IMPL *session)
+{
+	WT_BTREE *btree;
+
+	/* Resolve the tree once; the stores happen in the same order. */
+	btree = S2BT(session);
+	btree->modified = true;
+	btree->ckpt = NULL;
+}
+
+/*
* __txn_checkpoint --
* Checkpoint a database or a list of objects in the database.
*/
@@ -543,7 +554,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
void *saved_meta_next;
u_int i;
uint64_t fsync_duration_usecs;
- bool full, idle, logging, tracking;
+ bool failed, full, idle, logging, tracking;
const char *txn_cfg[] = { WT_CONFIG_BASE(session,
WT_SESSION_begin_transaction), "isolation=snapshot", NULL };
@@ -639,10 +650,9 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ASSERT(session, session->ckpt_handle_next == 0);
WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __checkpoint_apply_all(
- session, cfg, __wt_checkpoint_get_handles, NULL))));
+ WT_WITH_TABLE_READ_LOCK(session,
+ ret = __checkpoint_apply_all(
+ session, cfg, __wt_checkpoint_get_handles, NULL)));
WT_ERR(ret);
/*
@@ -825,12 +835,13 @@ err: /*
* overwritten the checkpoint, so what ends up on disk is not
* consistent.
*/
- if (ret != 0 && !conn->modified)
+ failed = ret != 0;
+ if (failed)
conn->modified = true;
session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
if (tracking)
- WT_TRET(__wt_meta_track_off(session, false, ret != 0));
+ WT_TRET(__wt_meta_track_off(session, false, failed));
cache->eviction_scrub_limit = 0.0;
WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
@@ -863,6 +874,13 @@ err: /*
for (i = 0; i < session->ckpt_handle_next; ++i) {
if (session->ckpt_handle[i] == NULL)
continue;
+ /*
+ * If the operation failed, mark all trees dirty so they are
+ * included if a future checkpoint can succeed.
+ */
+ if (failed)
+ WT_WITH_DHANDLE(session, session->ckpt_handle[i],
+ __checkpoint_fail_reset(session));
WT_WITH_DHANDLE(session, session->ckpt_handle[i],
WT_TRET(__wt_session_release_btree(session)));
}
@@ -1341,7 +1359,6 @@ __checkpoint_tree(
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
WT_LSN ckptlsn;
- int was_modified;
bool fake_ckpt;
WT_UNUSED(cfg);
@@ -1352,7 +1369,6 @@ __checkpoint_tree(
conn = S2C(session);
dhandle = session->dhandle;
fake_ckpt = false;
- was_modified = btree->modified;
/*
* Set the checkpoint LSN to the maximum LSN so that if logging is
@@ -1483,10 +1499,9 @@ err: /*
* If the checkpoint didn't complete successfully, make sure the
* tree is marked dirty.
*/
- if (ret != 0 && !btree->modified && was_modified) {
+ if (ret != 0) {
btree->modified = true;
- if (!S2C(session)->modified)
- S2C(session)->modified = true;
+ S2C(session)->modified = true;
}
__wt_meta_ckptlist_free(session, ckptbase);
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index 7ad295f421b..2931dc1ce82 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session,
WT_ITEM ckpt_snapshot_unused;
uint32_t ckpt_file, ckpt_offset;
u_int ckpt_nsnapshot_unused;
- const char *fmt = WT_UNCHECKED_STRING(IIIU);
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
&ckpt_file, &ckpt_offset,
@@ -297,7 +297,7 @@ __wt_txn_checkpoint_log(
uint8_t *end, *p;
size_t recsize;
uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
- const char *fmt = WT_UNCHECKED_STRING(IIIIU);
+ const char *fmt = WT_UNCHECKED_STRING(IIIIu);
txn = &session->txn;
ckpt_lsn = &txn->ckpt_lsn;
diff --git a/src/utilities/util.h b/src/utilities/util.h
index cf12d7d4aa6..93a96d44219 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -40,7 +40,6 @@ int util_flush(WT_SESSION *, const char *);
int util_list(WT_SESSION *, int, char *[]);
int util_load(WT_SESSION *, int, char *[]);
int util_loadtext(WT_SESSION *, int, char *[]);
-char *util_name(WT_SESSION *, const char *, const char *);
int util_printlog(WT_SESSION *, int, char *[]);
int util_read(WT_SESSION *, int, char *[]);
int util_read_line(WT_SESSION *, ULINE *, bool, bool *);
@@ -51,5 +50,6 @@ int util_stat(WT_SESSION *, int, char *[]);
int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop);
int util_truncate(WT_SESSION *, int, char *[]);
int util_upgrade(WT_SESSION *, int, char *[]);
+char *util_uri(WT_SESSION *, const char *, const char *);
int util_verify(WT_SESSION *, int, char *[]);
int util_write(WT_SESSION *, int, char *[]);
diff --git a/src/utilities/util_alter.c b/src/utilities/util_alter.c
index d228c15cd48..ef01a1ed826 100644
--- a/src/utilities/util_alter.c
+++ b/src/utilities/util_alter.c
@@ -34,9 +34,12 @@ util_alter(WT_SESSION *session, int argc, char *argv[])
for (configp = argv;
configp != NULL && *configp != NULL; configp += 2)
if ((ret = session->alter(
- session, configp[0], configp[1])) != 0)
- break;
- return (ret);
+ session, configp[0], configp[1])) != 0) {
+ (void)util_err(session, ret,
+ "session.alter: %s, %s", configp[0], configp[1]);
+ return (1);
+ }
+ return (0);
}
static int
diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c
index c114eb207fa..e469b4dce6e 100644
--- a/src/utilities/util_compact.c
+++ b/src/utilities/util_compact.c
@@ -30,21 +30,13 @@ util_compact(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->compact(session, uri, NULL)) != 0) {
- fprintf(stderr, "%s: compact(%s): %s\n",
- progname, uri, session->strerror(session, ret));
- goto err;
- }
-
- if (0) {
-err: ret = 1;
- }
+ if ((ret = session->compact(session, uri, NULL)) != 0)
+ (void)util_err(session, ret, "session.compact: %s", uri);
free(uri);
-
return (ret);
}
diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c
index 4e609736f2d..7c22a67792b 100644
--- a/src/utilities/util_create.c
+++ b/src/utilities/util_create.c
@@ -15,9 +15,9 @@ util_create(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- const char *config, *uri;
+ char *config, *uri;
- config = NULL;
+ config = uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF)
switch (ch) {
case 'c': /* command-line configuration */
@@ -35,12 +35,14 @@ util_create(WT_SESSION *session, int argc, char *argv[])
if (argc != 1)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
if ((ret = session->create(session, uri, config)) != 0)
- return (util_err(session, ret, "%s: session.create", uri));
- return (0);
+ (void)util_err(session, ret, "session.create: %s", uri);
+
+ free(uri);
+ return (ret);
}
static int
diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c
index ba41445dfb6..456005d445d 100644
--- a/src/utilities/util_drop.c
+++ b/src/utilities/util_drop.c
@@ -15,8 +15,9 @@ util_drop(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,12 +31,13 @@ util_drop(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- ret = session->drop(session, name, "force");
+ if ((ret = session->drop(session, uri, "force")) != 0)
+ (void)util_err(session, ret, "session.drop: %s", uri);
- free(name);
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index 3f8b4a49dfe..cded40a8b45 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -37,10 +37,10 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
size_t len;
int ch, i;
bool hex, json, reverse;
- char *checkpoint, *config, *name, *p, *simplename;
+ char *checkpoint, *config, *p, *simpleuri, *uri;
hex = json = reverse = false;
- checkpoint = config = name = simplename = NULL;
+ checkpoint = config = simpleuri = uri = NULL;
cursor = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF)
switch (ch) {
@@ -89,11 +89,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
if (json && i > 0)
if (dump_json_separator(session) != 0)
goto err;
- free(name);
- free(simplename);
- name = simplename = NULL;
+ free(uri);
+ free(simpleuri);
+ uri = simpleuri = NULL;
- if ((name = util_name(session, argv[i], "table")) == NULL)
+ if ((uri = util_uri(session, argv[i], "table")) == NULL)
goto err;
len =
@@ -113,19 +113,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
(void)strcat(config, json ? "dump=json" :
(hex ? "dump=hex" : "dump=print"));
if ((ret = session->open_cursor(
- session, name, NULL, config, &cursor)) != 0) {
+ session, uri, NULL, config, &cursor)) != 0) {
fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
- progname, name, session->strerror(session, ret));
+ progname, uri, session->strerror(session, ret));
goto err;
}
- if ((simplename = strdup(name)) == NULL) {
+ if ((simpleuri = strdup(uri)) == NULL) {
(void)util_err(session, errno, NULL);
goto err;
}
- if ((p = strchr(simplename, '(')) != NULL)
+ if ((p = strchr(simpleuri, '(')) != NULL)
*p = '\0';
- if (dump_config(session, simplename, cursor, hex, json) != 0)
+ if (dump_config(session, simpleuri, cursor, hex, json) != 0)
goto err;
if (dump_record(cursor, reverse, json) != 0)
@@ -148,8 +148,8 @@ err: ret = 1;
}
free(config);
- free(name);
- free(simplename);
+ free(uri);
+ free(simpleuri);
if (cursor != NULL && (ret = cursor->close(cursor)) != 0) {
(void)util_err(session, ret, NULL);
ret = 1;
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index e91dbfce05b..f19ba4d1f97 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -19,10 +19,10 @@ util_list(WT_SESSION *session, int argc, char *argv[])
WT_DECL_RET;
int ch;
bool cflag, vflag;
- char *name;
+ char *uri;
cflag = vflag = false;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF)
switch (ch) {
case 'c':
@@ -42,17 +42,16 @@ util_list(WT_SESSION *session, int argc, char *argv[])
case 0:
break;
case 1:
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
break;
default:
return (usage());
}
- ret = list_print(session, name, cflag, vflag);
-
- free(name);
+ ret = list_print(session, uri, cflag, vflag);
+ free(uri);
return (ret);
}
@@ -99,7 +98,7 @@ list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize)
* List the high-level objects in the database.
*/
static int
-list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
+list_print(WT_SESSION *session, const char *uri, bool cflag, bool vflag)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -120,7 +119,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
ret, "%s: WT_SESSION.open_cursor", WT_METADATA_URI));
}
- found = name == NULL;
+ found = uri == NULL;
while ((ret = cursor->next(cursor)) == 0) {
/* Get the key. */
if ((ret = cursor->get_key(cursor, &key)) != 0)
@@ -129,8 +128,8 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
/*
* If a name is specified, only show objects that match.
*/
- if (name != NULL) {
- if (!WT_PREFIX_MATCH(key, name))
+ if (uri != NULL) {
+ if (!WT_PREFIX_MATCH(key, uri))
continue;
found = true;
}
@@ -161,7 +160,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
if (ret != WT_NOTFOUND)
return (util_cerr(cursor, "next", ret));
if (!found) {
- fprintf(stderr, "%s: %s: not found\n", progname, name);
+ fprintf(stderr, "%s: %s: not found\n", progname, uri);
return (1);
}
diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c
index ac18df80851..ca77643eb49 100644
--- a/src/utilities/util_load.c
+++ b/src/utilities/util_load.c
@@ -126,7 +126,7 @@ load_dump(WT_SESSION *session)
append ? ",append" : "", no_overwrite ? ",overwrite=false" : "");
if ((ret = session->open_cursor(
session, uri, NULL, config, &cursor)) != 0) {
- ret = util_err(session, ret, "%s: session.open", uri);
+ ret = util_err(session, ret, "%s: session.open_cursor", uri);
goto err;
}
diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c
index 020a4ed9ba9..1189d49a483 100644
--- a/src/utilities/util_load_json.c
+++ b/src/utilities/util_load_json.c
@@ -242,7 +242,7 @@ json_data(WT_SESSION *session,
LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "");
if ((ret = session->open_cursor(
session, uri, NULL, config, &cursor)) != 0) {
- ret = util_err(session, ret, "%s: session.open", uri);
+ ret = util_err(session, ret, "%s: session.open_cursor", uri);
goto err;
}
keyformat = cursor->key_format;
diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c
index f9c5b6e9a1f..7602d43f8c9 100644
--- a/src/utilities/util_loadtext.c
+++ b/src/utilities/util_loadtext.c
@@ -15,9 +15,11 @@ static int usage(void);
int
util_loadtext(WT_SESSION *session, int argc, char *argv[])
{
+ WT_DECL_RET;
int ch;
- const char *uri;
+ char *uri;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF)
switch (ch) {
case 'f': /* input file */
@@ -35,10 +37,13 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- return (text(session, uri));
+ ret = text(session, uri);
+
+ free(uri);
+ return (ret);
}
/*
@@ -61,7 +66,7 @@ text(WT_SESSION *session, const char *uri)
*/
if ((ret = session->open_cursor(
session, uri, NULL, "append,overwrite", &cursor)) != 0)
- return (util_err(session, ret, "%s: session.open", uri));
+ return (util_err(session, ret, "%s: session.open_cursor", uri));
/*
* We're about to load strings, make sure the formats match.
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index 001a66d6d9e..7157f0d90fe 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -285,11 +285,11 @@ usage(void)
}
/*
- * util_name --
+ * util_uri --
* Build a name.
*/
char *
-util_name(WT_SESSION *session, const char *s, const char *type)
+util_uri(WT_SESSION *session, const char *s, const char *type)
{
size_t len;
char *name;
diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c
index e7fa2134934..5f3ed43905b 100644
--- a/src/utilities/util_printlog.c
+++ b/src/utilities/util_printlog.c
@@ -14,8 +14,8 @@ int
util_printlog(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
- int ch;
uint32_t flags;
+ int ch;
flags = 0;
while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF)
@@ -41,17 +41,9 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
if (argc != 0)
return (usage());
- ret = __wt_txn_printlog(session, flags);
-
- if (ret != 0) {
- fprintf(stderr, "%s: printlog failed: %s\n",
- progname, session->strerror(session, ret));
- goto err;
- }
+ if ((ret = __wt_txn_printlog(session, flags)) != 0)
+ (void)util_err(session, ret, "printlog");
- if (0) {
-err: ret = 1;
- }
return (ret);
}
diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c
index 2e766377aa9..393949b6a1c 100644
--- a/src/utilities/util_read.c
+++ b/src/utilities/util_read.c
@@ -18,8 +18,9 @@ util_read(WT_SESSION *session, int argc, char *argv[])
uint64_t recno;
int ch;
bool rkey, rval;
- const char *uri, *value;
+ char *uri, *value;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -32,13 +33,19 @@ util_read(WT_SESSION *session, int argc, char *argv[])
/* The remaining arguments are a uri followed by a list of keys. */
if (argc < 2)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- /* Open the object. */
- if ((ret = session->open_cursor(
- session, uri, NULL, NULL, &cursor)) != 0)
- return (util_err(session, ret, "%s: session.open", uri));
+ /*
+ * Open the object; free allocated memory immediately to simplify
+ * future error handling.
+ */
+ if ((ret =
+ session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0)
+ (void)util_err(session, ret, "%s: session.open_cursor", uri);
+ free(uri);
+ if (ret != 0)
+ return (ret);
/*
* A simple search only makes sense if the key format is a string or a
diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c
index 45f161487e5..c188ea17d22 100644
--- a/src/utilities/util_rebalance.c
+++ b/src/utilities/util_rebalance.c
@@ -15,9 +15,9 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,25 +30,21 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->rebalance(session, name, NULL)) != 0) {
- fprintf(stderr, "%s: rebalance(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->rebalance(session, uri, NULL)) != 0)
+ (void)util_err(session, ret, "session.rebalance: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(name);
-
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c
index aee299c6e63..bb2d40cd103 100644
--- a/src/utilities/util_rename.c
+++ b/src/utilities/util_rename.c
@@ -30,22 +30,15 @@ util_rename(WT_SESSION *session, int argc, char *argv[])
/* The remaining arguments are the object uri and new name. */
if (argc != 2)
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
newuri = argv[1];
- if ((ret = session->rename(session, uri, newuri, NULL)) != 0) {
- fprintf(stderr, "%s: rename %s to %s: %s\n",
- progname, uri, newuri, session->strerror(session, ret));
- goto err;
- }
-
- if (0) {
-err: ret = 1;
- }
+ if ((ret = session->rename(session, uri, newuri, NULL)) != 0)
+ (void)util_err(
+ session, ret, "session.rename: %s, %s", uri, newuri);
free(uri);
-
return (ret);
}
diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c
index 679d1074457..6cc2278b846 100644
--- a/src/utilities/util_salvage.c
+++ b/src/utilities/util_salvage.c
@@ -16,10 +16,10 @@ util_salvage(WT_SESSION *session, int argc, char *argv[])
WT_DECL_RET;
int ch;
const char *force;
- char *name;
+ char *uri;
force = NULL;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF)
switch (ch) {
case 'F':
@@ -35,25 +35,21 @@ util_salvage(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the file name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "file")) == NULL)
+ if ((uri = util_uri(session, *argv, "file")) == NULL)
return (1);
- if ((ret = session->salvage(session, name, force)) != 0) {
- fprintf(stderr, "%s: salvage(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->salvage(session, uri, force)) != 0)
+ (void)util_err(session, ret, "session.salvage: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(name);
-
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c
index 4376f559ceb..1b75d9ea8bf 100644
--- a/src/utilities/util_stat.c
+++ b/src/utilities/util_stat.c
@@ -55,7 +55,7 @@ util_stat(WT_SESSION *session, int argc, char *argv[])
objname = (char *)"";
break;
case 1:
- if ((objname = util_name(session, *argv, "table")) == NULL)
+ if ((objname = util_uri(session, *argv, "table")) == NULL)
return (1);
objname_free = true;
break;
@@ -82,8 +82,8 @@ util_stat(WT_SESSION *session, int argc, char *argv[])
(ret = cursor->next(cursor)) == 0 &&
(ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0)
if (printf("%s=%s\n", desc, pval) < 0) {
- ret = errno;
- break;
+ (void)util_err(session, errno, "printf");
+ goto err;
}
if (ret == WT_NOTFOUND)
ret = 0;
diff --git a/src/utilities/util_truncate.c b/src/utilities/util_truncate.c
index 9325c0d7e84..35de02345c8 100644
--- a/src/utilities/util_truncate.c
+++ b/src/utilities/util_truncate.c
@@ -15,8 +15,9 @@ util_truncate(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,13 +31,13 @@ util_truncate(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->truncate(session, name, NULL, NULL, NULL)) != 0)
- return (util_err(session, ret, "%s: session.truncate", name));
+ if ((ret = session->truncate(session, uri, NULL, NULL, NULL)) != 0)
+ (void)util_err(session, ret, "session.truncate: %s", uri);
- free(name);
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c
index 63b23f28c16..f89bd46e133 100644
--- a/src/utilities/util_upgrade.c
+++ b/src/utilities/util_upgrade.c
@@ -15,9 +15,9 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- char *name;
+ char *uri;
- name = NULL;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
switch (ch) {
case '?':
@@ -30,25 +30,21 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- if ((ret = session->upgrade(session, name, NULL)) != 0) {
- fprintf(stderr, "%s: upgrade(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->upgrade(session, uri, NULL)) != 0)
+ (void)util_err(session, ret, "session.upgrade: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(name);
-
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c
index 82bdd780cd3..d0587fcfc8c 100644
--- a/src/utilities/util_verify.c
+++ b/src/utilities/util_verify.c
@@ -17,10 +17,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
size_t size;
int ch;
bool dump_address, dump_blocks, dump_layout, dump_pages;
- char *config, *dump_offsets, *name;
+ char *config, *dump_offsets, *uri;
dump_address = dump_blocks = dump_layout = dump_pages = false;
- config = dump_offsets = name = NULL;
+ config = dump_offsets = uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "d:")) != EOF)
switch (ch) {
case 'd':
@@ -55,7 +55,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
/* Build the configuration string as necessary. */
@@ -69,7 +69,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
strlen("dump_offsets[],") +
(dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20;
if ((config = malloc(size)) == NULL) {
- (void)util_err(session, errno, NULL);
+ ret = util_err(session, errno, NULL);
goto err;
}
snprintf(config, size,
@@ -82,23 +82,19 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
dump_offsets != NULL ? "]," : "",
dump_pages ? "dump_pages," : "");
}
- if ((ret = session->verify(session, name, config)) != 0) {
- fprintf(stderr, "%s: verify(%s): %s\n",
- progname, name, session->strerror(session, ret));
- goto err;
+ if ((ret = session->verify(session, uri, config)) != 0)
+ (void)util_err(session, ret, "session.verify: %s", uri);
+ else {
+ /*
+ * Verbose configures a progress counter, move to the next
+ * line.
+ */
+ if (verbose)
+ printf("\n");
}
- /* Verbose configures a progress counter, move to the next line. */
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- free(config);
- free(name);
-
+err: free(config);
+ free(uri);
return (ret);
}
diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c
index 7d9bce02b36..b931fad064d 100644
--- a/src/utilities/util_write.c
+++ b/src/utilities/util_write.c
@@ -18,10 +18,10 @@ util_write(WT_SESSION *session, int argc, char *argv[])
uint64_t recno;
int ch;
bool append, overwrite, rkey;
- const char *uri;
- char config[100];
+ char *uri, config[100];
append = overwrite = false;
+ uri = NULL;
while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF)
switch (ch) {
case 'a':
@@ -47,15 +47,21 @@ util_write(WT_SESSION *session, int argc, char *argv[])
} else
if (argc < 3 || ((argc - 1) % 2 != 0))
return (usage());
- if ((uri = util_name(session, *argv, "table")) == NULL)
+ if ((uri = util_uri(session, *argv, "table")) == NULL)
return (1);
- /* Open the object. */
+ /*
+ * Open the object; free allocated memory immediately to simplify
+ * future error handling.
+ */
(void)snprintf(config, sizeof(config), "%s,%s",
append ? "append=true" : "", overwrite ? "overwrite=true" : "");
- if ((ret = session->open_cursor(
- session, uri, NULL, config, &cursor)) != 0)
- return (util_err(session, ret, "%s: session.open", uri));
+ if ((ret =
+ session->open_cursor(session, uri, NULL, config, &cursor)) != 0)
+ (void)util_err(session, ret, "%s: session.open_cursor", uri);
+ free(uri);
+ if (ret != 0)
+ return (ret);
/*
* A simple search only makes sense if the key format is a string or a
diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am
index a96492c1e71..e2b72532703 100644
--- a/test/csuite/Makefile.am
+++ b/test/csuite/Makefile.am
@@ -37,9 +37,21 @@ noinst_PROGRAMS += test_wt2834_join_bloom_fix
test_wt2853_perf_SOURCES = wt2853_perf/main.c
noinst_PROGRAMS += test_wt2853_perf
+test_wt2909_checkpoint_integrity_SOURCES = wt2909_checkpoint_integrity/main.c
+noinst_PROGRAMS += test_wt2909_checkpoint_integrity
+
test_wt2999_join_extractor_SOURCES = wt2999_join_extractor/main.c
noinst_PROGRAMS += test_wt2999_join_extractor
+test_wt3120_filesys_SOURCES = wt3120_filesys/main.c
+noinst_PROGRAMS += test_wt3120_filesys
+
+test_wt3135_search_near_collator_SOURCES = wt3135_search_near_collator/main.c
+noinst_PROGRAMS += test_wt3135_search_near_collator
+
+test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c
+noinst_PROGRAMS += test_wt3184_dup_index_collator
+
# Run this during a "make check" smoke test.
TESTS = $(noinst_PROGRAMS)
LOG_COMPILER = $(TEST_WRAPPER)
diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c
new file mode 100644
index 00000000000..ddf249fb406
--- /dev/null
+++ b/test/csuite/wt2909_checkpoint_integrity/main.c
@@ -0,0 +1,666 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+/*
+ * JIRA ticket reference: WT-2909
+ * Test case description:
+ *
+ * This test attempts to check the integrity of checkpoints by injecting
+ * failures (by means of a custom file system) and then trying to recover. To
+ * insulate the top level program from various crashes that may occur when
+ * injecting failures, the "populate" code runs in another process, and is
+ * expected to sometimes fail. Then the top level program runs recovery (with
+ * the normal file system) and checks the results. Any failure at the top level
+ * indicates a checkpoint integrity problem.
+ *
+ * Each subtest uses the same kind of schema and data, the only variance is
+ * when the faults are injected. At the moment, this test only injects during
+ * checkpoints, and only injects write failures. It varies in the number of
+ * successful writes that occur before an injected failure (during a checkpoint
+ * operation), this can be indicated with "-o N". When N is not specified, the
+ * test attempts to find the optimal range of N for testing. Clearly when N is
+ * large, then the checkpoint may be successfully written, and the data
+ * represented by the checkpoint will be fully present. When N is small,
+ * nothing of interest is written and no data is present. To find the sweet
+ * spot where interesting failures occur, the test does a binary search to find
+ * the approximate N that divides the "small" and "large" cases. This is not
+ * strictly deterministic, a given N may give different results on different
+ * runs. But approximate optimal N can be determined, allowing a series of
+ * additional tests clustered around this N.
+ *
+ * The data is stored in two tables, one having indices. Both tables have
+ * the same keys and are updated with the same key in a single transaction.
+ *
+ * Failure mode:
+ * If one table is out of step with the other, that is detected as a failure at
+ * the top level. If an index is missing values (or has extra values), that is
+ * likewise a failure at the top level. If the tables or the home directory
+ * cannot be opened, that is a top level error. The tables must be present
+ * as an initial checkpoint is done without any injected fault.
+ */
+
+/*
+ * This program does not run on Windows. The non-portable aspects, at minimum,
+ * are fork/exec, the use of environment variables (used by fail_fs), and the
+ * file names and build locations of dynamically loaded libraries.
+ */
+/* Length, excluding the terminating NUL, of the reference "big" string. */
+#define BIG_SIZE (1024 * 10)
+/* Pattern repeated to fill the reference "big" string. */
+#define BIG_CONTENTS "<Big String Contents>"
+/* Capacity of the argument vector built for the subtest process. */
+#define MAX_ARGS 20
+/* Upper bound of the binary search over operations-until-failure. */
+#define MAX_OP_RANGE 1000
+/* Files, under the home directory, that receive the subtest's output. */
+#define STDERR_FILE "stderr.txt"
+#define STDOUT_FILE "stdout.txt"
+/* Number of subtest runs for each operation count near the crossover. */
+#define TESTS_PER_OP_VALUE 3
+/* When verbose, progress is printed every this many records. */
+#define VERBOSE_PRINT 10000
+
+static int check_results(TEST_OPTS *, uint64_t *);
+static void check_values(WT_CURSOR *, int, int, int, char *);
+static int create_big_string(char **);
+static void cursor_count_items(WT_CURSOR *, uint64_t *);
+static void disable_failures(void);
+static void enable_failures(uint64_t, uint64_t);
+static void generate_key(uint64_t, int *);
+static void generate_value(uint32_t, uint64_t, char *, int *, int *, int *,
+ char **);
+static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool,
+ uint64_t *);
+static void run_check_subtest_range(TEST_OPTS *, const char *, bool);
+static int run_process(TEST_OPTS *, const char *, char *[], int *);
+static int subtest_main(int, char *[], bool);
+static void subtest_populate(TEST_OPTS *, bool);
+int main(int, char *[]);
+
+extern int __wt_optind;
+
+#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so"
+
+/*
+ * check_results --
+ * Check all the tables and verify the results.
+ *
+ * Opens opts->home with logging enabled, walks "table:subtest" and
+ * "table:subtest2" in lockstep, regenerates the expected key and values
+ * for each record and compares them against the main table and its three
+ * indices. The number of records found is stored in *foundp. The
+ * connection is closed and opts->conn reset to NULL before returning.
+ * Returns 0; every check is made through testutil_check/testutil_assert.
+ */
+static int
+check_results(TEST_OPTS *opts, uint64_t *foundp)
+{
+ WT_CURSOR *maincur, *maincur2, *v0cur, *v1cur, *v2cur;
+ WT_SESSION *session;
+ uint64_t count, idxcount, nrecords;
+ uint32_t rndint;
+ int key, key_got, ret, v0, v1, v2;
+ char *bigref, *big;
+
+ testutil_check(create_big_string(&bigref));
+ nrecords = opts->nrecords;
+ testutil_check(wiredtiger_open(opts->home, NULL,
+ "create,log=(enabled)", &opts->conn));
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &session));
+
+ testutil_check(session->open_cursor(session, "table:subtest", NULL,
+ NULL, &maincur));
+ testutil_check(session->open_cursor(session, "table:subtest2", NULL,
+ NULL, &maincur2));
+ testutil_check(session->open_cursor(session, "index:subtest:v0", NULL,
+ NULL, &v0cur));
+ testutil_check(session->open_cursor(session, "index:subtest:v1", NULL,
+ NULL, &v1cur));
+ testutil_check(session->open_cursor(session, "index:subtest:v2", NULL,
+ NULL, &v2cur));
+
+ count = 0;
+ while ((ret = maincur->next(maincur)) == 0) {
+ /*
+ * The second table must have a matching record; it stores the
+ * random integer needed to regenerate the expected big string.
+ */
+ testutil_check(maincur2->next(maincur2));
+ testutil_check(maincur2->get_key(maincur2, &key_got));
+ testutil_check(maincur2->get_value(maincur2, &rndint));
+
+ /* The record number determines the expected key and values. */
+ generate_key(count, &key);
+ generate_value(rndint, count, bigref, &v0, &v1, &v2, &big);
+ testutil_assert(key == key_got);
+
+ /* Check the key/values in main table. */
+ testutil_check(maincur->get_key(maincur, &key_got));
+ testutil_assert(key == key_got);
+ check_values(maincur, v0, v1, v2, big);
+
+ /* Check the values in the indices. */
+ v0cur->set_key(v0cur, v0);
+ testutil_check(v0cur->search(v0cur));
+ check_values(v0cur, v0, v1, v2, big);
+ v1cur->set_key(v1cur, v1);
+ testutil_check(v1cur->search(v1cur));
+ check_values(v1cur, v0, v1, v2, big);
+ v2cur->set_key(v2cur, v2);
+ testutil_check(v2cur->search(v2cur));
+ check_values(v2cur, v0, v1, v2, big);
+
+ count++;
+ if (count % VERBOSE_PRINT == 0 && opts->verbose)
+ printf("checked %" PRIu64 "/%" PRIu64 "\n", count,
+ nrecords);
+ }
+ /* Print the final count unless the loop just printed it. */
+ if (count % VERBOSE_PRINT != 0 && opts->verbose)
+ printf("checked %" PRIu64 "/%" PRIu64 "\n", count, nrecords);
+
+ /*
+ * Always expect at least one entry, as populate does a
+ * checkpoint after the first insert.
+ */
+ testutil_assert(count > 0);
+ testutil_assert(ret == WT_NOTFOUND);
+ /* The second table must not hold more records than the first. */
+ testutil_assert(maincur2->next(maincur2) == WT_NOTFOUND);
+ /* Each index must hold exactly one entry per main table record. */
+ cursor_count_items(v0cur, &idxcount);
+ testutil_assert(count == idxcount);
+ cursor_count_items(v1cur, &idxcount);
+ testutil_assert(count == idxcount);
+ cursor_count_items(v2cur, &idxcount);
+ testutil_assert(count == idxcount);
+
+ testutil_check(opts->conn->close(opts->conn, NULL));
+ opts->conn = NULL;
+
+ free(bigref);
+ *foundp = count;
+ return (0);
+}
+
+/*
+ * check_values --
+ * Check that the values in the cursor match the given values.
+ *
+ * Reads three integers and a string from the cursor's current value
+ * and asserts each one equals the corresponding expected argument.
+ */
+static void
+check_values(WT_CURSOR *cursor, int v0, int v1, int v2, char *big)
+{
+ int v0_got, v1_got, v2_got;
+ char *big_got;
+
+ testutil_check(cursor->get_value(cursor, &v0_got, &v1_got, &v2_got,
+ &big_got));
+ testutil_assert(v0 == v0_got);
+ testutil_assert(v1 == v1_got);
+ testutil_assert(v2 == v2_got);
+ testutil_assert(strcmp(big, big_got) == 0);
+}
+
+/*
+ * create_big_string --
+ *	Allocate the "reference" big string and fill it by repeating
+ *	BIG_CONTENTS until BIG_SIZE characters have been written. On success
+ *	the NUL-terminated buffer is stored in *bigp and 0 is returned; the
+ *	caller owns the buffer. Returns ENOMEM, leaving *bigp untouched, if
+ *	the allocation fails.
+ */
+static int
+create_big_string(char **bigp)
+{
+	static const char pattern[] = BIG_CONTENTS;
+	size_t patlen, pos;
+	char *buf;
+
+	buf = malloc(BIG_SIZE + 1);
+	if (buf == NULL)
+		return (ENOMEM);
+
+	/* Tile the pattern across the buffer, wrapping at the pattern's end. */
+	patlen = sizeof(pattern) - 1;
+	for (pos = 0; pos < BIG_SIZE; ++pos)
+		buf[pos] = pattern[pos % patlen];
+	buf[BIG_SIZE] = '\0';
+
+	*bigp = buf;
+	return (0);
+}
+
+/*
+ * cursor_count_items --
+ * Count the number of items in the table by traversing
+ * through the cursor.
+ *
+ * The cursor is reset first, so the count always starts from the
+ * beginning; the result is stored in *countp. The traversal must end
+ * with WT_NOTFOUND, any other return fails the assertion.
+ */
+static void
+cursor_count_items(WT_CURSOR *cursor, uint64_t *countp)
+{
+ int ret;
+
+ *countp = 0;
+
+ testutil_check(cursor->reset(cursor));
+ while ((ret = cursor->next(cursor)) == 0)
+ (*countp)++;
+ testutil_assert(ret == WT_NOTFOUND);
+}
+
+/*
+ * disable_failures --
+ * Disable failures in the fail file system.
+ *
+ * Sets the WT_FAIL_FS_ENABLE environment variable to "0", overwriting
+ * any existing value.
+ */
+static void
+disable_failures(void)
+{
+ testutil_check(setenv("WT_FAIL_FS_ENABLE", "0", 1));
+}
+
+/*
+ * enable_failures --
+ * Enable failures in the fail file system.
+ *
+ * Sets WT_FAIL_FS_ENABLE to "1" and publishes allow_writes and
+ * allow_reads, formatted as decimal strings, in WT_FAIL_FS_WRITE_ALLOW
+ * and WT_FAIL_FS_READ_ALLOW. Presumably these are the number of
+ * operations fail_fs lets through before failing; confirm against the
+ * fail_fs extension.
+ */
+static void
+enable_failures(uint64_t allow_writes, uint64_t allow_reads)
+{
+ char value[100];
+
+ testutil_check(setenv("WT_FAIL_FS_ENABLE", "1", 1));
+ snprintf(value, sizeof(value), "%" PRIu64, allow_writes);
+ testutil_check(setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1));
+ /* The buffer is reused: setenv is called before it is overwritten. */
+ snprintf(value, sizeof(value), "%" PRIu64, allow_reads);
+ testutil_check(setenv("WT_FAIL_FS_READ_ALLOW", value, 1));
+}
+
+/*
+ * generate_key --
+ *	Generate the key used by the "subtest" and "subtest2" tables for
+ *	record number i: the record number itself, converted to an int and
+ *	stored in *keyp.
+ */
+static void
+generate_key(uint64_t i, int *keyp)
+{
+	int key;
+
+	key = (int)i;
+	*keyp = key;
+}
+
+/*
+ * generate_value --
+ *	Generate values for the "subtest" table for record number i. The
+ *	three integer values are fixed multiples of i; the string value is a
+ *	suffix of bigref starting at an offset selected by rndint. *bigp
+ *	points into bigref, it is not a copy.
+ */
+static void
+generate_value(uint32_t rndint, uint64_t i, char *bigref,
+    int *v0p, int *v1p, int *v2p, char **bigp)
+{
+	size_t offset;
+
+	/* Start somewhere inside the reference string, chosen by rndint. */
+	offset = rndint % BIG_SIZE;
+	*bigp = bigref + offset;
+
+	/* The integer columns are multiples of the record number. */
+	*v0p = (int)(i * 7);
+	*v1p = (int)(i * 10007);
+	*v2p = (int)(i * 100000007);
+}
+
+/*
+ * run_check_subtest --
+ *	Run the subtest with the given parameters and check the results.
+ *
+ *	Builds the argument vector for a child process (optionally run under
+ *	the given debugger), cleans the work directory, runs the child with
+ *	"nops" operations allowed until failure, and then verifies the
+ *	resulting home directory in this process. The number of records found
+ *	by the verification is stored in *nresultsp. "close_test" selects the
+ *	"subtest_close" variant of the child instead of "subtest".
+ */
+static void
+run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops,
+    bool close_test, uint64_t *nresultsp)
+{
+	int estatus, narg;
+	/*
+	 * A 64-bit value needs up to 20 decimal digits plus a terminating
+	 * NUL; leave headroom so snprintf never truncates silently.
+	 */
+	char rarg[32], sarg[32], *subtest_args[MAX_ARGS];
+
+	narg = 0;
+	if (debugger != NULL) {
+		subtest_args[narg++] = (char *)debugger;
+		subtest_args[narg++] = (char *)"--";
+	}
+
+	subtest_args[narg++] = (char *)opts->progname;
+	/* "subtest" must appear before arguments */
+	if (close_test)
+		subtest_args[narg++] = (char *)"subtest_close";
+	else
+		subtest_args[narg++] = (char *)"subtest";
+	subtest_args[narg++] = (char *)"-h";
+	subtest_args[narg++] = opts->home;
+	subtest_args[narg++] = (char *)"-v";	/* subtest is always verbose */
+	subtest_args[narg++] = (char *)"-p";
+	subtest_args[narg++] = (char *)"-o";
+	snprintf(sarg, sizeof(sarg), "%" PRIu64, nops);
+	subtest_args[narg++] = sarg;		/* number of operations */
+	subtest_args[narg++] = (char *)"-n";
+	snprintf(rarg, sizeof(rarg), "%" PRIu64, opts->nrecords);
+	subtest_args[narg++] = rarg;		/* number of records */
+	subtest_args[narg++] = NULL;
+	testutil_assert(narg <= MAX_ARGS);
+	if (opts->verbose)
+		printf("running a separate process with %" PRIu64
+		    " operations until fail...\n", nops);
+	testutil_clean_work_dir(opts->home);
+	testutil_check(run_process(
+	    opts, debugger != NULL ? debugger : opts->progname,
+	    subtest_args, &estatus));
+	if (opts->verbose)
+		printf("process exited %d\n", estatus);
+
+	/*
+	 * Verify results in parent process.
+	 */
+	testutil_check(check_results(opts, nresultsp));
+}
+
+/*
+ * run_check_subtest_range --
+ *
+ * Run successive tests via binary search that determines the approximate
+ * crossover point between when data is recoverable or not. Once that is
+ * determined, run the subtest in a range near that crossover point.
+ *
+ * The theory is that running at the crossover point will tend to trigger
+ * "interesting" failures at the borderline when the checkpoint is about to,
+ * or has, succeeded. If any of those failures creates a WT home directory
+ * that cannot be recovered, the top level test will fail.
+ *
+ * The baseline ("cutoff") is the number of records found after a run with a
+ * single operation allowed; a run is classified by whether it recovers more
+ * records than that baseline.
+ */
+static void
+run_check_subtest_range(TEST_OPTS *opts, const char *debugger, bool close_test)
+{
+	uint64_t cutoff, first, high, last, low, mid, nops, nresults;
+	int i;
+	bool got_failure, got_success;
+
+	if (opts->verbose)
+		printf("Determining best range of operations until failure, "
+		    "with close_test %s.\n",
+		    (close_test ? "enabled" : "disabled"));
+
+	run_check_subtest(opts, debugger, 1, close_test, &cutoff);
+	low = 0;
+	high = MAX_OP_RANGE;
+	mid = (low + high) / 2;
+	while (mid != low) {
+		run_check_subtest(opts, debugger, mid, close_test,
+		    &nresults);
+		if (nresults > cutoff)
+			high = mid;
+		else
+			low = mid;
+		mid = (low + high) / 2;
+	}
+	/*
+	 * mid is the number of ops that is the crossover point.
+	 * Run some tests near that point to try to trigger weird
+	 * failures.  If mid is too low or too high, it indicates
+	 * there is a fundamental problem with the test.
+	 */
+	testutil_assert(mid > 1 && mid < MAX_OP_RANGE - 1);
+	if (opts->verbose)
+		printf("Retesting around %" PRIu64 " operations.\n",
+		    mid);
+
+	/*
+	 * Clamp the start of the retest window: the counters are unsigned, so
+	 * "mid - 10" would wrap to a huge value when mid is less than 10 and
+	 * the loop below would never execute.
+	 */
+	first = mid >= 10 ? mid - 10 : 0;
+	last = mid + 10;
+
+	got_failure = false;
+	got_success = false;
+	for (nops = first; nops < last; nops++) {
+		for (i = 0; i < TESTS_PER_OP_VALUE; i++) {
+			run_check_subtest(opts, debugger, nops,
+			    close_test, &nresults);
+			if (nresults > cutoff)
+				got_failure = true;
+			else
+				got_success = true;
+		}
+	}
+	/*
+	 * Check that it really ran with a crossover point.
+	 */
+	testutil_assert(got_failure);
+	testutil_assert(got_success);
+}
+
+/*
+ * run_process --
+ *	Run a program with arguments, wait until it completes.
+ *
+ *	"prog" is the path passed to execv and "argv" the NULL-terminated
+ *	argument vector. On success 0 is returned and the child's wait status
+ *	is stored in *status. If fork or waitpid fails, the errno value is
+ *	returned and *status is not meaningful. If execv fails, the child
+ *	reports the error through testutil_die.
+ */
+static int
+run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status)
+{
+	pid_t pid;
+	char **arg;
+
+	if (opts->verbose) {
+		printf("running: ");
+		for (arg = argv; *arg != NULL; arg++)
+			printf("%s ", *arg);
+		printf("\n");
+	}
+	if ((pid = fork()) == 0) {
+		(void)execv(prog, argv);
+		testutil_die(errno, "%s", prog);
+	} else if (pid < 0)
+		return (errno);
+
+	/*
+	 * Retry when interrupted by a signal; report any other failure rather
+	 * than returning success with an unset status.
+	 */
+	while (waitpid(pid, status, 0) < 0)
+		if (errno != EINTR)
+			return (errno);
+	return (0);
+}
+
+/*
+ * subtest_main --
+ *	The main program for the subtest.
+ *
+ *	Disables core files, parses the options, creates the work directory,
+ *	appends stderr and stdout to files in that directory, opens a
+ *	connection that loads the fail_fs extension, creates the two tables
+ *	and three indices, and then populates them. "close_test" is passed
+ *	through to subtest_populate. Returns 0; checks are made through the
+ *	testutil macros.
+ */
+static int
+subtest_main(int argc, char *argv[], bool close_test)
+{
+	TEST_OPTS *opts, _opts;
+	WT_SESSION *session;
+	char config[1024], filename[1024];
+	struct rlimit rlim;
+
+	if (testutil_disable_long_tests())
+		return (0);
+	opts = &_opts;
+	memset(opts, 0, sizeof(*opts));
+	memset(&rlim, 0, sizeof(rlim));
+
+	/* No core files during fault injection tests. */
+	testutil_check(setrlimit(RLIMIT_CORE, &rlim));
+	testutil_check(testutil_parse_opts(argc, argv, opts));
+	testutil_make_work_dir(opts->home);
+
+	/*
+	 * Redirect stderr, stdout. The home directory comes from the command
+	 * line, so bound the writes into the fixed-size path buffer.
+	 */
+	snprintf(filename, sizeof(filename),
+	    "%s/%s", opts->home, STDERR_FILE);
+	testutil_assert(freopen(filename, "a", stderr) != NULL);
+	snprintf(filename, sizeof(filename),
+	    "%s/%s", opts->home, STDOUT_FILE);
+	testutil_assert(freopen(filename, "a", stdout) != NULL);
+
+	/* The extensions list is opened with '(' and must be closed by ')'. */
+	snprintf(config, sizeof(config),
+	    "create,cache_size=250M,log=(enabled),"
+	    "transaction_sync=(enabled,method=none),extensions=("
+	    WT_FAIL_FS_LIB
+	    "=(early_load,config={environment=true,verbose=true}))");
+
+	testutil_check(wiredtiger_open(opts->home, NULL, config, &opts->conn));
+	testutil_check(
+	    opts->conn->open_session(opts->conn, NULL, NULL, &session));
+
+	testutil_check(session->create(session, "table:subtest",
+	    "key_format=i,value_format=iiiS,"
+	    "columns=(id,v0,v1,v2,big)"));
+
+	testutil_check(session->create(session, "table:subtest2",
+	    "key_format=i,value_format=i"));
+
+	testutil_check(session->create(session, "index:subtest:v0",
+	    "columns=(v0)"));
+	testutil_check(session->create(session, "index:subtest:v1",
+	    "columns=(v1)"));
+	testutil_check(session->create(session, "index:subtest:v2",
+	    "columns=(v2)"));
+
+	testutil_check(session->close(session, NULL));
+
+	subtest_populate(opts, close_test);
+
+	testutil_cleanup(opts);
+
+	return (0);
+}
+
+/*
+ * This macro is used as a substitute for testutil_check, except that it is
+ * aware of when a failure may be expected due to the effects of the fail_fs.
+ * This macro is used only in subtest_populate(), it uses local variables.
+ */
+#define CHECK(expr) { \
+ int _ret; \
+ _ret = expr; \
+ if (_ret != 0) { \
+ if (!failmode || \
+ (_ret != WT_RUN_RECOVERY && _ret != EIO)) { \
+ fprintf(stderr, " BAD RETURN %d for \"%s\"\n", \
+ _ret, #expr); \
+ testutil_check(_ret); \
+ } else \
+ failed = true; \
+ } \
+}
+
+/*
+ * subtest_populate --
+ * Populate the tables.
+ *
+ * Inserts up to opts->nrecords records, one transaction per record,
+ * writing the same key to "table:subtest" and "table:subtest2". A
+ * checkpoint is forced after the first record, and a second checkpoint
+ * is attempted with fail_fs failures enabled once record number
+ * nrecords/100 has been inserted. If that checkpoint fails, the loop
+ * stops and the process either exits immediately or, when close_test
+ * is set, goes on to close its handles.
+ */
+static void
+subtest_populate(TEST_OPTS *opts, bool close_test)
+{
+ WT_CURSOR *maincur, *maincur2;
+ WT_RAND_STATE rnd;
+ WT_SESSION *session;
+ uint64_t i, nrecords;
+ uint32_t rndint;
+ int key, v0, v1, v2;
+ char *big, *bigref;
+ bool failed, failmode;
+
+ /* failmode and failed are read and written by the CHECK macro. */
+ failmode = failed = false;
+ __wt_random_init_seed(NULL, &rnd);
+ CHECK(create_big_string(&bigref));
+ nrecords = opts->nrecords;
+
+ CHECK(opts->conn->open_session(
+ opts->conn, NULL, NULL, &session));
+
+ CHECK(session->open_cursor(session, "table:subtest", NULL,
+ NULL, &maincur));
+
+ CHECK(session->open_cursor(session, "table:subtest2", NULL,
+ NULL, &maincur2));
+
+ for (i = 0; i < nrecords && !failed; i++) {
+ rndint = __wt_random(&rnd);
+ generate_key(i, &key);
+ generate_value(rndint, i, bigref, &v0, &v1, &v2, &big);
+ CHECK(session->begin_transaction(session, NULL));
+ maincur->set_key(maincur, key);
+ maincur->set_value(maincur, v0, v1, v2, big);
+ CHECK(maincur->insert(maincur));
+
+ /* Store rndint so the checker can regenerate the big string. */
+ maincur2->set_key(maincur2, key);
+ maincur2->set_value(maincur2, rndint);
+ CHECK(maincur2->insert(maincur2));
+ CHECK(session->commit_transaction(session, NULL));
+
+ if (i == 0)
+ /*
+ * Force an initial checkpoint, that helps to
+ * distinguish a clear failure from just not running
+ * long enough.
+ */
+ CHECK(session->checkpoint(session, NULL));
+
+ if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose)
+ printf(" %" PRIu64 "/%" PRIu64 "\n",
+ (i + 1), nrecords);
+ /* Attempt to isolate the failures to checkpointing. */
+ if (i == (nrecords/100)) {
+ enable_failures(opts->nops, 1000000);
+ failmode = true; /* CHECK should expect failures. */
+ CHECK(session->checkpoint(session, NULL));
+ failmode = false;
+ disable_failures();
+ if (failed && opts->verbose)
+ printf("checkpoint failed (expected).\n");
+ }
+ }
+
+ /*
+ * Closing handles after an extreme fail is likely to cause
+ * cascading failures (or crashes), so recommended practice is
+ * to immediately exit. We're interested in testing both with
+ * and without the recommended practice.
+ */
+ if (failed) {
+ if (!close_test) {
+ fprintf(stderr, "exit early.\n");
+ exit(0);
+ } else
+ fprintf(stderr, "closing after failure.\n");
+ }
+
+ /* failmode is false here, so any close error is treated as fatal. */
+ free(bigref);
+ CHECK(maincur->close(maincur));
+ CHECK(maincur2->close(maincur2));
+ CHECK(session->close(session, NULL));
+}
+
+/*
+ * main --
+ *	The main program for the test.  When invoked with "subtest"
+ *	argument, run the subtest.  Otherwise, run a separate process
+ *	for each needed subtest, and check the results.
+ *
+ *	Non-option arguments recognized: "subtest" and "subtest_close" run the
+ *	child side of the test, "gdb" runs the child under /usr/bin/gdb. With
+ *	no "-o N" option the test searches for the interesting range of
+ *	operation counts, both without and with closing handles after a
+ *	failure; with "-o N" a single subtest is run and checked.
+ */
+int
+main(int argc, char *argv[])
+{
+	TEST_OPTS *opts, _opts;
+	uint64_t nresults;
+	const char *debugger;
+
+	if (testutil_disable_long_tests())
+		return (0);
+	opts = &_opts;
+	memset(opts, 0, sizeof(*opts));
+	debugger = NULL;
+
+	testutil_check(testutil_parse_opts(argc, argv, opts));
+	argc -= __wt_optind;
+	argv += __wt_optind;
+	if (opts->nrecords == 0)
+		opts->nrecords = 50000;
+
+	while (argc > 0) {
+		if (strcmp(argv[0], "subtest") == 0)
+			return (subtest_main(argc, argv, false));
+		else if (strcmp(argv[0], "subtest_close") == 0)
+			return (subtest_main(argc, argv, true));
+		else if (strcmp(argv[0], "gdb") == 0)
+			debugger = "/usr/bin/gdb";
+		else
+			testutil_assert(false);
+		argc--;
+		argv++;
+	}
+	if (opts->verbose) {
+		printf("Number of operations until failure: %" PRIu64
+		    "  (change with -o N)\n", opts->nops);
+		printf("Number of records: %" PRIu64
+		    "  (change with -n N)\n", opts->nrecords);
+	}
+	if (opts->nops == 0) {
+		run_check_subtest_range(opts, debugger, false);
+		run_check_subtest_range(opts, debugger, true);
+	} else {
+		/*
+		 * The fourth argument is the boolean close_test flag, not a
+		 * record count (the child gets the record count from opts).
+		 * NOTE(review): the record count used to be passed in this
+		 * position and, being non-zero, always converted to true;
+		 * confirm whether a single run should close its handles after
+		 * a failure or exit early.
+		 */
+		run_check_subtest(opts, debugger, opts->nops,
+		    true, &nresults);
+	}
+
+	testutil_clean_work_dir(opts->home);
+	testutil_cleanup(opts);
+
+	return (0);
+}
diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c
new file mode 100644
index 00000000000..09dce624066
--- /dev/null
+++ b/test/csuite/wt3120_filesys/main.c
@@ -0,0 +1,99 @@
+/*-
+ * Public Domain 2014-2017 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+/*
+ * JIRA ticket reference: WT-3120
+ * Test case description: A simple file system extension built into
+ * a shared library.
+ * Failure mode: Loading the file system and closing the connection
+ * is enough to evoke the failure. This test does slightly more
+ * than that.
+ */
+
+#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so"
+
+/*
+ * main --
+ * Open a connection with the fail_fs extension loaded early, insert
+ * two records, close the connection, re-open it with no configuration
+ * and verify that exactly those two records are read back in order.
+ */
+int
+main(int argc, char *argv[])
+{
+ TEST_OPTS *opts, _opts;
+ WT_CURSOR *cursor;
+ WT_SESSION *session;
+ char *kstr, *vstr;
+ char buf[1024];
+
+ opts = &_opts;
+ memset(opts, 0, sizeof(*opts));
+ testutil_check(testutil_parse_opts(argc, argv, opts));
+ testutil_make_work_dir(opts->home);
+
+ /* Load the file system extension as part of opening the connection. */
+ snprintf(buf, sizeof(buf),
+ "create,extensions=(" WT_FAIL_FS_LIB "=(early_load=true))");
+ testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn));
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &session));
+ testutil_check(session->create(session, opts->uri,
+ "key_format=S,value_format=S"));
+
+ /* Insert ("a", "0") and ("b", "1"). */
+ testutil_check(session->open_cursor(session, opts->uri, NULL, NULL,
+ &cursor));
+ cursor->set_key(cursor, "a");
+ cursor->set_value(cursor, "0");
+ testutil_check(cursor->insert(cursor));
+ cursor->set_key(cursor, "b");
+ cursor->set_value(cursor, "1");
+ testutil_check(cursor->insert(cursor));
+ testutil_check(cursor->close(cursor));
+ testutil_check(session->close(session, NULL));
+
+ /* Force to disk and re-open. */
+ testutil_check(opts->conn->close(opts->conn, NULL));
+ testutil_check(wiredtiger_open(opts->home, NULL, NULL, &opts->conn));
+
+ /* Read the records back and check both contents and ordering. */
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &session));
+ testutil_check(session->open_cursor(session, opts->uri, NULL, NULL,
+ &cursor));
+ testutil_check(cursor->next(cursor));
+ testutil_check(cursor->get_key(cursor, &kstr));
+ testutil_check(cursor->get_value(cursor, &vstr));
+ testutil_assert(strcmp(kstr, "a") == 0);
+ testutil_assert(strcmp(vstr, "0") == 0);
+ testutil_check(cursor->next(cursor));
+ testutil_check(cursor->get_key(cursor, &kstr));
+ testutil_check(cursor->get_value(cursor, &vstr));
+ testutil_assert(strcmp(kstr, "b") == 0);
+ testutil_assert(strcmp(vstr, "1") == 0);
+ /* There must be no third record. */
+ testutil_assert(cursor->next(cursor) == WT_NOTFOUND);
+ testutil_check(cursor->close(cursor));
+ testutil_check(session->close(session, NULL));
+ printf("Success\n");
+
+ testutil_cleanup(opts);
+ return (EXIT_SUCCESS);
+}
diff --git a/test/csuite/wt3135_search_near_collator/main.c b/test/csuite/wt3135_search_near_collator/main.c
new file mode 100644
index 00000000000..8783034a7d8
--- /dev/null
+++ b/test/csuite/wt3135_search_near_collator/main.c
@@ -0,0 +1,360 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+/*
+ * JIRA ticket reference: WT-3135
+ * Test case description: Each set of data is ordered and contains
+ * five elements (0-4). We insert elements 1 and 3, and then do
+ * search_near and search for each element. For each set of data, we perform
+ * these tests first using a custom collator, and second using a custom collator
+ * and extractor. In each case there are index keys having variable length.
+ * Failure mode: In the reported test case, the custom compare routine is
+ * given a truncated key to compare, and the unpack functions return errors
+ * because the truncation appeared in the middle of a key.
+ */
+
+/* Number of strings in each test set. */
+#define TEST_ENTRY_COUNT 5
+/* One test set: TEST_ENTRY_COUNT strings, indexed 0 through 4. */
+typedef const char *TEST_SET[TEST_ENTRY_COUNT];
+/*
+ * The data sets to test. The search functions insert-test elements 1 and 3
+ * of a set and expect element 0 to sort before element 1, element 2 between
+ * elements 1 and 3, and element 4 after element 3.
+ */
+static TEST_SET test_sets[] = {
+ { "0", "01", "012", "0123", "01234" },
+ { "A", "B", "C", "D", "E" },
+ { "5", "54", "543", "5432", "54321" },
+ { "54321", "5433", "544", "55", "6" }
+};
+/* Number of test sets in the test_sets array. */
+#define TEST_SET_COUNT (sizeof(test_sets) / sizeof(test_sets[0]))
+
+/*
+ * item_str_equal --
+ *	Return true if the item holds exactly the given string, including its
+ *	terminating NUL byte.
+ */
+static bool
+item_str_equal(WT_ITEM *item, const char *str)
+{
+	size_t len;
+
+	/* The item must be the string's length plus one for the NUL. */
+	len = strlen(str) + 1;
+	if (item->size != len)
+		return (false);
+	return (memcmp(item->data, str, len) == 0);
+}
+
+/*
+ * compare_int --
+ *	Three-way comparison of two integers: returns -1 if a is less than b,
+ *	1 if a is greater than b, and 0 if they are equal.
+ */
+static int
+compare_int(int64_t a, int64_t b)
+{
+	if (a < b)
+		return (-1);
+	if (a > b)
+		return (1);
+	return (0);
+}
+
+/*
+ * index_compare_primary --
+ *	Compare the next integer in each of two pack streams and store the
+ *	result in *cmp. If both integers unpack, they are compared
+ *	numerically; if neither unpacks, the result is 0; if only one
+ *	unpacks, the stream that failed sorts first. Always returns 0.
+ */
+static int
+index_compare_primary(WT_PACK_STREAM *s1, WT_PACK_STREAM *s2, int *cmp)
+{
+	int64_t left, right;
+	bool have_left, have_right;
+
+	/* Always attempt both unpacks, even if the first one fails. */
+	have_left = wiredtiger_unpack_int(s1, &left) == 0;
+	have_right = wiredtiger_unpack_int(s2, &right) == 0;
+
+	if (have_left && have_right)
+		*cmp = compare_int(left, right);
+	else if (have_left == have_right)
+		*cmp = 0;			/* neither unpack succeeded */
+	else
+		*cmp = have_left ? 1 : -1;	/* the failed side sorts first */
+	return (0);
+}
+
+/*
+ * index_compare_S --
+ * Collator comparison function for keys packed with format "Si": the
+ * string parts are compared with strcmp and, when they are equal, the
+ * integers that follow are compared. The result is stored in *cmp and
+ * 0 is returned.
+ */
+static int
+index_compare_S(WT_COLLATOR *collator, WT_SESSION *session,
+ const WT_ITEM *key1, const WT_ITEM *key2, int *cmp)
+{
+ WT_PACK_STREAM *s1, *s2;
+ const char *skey1, *skey2;
+
+ (void)collator;
+
+ testutil_check(wiredtiger_unpack_start(session, "Si", key1->data,
+ key1->size, &s1));
+ testutil_check(wiredtiger_unpack_start(session, "Si", key2->data,
+ key2->size, &s2));
+
+ testutil_check(wiredtiger_unpack_str(s1, &skey1));
+ testutil_check(wiredtiger_unpack_str(s2, &skey2));
+
+ /* Only when the strings tie does the trailing integer decide. */
+ if ((*cmp = strcmp(skey1, skey2)) == 0)
+ testutil_check(index_compare_primary(s1, s2, cmp));
+
+ testutil_check(wiredtiger_pack_close(s1, NULL));
+ testutil_check(wiredtiger_pack_close(s2, NULL));
+
+ return (0);
+}
+
+/*
+ * index_compare_u --
+ *	Collator comparison function for keys packed with format "ui": the
+ *	raw item parts are compared bytewise and, when they are equal, the
+ *	integers that follow are compared. The result is stored in *cmp and 0
+ *	is returned.
+ */
+static int
+index_compare_u(WT_COLLATOR *collator, WT_SESSION *session,
+    const WT_ITEM *key1, const WT_ITEM *key2, int *cmp)
+{
+	WT_ITEM skey1, skey2;
+	WT_PACK_STREAM *s1, *s2;
+	size_t len;
+
+	(void)collator;
+
+	testutil_check(wiredtiger_unpack_start(session, "ui", key1->data,
+	    key1->size, &s1));
+	testutil_check(wiredtiger_unpack_start(session, "ui", key2->data,
+	    key2->size, &s2));
+
+	testutil_check(wiredtiger_unpack_item(s1, &skey1));
+	testutil_check(wiredtiger_unpack_item(s2, &skey2));
+
+	/*
+	 * Raw items carry an explicit size and need not be NUL-terminated, so
+	 * compare the bytes over the common length and let the shorter item
+	 * sort first on a tie, instead of relying on strcmp. For items that
+	 * are NUL-terminated strings stored with their NUL, this gives the
+	 * same ordering as strcmp.
+	 */
+	len = skey1.size < skey2.size ? skey1.size : skey2.size;
+	*cmp = len == 0 ? 0 : memcmp(skey1.data, skey2.data, len);
+	if (*cmp == 0 && skey1.size != skey2.size)
+		*cmp = skey1.size < skey2.size ? -1 : 1;
+
+	/* Only when the items tie does the trailing integer decide. */
+	if (*cmp == 0)
+		testutil_check(index_compare_primary(s1, s2, cmp));
+
+	testutil_check(wiredtiger_pack_close(s1, NULL));
+	testutil_check(wiredtiger_pack_close(s2, NULL));
+
+	return (0);
+}
+
+/*
+ * index_extractor_u --
+ *	Extractor callback that uses the record's entire value as the index
+ *	key: the value is set as the result cursor's key and inserted. Returns
+ *	the result of the insert.
+ */
+static int
+index_extractor_u(WT_EXTRACTOR *extractor, WT_SESSION *session,
+    const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor)
+{
+	int ret;
+
+	/* Only the value and the result cursor are used. */
+	(void)extractor;
+	(void)session;
+	(void)key;
+
+	result_cursor->set_key(result_cursor, value);
+	ret = result_cursor->insert(result_cursor);
+	return (ret);
+}
+
+/*
+ * Collator and extractor instances: each supplies only its first callback
+ * (the compare or extract function above), the remaining members are NULL.
+ */
+static WT_COLLATOR collator_S = { index_compare_S, NULL, NULL };
+static WT_COLLATOR collator_u = { index_compare_u, NULL, NULL };
+static WT_EXTRACTOR extractor_u = { index_extractor_u, NULL, NULL };
+
+/*
+ * Check search() and search_near() using the test string indicated
+ * by test_index.
+ *
+ * The expectations assume only elements 1 and 3 of the set are present:
+ * search_near for element 0 must land on element 1 (exact > 0), for
+ * element 4 on element 3 (exact < 0), for element 2 on either neighbor,
+ * and for elements 1 and 3 on an exact match. search must succeed only
+ * for elements 1 and 3.
+ */
+static void
+search_using_str(WT_CURSOR *cursor, TEST_SET test_set, int test_index)
+{
+ int exact, ret;
+ const char *result;
+ const char *str_01, *str_0123, *test_str;
+
+ testutil_assert(test_index >= 0 && test_index <= 4);
+ /* The variable names follow the first test set ("01" and "0123"). */
+ str_01 = test_set[1];
+ str_0123 = test_set[3];
+ test_str = test_set[test_index];
+
+ cursor->set_key(cursor, test_str);
+ testutil_check(cursor->search_near(cursor, &exact));
+ testutil_check(cursor->get_key(cursor, &result));
+
+ if (test_index == 0)
+ testutil_assert(strcmp(result, str_01) == 0 && exact > 0);
+ else if (test_index == 1)
+ testutil_assert(strcmp(result, str_01) == 0 && exact == 0);
+ else if (test_index == 2)
+ testutil_assert((strcmp(result, str_0123) == 0 && exact > 0) ||
+ (strcmp(result, str_01) == 0 && exact < 0));
+ else if (test_index == 3)
+ testutil_assert(strcmp(result, str_0123) == 0 && exact == 0);
+ else if (test_index == 4)
+ testutil_assert(strcmp(result, str_0123) == 0 && exact < 0);
+
+ /* An exact search must find only the elements that are present. */
+ cursor->set_key(cursor, test_str);
+ ret = cursor->search(cursor);
+
+ if (test_index == 0 || test_index == 2 || test_index == 4)
+ testutil_assert(ret == WT_NOTFOUND);
+ else if (test_index == 1 || test_index == 3)
+ testutil_assert(ret == 0);
+}
+
+/*
+ * Check search() and search_near() using the test string indicated
+ * by test_index against a table containing a variable sized item.
+ *
+ * The key is passed as a WT_ITEM whose size includes the string's
+ * terminating NUL. The expectations are the same as for the string
+ * variant: only elements 1 and 3 of the set are assumed to be present.
+ */
+static void
+search_using_item(WT_CURSOR *cursor, TEST_SET test_set, int test_index)
+{
+ WT_ITEM item;
+ size_t testlen;
+ int exact, ret;
+ const char *str_01, *str_0123, *test_str;
+
+ testutil_assert(test_index >= 0 && test_index <= 4);
+ /* The variable names follow the first test set ("01" and "0123"). */
+ str_01 = test_set[1];
+ str_0123 = test_set[3];
+ test_str = test_set[test_index];
+
+ testlen = strlen(test_str) + 1;
+ item.data = test_str;
+ item.size = testlen;
+ cursor->set_key(cursor, &item);
+ testutil_check(cursor->search_near(cursor, &exact));
+ testutil_check(cursor->get_key(cursor, &item));
+
+ if (test_index == 0)
+ testutil_assert(item_str_equal(&item, str_01) && exact > 0);
+ else if (test_index == 1)
+ testutil_assert(item_str_equal(&item, str_01) && exact == 0);
+ else if (test_index == 2)
+ testutil_assert((item_str_equal(&item, str_0123) && exact > 0)
+ || (item_str_equal(&item, str_01) && exact < 0));
+ else if (test_index == 3)
+ testutil_assert(item_str_equal(&item, str_0123) && exact == 0);
+ else if (test_index == 4)
+ testutil_assert(item_str_equal(&item, str_0123) && exact < 0);
+
+ /* get_key overwrote the item, so rebuild the search key. */
+ item.data = test_str;
+ item.size = testlen;
+ cursor->set_key(cursor, &item);
+ ret = cursor->search(cursor);
+
+ if (test_index == 0 || test_index == 2 || test_index == 4)
+ testutil_assert(ret == WT_NOTFOUND);
+ else if (test_index == 1 || test_index == 3)
+ testutil_assert(ret == 0);
+}
+
+/*
+ * For each set of data, perform tests.
+ *
+ * Part 1 creates table:main with two indices on the string value (one with
+ * no collator configured, one using collator_S), inserts set[1] and set[3],
+ * and runs search_using_str for every entry of the set on both indices.
+ * Part 2 creates table:main2 with an index configured with collator_u and
+ * extractor_u, inserts the same two elements as raw items, and runs
+ * search_using_item for every entry. Both tables are dropped on the way out.
+ */
+static void
+test_one_set(WT_SESSION *session, TEST_SET set)
+{
+ WT_CURSOR *cursor;
+ WT_ITEM item;
+ int32_t i;
+
+ /*
+ * Part 1: Using an index without a collator and an index with a
+ * custom collator, insert some elements and verify results from
+ * search_near and search.
+ */
+
+ testutil_check(session->create(session,
+ "table:main", "key_format=i,value_format=S,columns=(k,v)"));
+ testutil_check(session->create(session,
+ "index:main:def_collator", "columns=(v)"));
+ testutil_check(session->create(session,
+ "index:main:custom_collator",
+ "columns=(v),collator=collator_S"));
+
+ /* Insert only elements #1 and #3. */
+ testutil_check(session->open_cursor(session,
+ "table:main", NULL, NULL, &cursor));
+ cursor->set_key(cursor, 0);
+ cursor->set_value(cursor, set[1]);
+ testutil_check(cursor->insert(cursor));
+ cursor->set_key(cursor, 1);
+ cursor->set_value(cursor, set[3]);
+ testutil_check(cursor->insert(cursor));
+ testutil_check(cursor->close(cursor));
+
+ /* Check all elements in def_collator index. */
+ testutil_check(session->open_cursor(session,
+ "index:main:def_collator", NULL, NULL, &cursor));
+ for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++)
+ search_using_str(cursor, set, i);
+ testutil_check(cursor->close(cursor));
+
+ /* Check all elements in custom_collator index. */
+ testutil_check(session->open_cursor(session,
+ "index:main:custom_collator", NULL, NULL, &cursor));
+ for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++)
+ search_using_str(cursor, set, i);
+ testutil_check(cursor->close(cursor));
+
+ /*
+ * Part 2: perform the same checks using a custom collator and
+ * extractor.
+ */
+ testutil_check(session->create(session,
+ "table:main2", "key_format=i,value_format=u,columns=(k,v)"));
+
+ testutil_check(session->create(session, "index:main2:idx_w_coll",
+ "key_format=u,collator=collator_u,extractor=extractor_u"));
+
+ testutil_check(session->open_cursor(session,
+ "table:main2", NULL, NULL, &cursor));
+
+ /*
+ * Insert elements #1 and #3 as raw items; each item's size counts the
+ * string's terminating NUL byte.
+ */
+ memset(&item, 0, sizeof(item));
+ item.size = strlen(set[1]) + 1;
+ item.data = set[1];
+ cursor->set_key(cursor, 1);
+ cursor->set_value(cursor, &item);
+ testutil_check(cursor->insert(cursor));
+
+ item.size = strlen(set[3]) + 1;
+ item.data = set[3];
+ cursor->set_key(cursor, 3);
+ cursor->set_value(cursor, &item);
+ testutil_check(cursor->insert(cursor));
+
+ testutil_check(cursor->close(cursor));
+
+ /* Check all elements in the idx_w_coll index. */
+ testutil_check(session->open_cursor(session,
+ "index:main2:idx_w_coll", NULL, NULL, &cursor));
+ for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++)
+ search_using_item(cursor, set, i);
+ testutil_check(cursor->close(cursor));
+
+ /*
+ * Drop both tables: this function is called once per test set and
+ * creates them again on every call.
+ */
+ testutil_check(session->drop(session, "table:main", NULL));
+ testutil_check(session->drop(session, "table:main2", NULL));
+}
+
+/*
+ * main --
+ *	Open a connection and session in the test work directory, register
+ *	the collators and extractor, and run test_one_set on every entry of
+ *	test_sets.
+ */
+int
+main(int argc, char *argv[])
+{
+ TEST_OPTS *opts, _opts;
+ WT_SESSION *session;
+ size_t i;
+
+ /* Parse the command line into zeroed options, then make the work dir. */
+ opts = &_opts;
+ memset(opts, 0, sizeof(*opts));
+ testutil_check(testutil_parse_opts(argc, argv, opts));
+ testutil_make_work_dir(opts->home);
+
+ testutil_check(wiredtiger_open(opts->home, NULL, "create",
+ &opts->conn));
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &session));
+
+ /* Add any collators and extractors used by tests */
+ testutil_check(opts->conn->add_collator(opts->conn, "collator_S",
+ &collator_S, NULL));
+ testutil_check(opts->conn->add_collator(opts->conn, "collator_u",
+ &collator_u, NULL));
+ testutil_check(opts->conn->add_extractor(opts->conn, "extractor_u",
+ &extractor_u, NULL));
+
+ /* Run the full battery of checks once per data set. */
+ for (i = 0; i < TEST_SET_COUNT; i++) {
+ printf("test set %" WT_SIZET_FMT "\n", i);
+ test_one_set(session, test_sets[i]);
+ }
+
+ testutil_check(session->close(session, NULL));
+ testutil_cleanup(opts);
+ return (EXIT_SUCCESS);
+}
diff --git a/test/csuite/wt3184_dup_index_collator/main.c b/test/csuite/wt3184_dup_index_collator/main.c
new file mode 100644
index 00000000000..bcefd2f1a3b
--- /dev/null
+++ b/test/csuite/wt3184_dup_index_collator/main.c
@@ -0,0 +1,168 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+/*
+ * JIRA ticket reference: WT-3184
+ * Test case description: Each set of data is ordered and contains
+ * five elements (0-4). We insert elements 1 and 3, and then do
+ * search_near and search for each element. For each set of data, we perform
+ * these tests first using a custom collator, and second using a custom collator
+ * and extractor. In each case there are index keys having variable length.
+ * Failure mode: In the reported test case, the custom compare routine is
+ * given a truncated key to compare, and the unpack functions return errors
+ * because the truncation appeared in the middle of a key.
+ */
+
+/*
+ * compare_int --
+ *	Three-way comparison of two 32-bit integers: returns -1 if a is
+ *	smaller than b, 1 if it is larger, and 0 if they are equal.
+ */
+static int
+compare_int(int32_t a, int32_t b)
+{
+	if (a < b)
+		return (-1);
+	if (a > b)
+		return (1);
+	return (0);
+}
+
+/*
+ * item_to_int --
+ *	Return the 32-bit integer stored in an item, asserting that the item
+ *	is exactly the size of an int32_t.
+ */
+static int32_t
+item_to_int(WT_ITEM *item)
+{
+	const int32_t *valuep;
+
+	testutil_assert(item->size == sizeof(int32_t));
+	valuep = item->data;
+	return (*valuep);
+}
+
+/*
+ * compare_int_items --
+ *	Three-way comparison of two items that each hold one int32_t; both
+ *	sizes are asserted before the values are read.
+ */
+static int
+compare_int_items(WT_ITEM *itema, WT_ITEM *itemb)
+{
+	int32_t lhs, rhs;
+
+	testutil_assert(itema->size == sizeof(int32_t));
+	testutil_assert(itemb->size == sizeof(int32_t));
+
+	lhs = item_to_int(itema);
+	rhs = item_to_int(itemb);
+	return (compare_int(lhs, rhs));
+}
+
+/*
+ * print_int_item --
+ *	Print a label followed by the item's int32_t value, or by "<empty>"
+ *	when the item has zero size.
+ */
+static void
+print_int_item(const char *str, const WT_ITEM *item)
+{
+	/* A zero-length item has no value to decode. */
+	if (item->size == 0) {
+		printf("%s<empty>", str);
+		return;
+	}
+
+	testutil_assert(item->size == sizeof(int32_t));
+	printf("%s%" PRId32, str, *(int32_t *)item->data);
+}
+
+/*
+ * index_compare --
+ *	Collator callback. Each key is unpacked as two raw items ("uu"): an
+ *	index key followed by a primary key. Keys are ordered by index key
+ *	first and by primary key second; when only one primary key is
+ *	non-empty, the key with the empty primary key sorts first.
+ */
+static int
+index_compare(WT_COLLATOR *collator, WT_SESSION *session,
+    const WT_ITEM *key1, const WT_ITEM *key2, int *cmp)
+{
+	WT_ITEM ikey1, ikey2, pkey1, pkey2;
+	int have_pkey1, have_pkey2;
+
+	(void)collator;
+	testutil_check(wiredtiger_struct_unpack(session,
+	    key1->data, key1->size, "uu", &ikey1, &pkey1));
+	testutil_check(wiredtiger_struct_unpack(session,
+	    key2->data, key2->size, "uu", &ikey2, &pkey2));
+
+	/* Trace every comparison the collator is asked to make. */
+	print_int_item("index_compare: index key1 = ", &ikey1);
+	print_int_item(", primary key1 = ", &pkey1);
+	print_int_item(", index key2 = ", &ikey2);
+	print_int_item(", primary key2 = ", &pkey2);
+	printf("\n");
+
+	/* The index keys decide the order unless they are equal. */
+	*cmp = compare_int_items(&ikey1, &ikey2);
+	if (*cmp != 0)
+		return (0);
+
+	/* Tie-break on the primary keys, either of which may be empty. */
+	have_pkey1 = pkey1.size != 0;
+	have_pkey2 = pkey2.size != 0;
+	if (have_pkey1 && have_pkey2)
+		*cmp = compare_int_items(&pkey1, &pkey2);
+	else
+		*cmp = have_pkey1 - have_pkey2;
+
+	return (0);
+}
+
+/*
+ * Collator registered under the name "index_coll" in main. Only the first
+ * member, the index_compare callback, is set; the other two are NULL.
+ */
+static WT_COLLATOR index_coll = { index_compare, NULL, NULL };
+
+/*
+ * main --
+ *	Insert one record into a table whose index uses the index_coll
+ *	collator, position an index cursor on that record, duplicate the
+ *	cursor, and verify that both cursors return the inserted value.
+ */
+int
+main(int argc, char *argv[])
+{
+	TEST_OPTS *opts, _opts;
+	WT_CURSOR *cursor, *cursor1;
+	WT_ITEM got, k, v;
+	WT_SESSION *session;
+	int32_t ki, vi;
+
+	opts = &_opts;
+	memset(opts, 0, sizeof(*opts));
+	testutil_check(testutil_parse_opts(argc, argv, opts));
+	testutil_make_work_dir(opts->home);
+
+	testutil_check(wiredtiger_open(opts->home, NULL, "create",
+	    &opts->conn));
+	testutil_check(
+	    opts->conn->open_session(opts->conn, NULL, NULL, &session));
+
+	testutil_check(opts->conn->add_collator(opts->conn, "index_coll",
+	    &index_coll, NULL));
+
+	testutil_check(session->create(session,
+	    "table:main", "key_format=u,value_format=u,columns=(k,v)"));
+	testutil_check(session->create(session,
+	    "index:main:index", "columns=(v),collator=index_coll"));
+
+	printf("adding new record\n");
+	testutil_check(session->open_cursor(session, "table:main", NULL, NULL,
+	    &cursor));
+
+	ki = 13;
+	vi = 17;
+
+	/* Key and value are raw items wrapping the two integers. */
+	k.data = &ki; k.size = sizeof(ki);
+	v.data = &vi; v.size = sizeof(vi);
+
+	cursor->set_key(cursor, &k);
+	cursor->set_value(cursor, &v);
+	testutil_check(cursor->insert(cursor));
+	testutil_check(cursor->close(cursor));
+
+	printf("positioning index cursor\n");
+
+	/* The index is on the value column, so search it by value. */
+	testutil_check(session->open_cursor(session, "index:main:index", NULL,
+	    NULL, &cursor));
+	cursor->set_key(cursor, &v);
+	testutil_check(cursor->search(cursor));
+
+	printf("duplicating cursor\n");
+	testutil_check(session->open_cursor(session, NULL, cursor, NULL,
+	    &cursor1));
+
+	/*
+	 * Check the get_value return codes: on failure "got" would be left
+	 * unset and item_to_int would read an indeterminate item.
+	 */
+	testutil_check(cursor->get_value(cursor, &got));
+	testutil_assert(item_to_int(&got) == 17);
+	testutil_check(cursor1->get_value(cursor1, &got));
+	testutil_assert(item_to_int(&got) == 17);
+
+	testutil_check(session->close(session, NULL));
+	testutil_cleanup(opts);
+	return (EXIT_SUCCESS);
+}
diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py
index 3bda6dc9946..ee0f85a29ee 100644
--- a/test/suite/test_cursor_random.py
+++ b/test/suite/test_cursor_random.py
@@ -71,6 +71,15 @@ class test_cursor_random(wttest.WiredTigerTestCase):
self.assertEquals(cursor.reset(), 0)
cursor.close()
+    # Check that next_random fails with an empty tree, repeatedly.
+    def test_cursor_random_empty(self):
+        uri = self.type
+        self.session.create(uri, 'key_format=S,value_format=S')
+        cursor = self.session.open_cursor(uri, None, self.config)
+        # Compare against WT_NOTFOUND explicitly: assertTrue would treat the
+        # second argument as a failure message and accept any truthy return.
+        for i in range(1,5):
+            self.assertEqual(cursor.next(), wiredtiger.WT_NOTFOUND)
+        cursor.close()
+
# Check that next_random works with a single value, repeatedly.
def test_cursor_random_single_record(self):
uri = self.type
@@ -127,6 +136,46 @@ class test_cursor_random(wttest.WiredTigerTestCase):
def test_cursor_random_multiple_page_records(self):
self.cursor_random_multiple_page_records(0)
+ # Check that next_random works in the presence of a set of values, some of
+ # which are deleted.
+ def test_cursor_random_deleted_partial(self):
+ uri = self.type
+ ds = self.dataset(self, uri, 10000,
+ config='allocation_size=512,leaf_page_max=512')
+ ds.populate()
+
+ # Close the connection so everything is forced to disk.
+ self.reopen_conn()
+
+ # Truncate the range bounded by the key for record 10 and the key for
+ # record 10000-10.
+ start = self.session.open_cursor(uri, None)
+ start.set_key(ds.key(10))
+ end = self.session.open_cursor(uri, None)
+ end.set_key(ds.key(10000-10))
+ self.session.truncate(None, start, end, None)
+ self.assertEqual(start.close(), 0)
+ self.assertEqual(end.close(), 0)
+
+ # Each call must succeed (return 0), presumably because the records
+ # outside the truncated range are still present.
+ cursor = self.session.open_cursor(uri, None, self.config)
+ for i in range(1,10):
+ self.assertEqual(cursor.next(), 0)
+
+    # Check that next_random fails in the presence of a set of values, all of
+    # which are deleted.
+    def test_cursor_random_deleted_all(self):
+        uri = self.type
+        ds = self.dataset(self, uri, 10000,
+            config='allocation_size=512,leaf_page_max=512')
+        ds.populate()
+
+        # Close the connection so everything is forced to disk.
+        self.reopen_conn()
+
+        # Truncate the whole object by URI, with no start or stop cursor.
+        self.session.truncate(uri, None, None, None)
+
+        cursor = self.session.open_cursor(uri, None, self.config)
+        # Compare against WT_NOTFOUND explicitly: assertTrue would treat the
+        # second argument as a failure message and accept any truthy return.
+        for i in range(1,10):
+            self.assertEqual(cursor.next(), wiredtiger.WT_NOTFOUND)
+        cursor.close()
+
# Check that opening a random cursor on column-store returns not-supported.
class test_cursor_random_column(wttest.WiredTigerTestCase):
scenarios = make_scenarios([
diff --git a/test/suite/test_reconfig04.py b/test/suite/test_reconfig04.py
index be5e6d3729e..51d9b91c1f4 100644
--- a/test/suite/test_reconfig04.py
+++ b/test/suite/test_reconfig04.py
@@ -26,9 +26,7 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
-import fnmatch, os, time
import wiredtiger, wttest
-from wtdataset import SimpleDataSet
# test_reconfig04.py
# Test WT_SESSION::reconfigure
diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py
index 71f8fcb180e..5559190caca 100644
--- a/test/suite/test_sweep01.py
+++ b/test/suite/test_sweep01.py
@@ -116,10 +116,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
# Give slow machines time to process files.
stat_cursor = self.session.open_cursor('statistics:', None, None)
this_nfile = stat_cursor[stat.conn.file_open][2]
+ removed = stat_cursor[stat.conn.dh_sweep_remove][2]
stat_cursor.close()
self.pr("==== loop " + str(sleep))
self.pr("this_nfile " + str(this_nfile))
- if this_nfile == final_nfile:
+ self.pr("removed " + str(removed))
+ # On slow machines there can be a lag where files get closed but
+ # the sweep server cannot yet remove the handles. So wait for the
+ # removed statistic to indicate forward progress too.
+ if this_nfile == final_nfile and removed != remove1:
break
c.close()
self.pr("Sweep loop took " + str(sleep))
diff --git a/test/utility/misc.c b/test/utility/misc.c
index 1491c9a6938..1ba08ddd77f 100644
--- a/test/utility/misc.c
+++ b/test/utility/misc.c
@@ -78,7 +78,7 @@ testutil_work_dir_from_path(char *buffer, size_t len, const char *dir)
* Remove the work directory.
*/
void
-testutil_clean_work_dir(char *dir)
+testutil_clean_work_dir(const char *dir)
{
size_t len;
int ret;
diff --git a/test/utility/test_util.h b/test/utility/test_util.h
index f6a9cd68e02..489bbe18d87 100644
--- a/test/utility/test_util.h
+++ b/test/utility/test_util.h
@@ -183,7 +183,7 @@ void *dmalloc(size_t);
void *drealloc(void *, size_t);
void *dstrdup(const void *);
void *dstrndup(const char *, size_t);
-void testutil_clean_work_dir(char *);
+void testutil_clean_work_dir(const char *);
void testutil_cleanup(TEST_OPTS *);
bool testutil_disable_long_tests(void);
void testutil_make_work_dir(char *);