summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2019-07-02 14:01:12 +1000
committerLuke Chen <luke.chen@mongodb.com>2019-07-02 14:02:00 +1000
commit11a4b5016f134896e088d8c1c96d1c17225a3c86 (patch)
tree3385861ec5bc7f5ce74049b8bc6bba3fc91720cf /src/third_party/wiredtiger
parentcdf7c88be60b287c316beda42b4ff9f197617942 (diff)
downloadmongo-11a4b5016f134896e088d8c1c96d1c17225a3c86.tar.gz
Import wiredtiger: d86b3a8a331a1ec478c4ea75ef1b15856b429790 from branch mongodb-4.2
ref: ee1bae2623..d86b3a8a33
for: 4.2.0-rc3

WT-4758 Create a workload that bottlenecks on the eviction server filling eviction queues
WT-4821 Update evergreen config to pull correct source for largescale test
WT-4875 Fix commit timestamp assert function to consider non transactional tombstones
WT-4877 Uninitialized memory being written during checkpoints
WT-4878 Disable random dhandle selection and fine tune eviction target calculations
WT-4881 Soften the restrictions on re-entering reconciliation
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/config.c27
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf24
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/track.c18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf.c277
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf.h5
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i10
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/block/block_ckpt_scan.c2
-rw-r--r--src/third_party/wiredtiger/src/docs/wtperf.dox8
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c92
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c104
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c7
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c4
-rwxr-xr-x[-rw-r--r--]src/third_party/wiredtiger/test/evergreen.yml1
14 files changed, 449 insertions, 132 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c
index 549257b5089..18522e3f7e7 100644
--- a/src/third_party/wiredtiger/bench/wtperf/config.c
+++ b/src/third_party/wiredtiger/bench/wtperf/config.c
@@ -735,7 +735,8 @@ config_sanity(WTPERF *wtperf)
((opts->checkpoint_threads != 0 &&
opts->checkpoint_interval > opts->run_time) ||
opts->report_interval > opts->run_time ||
- opts->sample_interval > opts->run_time)) {
+ opts->sample_interval > opts->run_time ||
+ opts->scan_interval > opts->run_time)) {
fprintf(stderr, "interval value longer than the run-time\n");
return (EINVAL);
}
@@ -757,6 +758,29 @@ config_sanity(WTPERF *wtperf)
return (EINVAL);
}
+ if (opts->scan_pct > 100) {
+ fprintf(stderr,
+ "Invalid scan_pct - should be a percentage\n");
+ return (EINVAL);
+ }
+
+ /* If we have separate tables for scanning, we need a separate count. */
+ if ((opts->scan_icount > 0 && opts->scan_table_count == 0) ||
+ (opts->scan_icount == 0 && opts->scan_table_count > 0)) {
+ fprintf(stderr,
+ "scan_icount %" PRIu32
+ " and scan_table_count %" PRIu32
+ " must both be zero or nonzero.\n",
+ opts->scan_icount, opts->scan_table_count);
+ return (EINVAL);
+ }
+ if (opts->scan_interval > 0 && opts->icount == 0 &&
+ opts->scan_icount == 0) {
+ fprintf(stderr,
+ "Invalid scan_interval - requires icount to be non-zero\n");
+ return (EINVAL);
+ }
+
if (opts->value_sz_max < opts->value_sz) {
if (F_ISSET(wtperf, CFG_GROW)) {
fprintf(stderr, "value_sz_max %" PRIu32
@@ -948,6 +972,7 @@ config_opt_print(WTPERF *wtperf)
opts->checkpoint_threads, opts->checkpoint_interval);
printf("\t" "Reporting interval: %" PRIu32 "\n", opts->report_interval);
printf("\t" "Sampling interval: %" PRIu32 "\n", opts->sample_interval);
+ printf("\t" "Scan interval: %" PRIu32 "\n", opts->scan_interval);
printf("\t" "Verbosity: %" PRIu32 "\n", opts->verbose);
}
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf
new file mode 100644
index 00000000000..9c3dfa10d84
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf
@@ -0,0 +1,24 @@
+# wtperf options file: evict btree configuration
+conn_config="cache_size=40G,checkpoint=(wait=60,log_size=2GB),eviction=(threads_min=12,threads_max=12),log=(enabled=true),session_max=600,eviction_target=60,statistics=(fast),statistics_log=(wait=1,json)"
+# 1B records * (key=12 + value=138) is about 150G total data size
+key_sz=12
+value_sz=138
+log_like_table=true
+table_config="type=file"
+icount=1000000000
+report_interval=5
+run_time=3600
+# Scan all of the scan-specific tables every 10 minutes.
+# 0.4B records * (key=12 + value=138) is about 60G total data size for scan.
+# Running on a machine with 64G physical memory, this exhausts both the
+# WT cache and the system cache.
+scan_interval=600
+scan_pct=100
+scan_table_count=20
+scan_icount=400000000
+populate_threads=5
+table_count=100
+threads=((count=400,reads=1),(count=20,inserts=1,throttle=500),(count=10,updates=1,throttle=500))
+# Add throughput/latency monitoring
+max_latency=50000
+sample_interval=5
diff --git a/src/third_party/wiredtiger/bench/wtperf/track.c b/src/third_party/wiredtiger/bench/wtperf/track.c
index ca380703764..3b8832dc6bf 100644
--- a/src/third_party/wiredtiger/bench/wtperf/track.c
+++ b/src/third_party/wiredtiger/bench/wtperf/track.c
@@ -69,6 +69,24 @@ sum_ckpt_ops(WTPERF *wtperf)
}
/*
+ * Return total scan operations.
+ */
+uint64_t
+sum_scan_ops(WTPERF *wtperf)
+{
+	const CONFIG_OPTS *cfg;
+
+	cfg = wtperf->opts;
+
+	/* With scanning disabled there is nothing to count: report zero. */
+	if (cfg->scan_interval == 0)
+		return (0);
+
+	/* Otherwise the total is the first scan thread's operation count. */
+	return (wtperf->scanthreads->scan.ops);
+}
+
+/*
* Return total operations count for the worker threads.
*/
static uint64_t
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index ecbd91fe8cc..dc3bd4f891f 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -41,6 +41,7 @@ static WT_THREAD_RET monitor(void *);
static WT_THREAD_RET populate_thread(void *);
static void randomize_value(WTPERF_THREAD *, char *);
static void recreate_dir(const char *);
+static WT_THREAD_RET scan_worker(void *);
static int start_all_runs(WTPERF *);
static int start_run(WTPERF *);
static void start_threads(WTPERF *, WORKLOAD *,
@@ -119,6 +120,15 @@ randomize_value(WTPERF_THREAD *thread, char *value_buf)
static uint32_t
map_key_to_table(CONFIG_OPTS *opts, uint64_t k)
{
+ /*
+ * The first part of the key range is reserved for dedicated
+ * scan tables, if any. The scan tables do not grow, but the
+ * rest of the key space may.
+ */
+ if (k < opts->scan_icount)
+ return ((uint32_t)
+ (opts->table_count + k % opts->scan_table_count));
+ k -= opts->scan_icount;
if (opts->range_partition) {
/* Take care to return a result in [0..table_count-1]. */
if (k > opts->icount + opts->random_range)
@@ -362,6 +372,7 @@ worker_async(void *arg)
continue;
break;
default:
+ lprintf(wtperf, 0, 0, "invalid op!");
goto err; /* can't happen */
}
@@ -376,8 +387,10 @@ worker_async(void *arg)
wtperf->uris[map_key_to_table(wtperf->opts, next_val)],
NULL, &cb, &asyncop)) == EBUSY)
(void)usleep(10000);
- if (ret != 0)
+ if (ret != 0) {
+ lprintf(wtperf, ret, 0, "failed async_new_op");
goto err;
+ }
asyncop->app_private = thread;
asyncop->set_key(asyncop, key_buf);
@@ -513,6 +526,7 @@ worker(void *arg)
WT_CURSOR **cursors, *cursor, *log_table_cursor, *tmp_cursor;
WT_SESSION *session;
size_t i;
+ uint32_t total_table_count;
int64_t ops, ops_per_txn;
uint64_t log_id, next_val, usecs;
uint8_t *op, *op_end;
@@ -570,8 +584,9 @@ worker(void *arg)
goto err;
}
} else {
- cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *));
- for (i = 0; i < opts->table_count; i++) {
+ total_table_count = opts->table_count + opts->scan_table_count;
+ cursors = dcalloc(total_table_count, sizeof(WT_CURSOR *));
+ for (i = 0; i < total_table_count; i++) {
if ((ret = session->open_cursor(session,
wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) {
lprintf(wtperf, ret, 0,
@@ -669,7 +684,6 @@ worker(void *arg)
__wt_epoch(NULL, &start);
cursor->set_key(cursor, key_buf);
-
switch (*op) {
case WORKER_READ:
/*
@@ -764,7 +778,8 @@ worker(void *arg)
if (ret == WT_NOTFOUND)
break;
-op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) {
+op_err: if (ret == WT_ROLLBACK &&
+ (ops_per_txn != 0 || opts->log_like_table)) {
/*
* If we are running with explicit transactions
* configured and we hit a WT_ROLLBACK, then we
@@ -1031,7 +1046,7 @@ populate_thread(void *arg)
WT_SESSION *session;
size_t i;
uint64_t op, usecs;
- uint32_t opcount;
+ uint32_t opcount, total_table_count;
int intxn, measure_latency, ret, stress_checkpoint_due;
char *value_buf, *key_buf;
const char *cursor_config;
@@ -1044,6 +1059,7 @@ populate_thread(void *arg)
cursors = NULL;
ret = stress_checkpoint_due = 0;
trk = &thread->insert;
+ total_table_count = opts->table_count + opts->scan_table_count;
key_buf = thread->key_buf;
value_buf = thread->value_buf;
@@ -1058,8 +1074,8 @@ populate_thread(void *arg)
cursor_config =
(opts->populate_threads == 1 && !opts->index) ? "bulk" : NULL;
/* Create the cursors. */
- cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *));
- for (i = 0; i < opts->table_count; i++) {
+ cursors = dcalloc(total_table_count, sizeof(WT_CURSOR *));
+ for (i = 0; i < total_table_count; i++) {
if ((ret = session->open_cursor(
session, wtperf->uris[i], NULL,
cursor_config, &cursors[i])) != 0) {
@@ -1073,7 +1089,7 @@ populate_thread(void *arg)
/* Populate the databases. */
for (intxn = 0, opcount = 0;;) {
op = get_next_incr(wtperf);
- if (op > opts->icount)
+ if (op > (uint64_t)opts->icount + (uint64_t)opts->scan_icount)
break;
if (opts->populate_ops_per_txn != 0 && !intxn) {
@@ -1166,7 +1182,6 @@ populate_thread(void *arg)
err: wtperf->error = wtperf->stop = true;
}
free(cursors);
-
return (WT_THREAD_RET_VALUE);
}
@@ -1216,7 +1231,7 @@ populate_async(void *arg)
/* Populate the databases. */
for (;;) {
op = get_next_incr(wtperf);
- if (op > opts->icount)
+ if (op > (uint64_t)opts->icount + (uint64_t)opts->scan_icount)
break;
/*
* Allocate an async op for whichever table.
@@ -1225,8 +1240,10 @@ populate_async(void *arg)
conn, wtperf->uris[map_key_to_table(wtperf->opts, op)],
NULL, &cb, &asyncop)) == EBUSY)
(void)usleep(10000);
- if (ret != 0)
+ if (ret != 0) {
+ lprintf(wtperf, ret, 0, "Failed async_new_op");
goto err;
+ }
asyncop->app_private = thread;
generate_key(opts, key_buf, op);
@@ -1248,8 +1265,10 @@ populate_async(void *arg)
* async_flush and those calls will convoy. That is not the
* most efficient way, but we want to flush before measuring latency.
*/
- if (conn->async_flush(conn) != 0)
+ if (conn->async_flush(conn) != 0) {
+ lprintf(wtperf, ret, 0, "Failed async flush");
goto err;
+ }
if (measure_latency) {
__wt_epoch(NULL, &stop);
++trk->latency_ops;
@@ -1327,6 +1346,7 @@ monitor(void *arg)
"insert ops per second,"
"update ops per second,"
"checkpoints,"
+ "scans,"
"read average latency(uS),"
"read minimum latency(uS),"
"read maximum latency(uS),"
@@ -1378,7 +1398,7 @@ monitor(void *arg)
(void)fprintf(fp,
"%s,%" PRIu32
",%" PRIu64 ",%" PRIu64 ",%" PRIu64
- ",%c"
+ ",%c,%c"
",%" PRIu32 ",%" PRIu32 ",%" PRIu32
",%" PRIu32 ",%" PRIu32 ",%" PRIu32
",%" PRIu32 ",%" PRIu32 ",%" PRIu32
@@ -1386,6 +1406,7 @@ monitor(void *arg)
buf, wtperf->totalsec,
cur_reads, cur_inserts, cur_updates,
wtperf->ckpt ? 'Y' : 'N',
+ wtperf->scan ? 'Y' : 'N',
read_avg, read_min, read_max,
insert_avg, insert_min, insert_max,
update_avg, update_min, update_max);
@@ -1547,6 +1568,141 @@ err: wtperf->error = wtperf->stop = true;
return (WT_THREAD_RET_VALUE);
}
+/*
+ * scan_worker --
+ *	Scan thread: every scan_interval seconds, search for scan_pct percent
+ *	of the scan key range, striding through the keys across the tables.
+ */
+static WT_THREAD_RET
+scan_worker(void *arg)
+{
+	CONFIG_OPTS *opts;
+	WTPERF *wtperf;
+	WTPERF_THREAD *thread;
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor, **cursors;
+	WT_SESSION *session;
+	char *key_buf;
+	struct timespec e, s;
+	uint32_t i, ntables, pct, table_start;
+	uint64_t cur_id, end_id, incr, items, start_id, tot_items;
+	int ret;
+
+	thread = (WTPERF_THREAD *)arg;
+	key_buf = thread->key_buf;
+	wtperf = thread->wtperf;
+	opts = wtperf->opts;
+	conn = wtperf->conn;
+	session = NULL;
+	cursors = NULL;
+	items = 0;
+
+	/* Scan a percentage of the records; a scan_pct of zero means all. */
+	pct = opts->scan_pct == 0 ? 100 : opts->scan_pct;
+	start_id = cur_id = 1;
+
+	/*
+	 * When we scan the tables, we will increment the key by an amount
+	 * that causes us to visit each table in order, and jump ahead in
+	 * the key space when returning to a table. By doing this, we don't
+	 * repeat keys until we visit them all, but we don't visit keys in
+	 * sequential order. This might better emulate the access pattern
+	 * to a main table when an index is scanned, or a more complex query
+	 * is performed.
+	 *
+	 * The record counts are 32-bit values: widen them before multiplying
+	 * so the computed item count can't wrap for large record counts.
+	 */
+	if (opts->scan_icount != 0) {
+		end_id = opts->scan_icount;
+		tot_items = ((uint64_t)opts->scan_icount * pct) / 100;
+		incr = (uint64_t)opts->scan_table_count * 1000 + 1;
+		table_start = opts->table_count;
+		ntables = opts->scan_table_count;
+	} else {
+		end_id = opts->icount;
+		tot_items = ((uint64_t)opts->icount * pct) / 100;
+		incr = (uint64_t)opts->table_count * 1000 + 1;
+		table_start = 0;
+		ntables = opts->table_count;
+	}
+	if ((ret = conn->open_session(
+	    conn, NULL, opts->sess_config, &session)) != 0) {
+		lprintf(wtperf, ret, 0,
+		    "open_session failed in scan thread.");
+		goto err;
+	}
+	cursors = dcalloc(ntables, sizeof(WT_CURSOR *));
+	for (i = 0; i < ntables; i++)
+		if ((ret = session->open_cursor(
+		    session, wtperf->uris[i + table_start], NULL, NULL,
+		    &cursors[i])) != 0) {
+			lprintf(wtperf, ret, 0,
+			    "open_cursor failed in scan thread.");
+			goto err;
+		}
+
+	while (!wtperf->stop) {
+		/* Break the sleep up, so we notice interrupts faster. */
+		for (i = 0; i < opts->scan_interval; i++) {
+			sleep(1);
+			if (wtperf->stop)
+				break;
+		}
+		/* If the workers are done, don't bother with a final call. */
+		if (wtperf->stop)
+			break;
+
+		__wt_epoch(NULL, &s);
+
+		wtperf->scan = true;
+		items = 0;
+		while (items < tot_items && !wtperf->stop) {
+			cursor = cursors[map_key_to_table(opts, cur_id) -
+			    table_start];
+			generate_key(opts, key_buf, cur_id);
+			cursor->set_key(cursor, key_buf);
+			if ((ret = cursor->search(cursor)) != 0) {
+				lprintf(wtperf, ret, 0, "Failed scan search "
+				    "key %s, items %" PRIu64, key_buf, items);
+				goto err;
+			}
+
+			items++;
+			cur_id += incr;
+			if (cur_id >= end_id) {
+				/*
+				 * Continue with the next slice of the key
+				 * space.
+				 */
+				cur_id = ++start_id;
+				if (cur_id >= end_id)
+					cur_id = start_id = 1;
+			}
+		}
+		wtperf->scan = false;
+		/*
+		 * Search failures jump to err: a pass ending here is counted.
+		 */
+		++thread->scan.ops;
+		__wt_epoch(NULL, &e);
+	}
+
+	if (session != NULL &&
+	    ((ret = session->close(session, NULL)) != 0)) {
+		lprintf(wtperf, ret, 0,
+		    "Error closing session in scan worker.");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		wtperf->error = wtperf->stop = true;
+	}
+	free(cursors);
+	return (WT_THREAD_RET_VALUE);
+}
+
static int
execute_populate(WTPERF *wtperf)
{
@@ -1556,18 +1712,19 @@ execute_populate(WTPERF *wtperf)
WTPERF_THREAD *popth;
WT_THREAD_CALLBACK(*pfunc)(void *);
size_t i;
- uint64_t last_ops, msecs, print_ops_sec;
+ uint64_t last_ops, msecs, print_ops_sec, max_key;
uint32_t interval, tables;
wt_thread_t idle_table_cycle_thread;
double print_secs;
int elapsed, ret;
opts = wtperf->opts;
+ max_key = (uint64_t)opts->icount + (uint64_t)opts->scan_icount;
lprintf(wtperf, 0, 1,
"Starting %" PRIu32
- " populate thread(s) for %" PRIu32 " items",
- opts->populate_threads, opts->icount);
+ " populate thread(s) for %" PRIu64 " items",
+ opts->populate_threads, max_key);
/* Start cycling idle tables if configured. */
start_idle_table_cycle(wtperf, &idle_table_cycle_thread);
@@ -1587,7 +1744,7 @@ execute_populate(WTPERF *wtperf)
__wt_epoch(NULL, &start);
for (elapsed = 0, interval = 0, last_ops = 0;
- wtperf->insert_key < opts->icount && !wtperf->error;) {
+ wtperf->insert_key < max_key && !wtperf->error;) {
/*
* Sleep for 100th of a second, report_interval is in second
* granularity, each 100th increment of elapsed is a single
@@ -1751,8 +1908,8 @@ execute_workload(WTPERF *wtperf)
WT_SESSION **sessions;
WT_THREAD_CALLBACK(*pfunc)(void *);
wt_thread_t idle_table_cycle_thread;
- uint64_t last_ckpts, last_inserts, last_reads, last_truncates;
- uint64_t last_updates;
+ uint64_t last_ckpts, last_scans;
+ uint64_t last_inserts, last_reads, last_truncates, last_updates;
uint32_t interval, run_ops, run_time;
u_int i;
int ret;
@@ -1763,8 +1920,8 @@ execute_workload(WTPERF *wtperf)
wtperf->insert_ops = wtperf->read_ops = wtperf->truncate_ops = 0;
wtperf->update_ops = 0;
- last_ckpts = last_inserts = last_reads = last_truncates = 0;
- last_updates = 0;
+ last_ckpts = last_scans = 0;
+ last_inserts = last_reads = last_truncates = last_updates = 0;
ret = 0;
sessions = NULL;
@@ -1844,6 +2001,7 @@ execute_workload(WTPERF *wtperf)
/* Sum the operations we've done. */
wtperf->ckpt_ops = sum_ckpt_ops(wtperf);
+ wtperf->scan_ops = sum_scan_ops(wtperf);
wtperf->insert_ops = sum_insert_ops(wtperf);
wtperf->read_ops = sum_read_ops(wtperf);
wtperf->update_ops = sum_update_ops(wtperf);
@@ -1863,18 +2021,21 @@ execute_workload(WTPERF *wtperf)
lprintf(wtperf, 0, 1,
"%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64
" updates, %" PRIu64 " truncates, %" PRIu64
- " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)",
+ " checkpoints, %" PRIu64 " scans in %" PRIu32
+ " secs (%" PRIu32 " total secs)",
wtperf->read_ops - last_reads,
wtperf->insert_ops - last_inserts,
wtperf->update_ops - last_updates,
wtperf->truncate_ops - last_truncates,
wtperf->ckpt_ops - last_ckpts,
+ wtperf->scan_ops - last_scans,
opts->report_interval, wtperf->totalsec);
last_reads = wtperf->read_ops;
last_inserts = wtperf->insert_ops;
last_updates = wtperf->update_ops;
last_truncates = wtperf->truncate_ops;
last_ckpts = wtperf->ckpt_ops;
+ last_scans = wtperf->scan_ops;
}
/* Notify the worker threads they are done. */
@@ -1971,16 +2132,17 @@ create_uris(WTPERF *wtperf)
{
CONFIG_OPTS *opts;
size_t len;
- uint32_t i;
+ uint32_t i, total_table_count;
opts = wtperf->opts;
- wtperf->uris = dcalloc(opts->table_count, sizeof(char *));
+ total_table_count = opts->table_count + opts->scan_table_count;
+ wtperf->uris = dcalloc(total_table_count, sizeof(char *));
len = strlen("table:") + strlen(opts->table_name) + 20;
- for (i = 0; i < opts->table_count; i++) {
+ for (i = 0; i < total_table_count; i++) {
/* If there is only one table, just use the base name. */
wtperf->uris[i] = dmalloc(len);
- if (opts->table_count == 1)
+ if (total_table_count == 1)
testutil_check(__wt_snprintf(wtperf->uris[i],
len, "table:%s", opts->table_name));
else
@@ -2003,6 +2165,7 @@ create_tables(WTPERF *wtperf)
WT_SESSION *session;
size_t i;
int ret;
+ uint32_t total_table_count;
char buf[512];
opts = wtperf->opts;
@@ -2030,7 +2193,8 @@ create_tables(WTPERF *wtperf)
return (ret);
}
- for (i = 0; i < opts->table_count; i++) {
+ total_table_count = opts->table_count + opts->scan_table_count;
+ for (i = 0; i < total_table_count; i++) {
if (opts->log_partial && i > 0) {
if (((ret = session->create(session,
wtperf->uris[i], wtperf->partial_config)) != 0)) {
@@ -2075,8 +2239,10 @@ wtperf_copy(const WTPERF *src, WTPERF **retp)
CONFIG_OPTS *opts;
WTPERF *dest;
size_t i;
+ uint32_t total_table_count;
opts = src->opts;
+ total_table_count = opts->table_count + opts->scan_table_count;
dest = dcalloc(1, sizeof(WTPERF));
@@ -2091,8 +2257,8 @@ wtperf_copy(const WTPERF *src, WTPERF **retp)
dest->reopen_config = dstrdup(src->reopen_config);
if (src->uris != NULL) {
- dest->uris = dcalloc(opts->table_count, sizeof(char *));
- for (i = 0; i < opts->table_count; i++)
+ dest->uris = dcalloc(total_table_count, sizeof(char *));
+ for (i = 0; i < total_table_count; i++)
dest->uris[i] = dstrdup(src->uris[i]);
}
@@ -2100,6 +2266,7 @@ wtperf_copy(const WTPERF *src, WTPERF **retp)
dest->async_config = dstrdup(src->async_config);
dest->ckptthreads = NULL;
+ dest->scanthreads = NULL;
dest->popthreads = NULL;
dest->workers = NULL;
@@ -2137,7 +2304,7 @@ wtperf_free(WTPERF *wtperf)
free(wtperf->log_table_uri);
if (wtperf->uris != NULL) {
- for (i = 0; i < opts->table_count; i++)
+ for (i = 0; i < opts->table_count + opts->scan_table_count; i++)
free(wtperf->uris[i]);
free(wtperf->uris);
}
@@ -2145,6 +2312,7 @@ wtperf_free(WTPERF *wtperf)
free(wtperf->async_config);
free(wtperf->ckptthreads);
+ free(wtperf->scanthreads);
free(wtperf->popthreads);
free(wtperf->workers);
@@ -2336,10 +2504,19 @@ start_run(WTPERF *wtperf)
"Starting %" PRIu32 " checkpoint thread(s)",
opts->checkpoint_threads);
wtperf->ckptthreads = dcalloc(
- opts->checkpoint_threads, sizeof(WTPERF_THREAD));
+ opts->checkpoint_threads, sizeof(WTPERF_THREAD));
start_threads(wtperf, NULL, wtperf->ckptthreads,
opts->checkpoint_threads, checkpoint_worker);
}
+ /* Start the scan thread. */
+ if (opts->scan_interval != 0) {
+ lprintf(wtperf, 0, 1,
+ "Starting 1 scan thread");
+ wtperf->scanthreads = dcalloc(
+ 1, sizeof(WTPERF_THREAD));
+ start_threads(wtperf, NULL, wtperf->scanthreads,
+ 1, scan_worker);
+ }
if (opts->pre_load_data)
pre_load_data(wtperf);
@@ -2353,6 +2530,7 @@ start_run(WTPERF *wtperf)
wtperf->truncate_ops = sum_truncate_ops(wtperf);
wtperf->update_ops = sum_update_ops(wtperf);
wtperf->ckpt_ops = sum_ckpt_ops(wtperf);
+ wtperf->scan_ops = sum_scan_ops(wtperf);
total_ops =
wtperf->read_ops + wtperf->insert_ops + wtperf->update_ops;
@@ -2381,6 +2559,9 @@ start_run(WTPERF *wtperf)
lprintf(wtperf, 0, 1,
"Executed %" PRIu64 " checkpoint operations",
wtperf->ckpt_ops);
+ lprintf(wtperf, 0, 1,
+ "Executed %" PRIu64 " scan operations",
+ wtperf->scan_ops);
latency_print(wtperf);
}
@@ -2394,6 +2575,7 @@ err: if (ret == 0)
wtperf->stop = true;
stop_threads(1, wtperf->ckptthreads);
+ stop_threads(1, wtperf->scanthreads);
if (monitor_created != 0)
testutil_check(__wt_thread_join(NULL, &monitor_thread));
@@ -2794,9 +2976,11 @@ start_threads(WTPERF *wtperf, WORKLOAD *workp,
* for latency measurements, for the same reason.
*/
thread->ckpt.min_latency =
+ thread->scan.min_latency =
thread->insert.min_latency = thread->read.min_latency =
thread->update.min_latency = UINT32_MAX;
- thread->ckpt.max_latency = thread->insert.max_latency =
+ thread->ckpt.max_latency = thread->scan.max_latency =
+ thread->insert.max_latency =
thread->read.max_latency = thread->update.max_latency = 0;
}
@@ -2852,10 +3036,12 @@ drop_all_tables(WTPERF *wtperf)
CONFIG_OPTS *opts;
WT_SESSION *session;
size_t i;
+ uint32_t total_table_count;
uint64_t msecs;
int ret, t_ret;
opts = wtperf->opts;
+ total_table_count = opts->table_count + opts->scan_table_count;
/* Drop any tables. */
if ((ret = wtperf->conn->open_session(
@@ -2865,7 +3051,7 @@ drop_all_tables(WTPERF *wtperf)
return (ret);
}
__wt_epoch(NULL, &start);
- for (i = 0; i < opts->table_count; i++) {
+ for (i = 0; i < total_table_count; i++) {
if ((ret =
session->drop(session, wtperf->uris[i], NULL)) != 0) {
lprintf(wtperf, ret, 0,
@@ -2877,7 +3063,7 @@ drop_all_tables(WTPERF *wtperf)
msecs = WT_TIMEDIFF_MS(stop, start);
lprintf(wtperf, 0, 1,
"Executed %" PRIu32 " drop operations average time %" PRIu64 "ms",
- opts->table_count, msecs / opts->table_count);
+ total_table_count, msecs / total_table_count);
err: if ((t_ret = session->close(session, NULL)) != 0 && ret == 0)
ret = t_ret;
@@ -2888,18 +3074,20 @@ static uint64_t
wtperf_value_range(WTPERF *wtperf)
{
CONFIG_OPTS *opts;
+ uint64_t total_icount;
opts = wtperf->opts;
+ total_icount = (uint64_t)opts->scan_icount + (uint64_t)opts->icount;
if (opts->random_range)
- return (opts->icount + opts->random_range);
+ return (total_icount + opts->random_range);
/*
* It is legal to configure a zero size populate phase, hide that
* from other code by pretending the range is 1 in that case.
*/
- if (opts->icount + wtperf->insert_key == 0)
+ if (total_icount + wtperf->insert_key == 0)
return (1);
- return (opts->icount +
+ return (total_icount +
wtperf->insert_key - (u_int)(wtperf->workers_cnt + 1));
}
@@ -2910,12 +3098,15 @@ wtperf_rand(WTPERF_THREAD *thread)
WT_CURSOR *rnd_cursor;
WTPERF *wtperf;
double S1, S2, U;
- uint64_t rval;
+ uint64_t end_range, range, rval, start_range;
int ret;
char *key_buf;
wtperf = thread->wtperf;
opts = wtperf->opts;
+ end_range = wtperf_value_range(wtperf);
+ start_range = opts->scan_icount;
+ range = end_range - start_range;
/*
* If we have a random cursor set up then use it.
@@ -2953,7 +3144,7 @@ wtperf_rand(WTPERF_THREAD *thread)
if (opts->pareto != 0) {
#define PARETO_SHAPE 1.5
S1 = (-1 / PARETO_SHAPE);
- S2 = wtperf_value_range(wtperf) *
+ S2 = range *
(opts->pareto / 100.0) * (PARETO_SHAPE - 1);
U = 1 - (double)rval / (double)UINT32_MAX;
rval = (uint64_t)((pow(U, S1) - 1) * S2);
@@ -2962,13 +3153,13 @@ wtperf_rand(WTPERF_THREAD *thread)
* 2% of the time, from my testing. That will lead to the
* first item in the table being "hot".
*/
- if (rval > wtperf_value_range(wtperf))
+ if (rval > end_range)
rval = 0;
}
/*
* Wrap the key to within the expected range and avoid zero: we never
* insert that key.
*/
- rval = (rval % wtperf_value_range(wtperf)) + 1;
- return (rval);
+ rval = (rval % range) + 1;
+ return (start_range + rval);
}
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
index 7e43a62459a..e5163409b4e 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
@@ -131,6 +131,7 @@ struct __wtperf { /* Per-database structure */
WTPERF_THREAD *ckptthreads; /* Checkpoint threads */
WTPERF_THREAD *popthreads; /* Populate threads */
+ WTPERF_THREAD *scanthreads; /* Scan threads */
#define WORKLOAD_MAX 50
WTPERF_THREAD *workers; /* Worker threads */
@@ -141,6 +142,7 @@ struct __wtperf { /* Per-database structure */
/* State tracking variables. */
uint64_t ckpt_ops; /* checkpoint operations */
+ uint64_t scan_ops; /* scan operations */
uint64_t insert_ops; /* insert operations */
uint64_t read_ops; /* read operations */
uint64_t truncate_ops; /* truncate operations */
@@ -150,6 +152,7 @@ struct __wtperf { /* Per-database structure */
uint64_t log_like_table_key; /* used to allocate IDs for log table */
volatile bool ckpt; /* checkpoint in progress */
+ volatile bool scan; /* scan in progress */
volatile bool error; /* thread error */
volatile bool stop; /* notify threads to stop */
volatile bool in_warmup; /* running warmup phase */
@@ -245,6 +248,7 @@ struct __wtperf_thread { /* Per-thread structure */
TRACK ckpt; /* Checkpoint operations */
TRACK insert; /* Insert operations */
TRACK read; /* Read operations */
+ TRACK scan; /* Scan operations */
TRACK update; /* Update operations */
TRACK truncate; /* Truncate operations */
TRACK truncate_sleep; /* Truncate sleep operations */
@@ -273,6 +277,7 @@ void start_idle_table_cycle(WTPERF *, wt_thread_t *);
void stop_idle_table_cycle(WTPERF *, wt_thread_t);
void worker_throttle(WTPERF_THREAD *);
uint64_t sum_ckpt_ops(WTPERF *);
+uint64_t sum_scan_ops(WTPERF *);
uint64_t sum_insert_ops(WTPERF *);
uint64_t sum_pop_ops(WTPERF *);
uint64_t sum_read_ops(WTPERF *);
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
index d312ee8526d..079c419908f 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
@@ -172,6 +172,16 @@ DEF_OPT_AS_UINT32(sample_interval, 0,
DEF_OPT_AS_UINT32(sample_rate, 50,
"how often the latency of operations is measured. One for every operation,"
"two for every second operation, three for every third operation etc.")
+DEF_OPT_AS_UINT32(scan_icount, 0,
+ "number of records in scan tables to populate")
+DEF_OPT_AS_UINT32(scan_interval, 0,
+ "scan tables every interval seconds during the workload phase,"
+ " 0 to disable")
+DEF_OPT_AS_UINT32(scan_pct, 10,
+ "percentage of entire data set scanned, if scan_interval is enabled")
+DEF_OPT_AS_UINT32(scan_table_count, 0,
+ "number of separate tables to be used for scanning. Zero indicates "
+ "that tables are shared with other operations")
DEF_OPT_AS_CONFIG_STRING(sess_config, "", "session configuration string")
DEF_OPT_AS_UINT32(session_count_idle, 0,
"number of idle sessions to create. Default 0.")
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 9809bf4e591..786a70c3e62 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "ee1bae262347285f46b5c56cc0490d20b9ee9c98",
+ "commit": "d86b3a8a331a1ec478c4ea75ef1b15856b429790",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.2"
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
index 57934dd0422..0ab4ea72318 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
@@ -91,6 +91,8 @@ __wt_block_checkpoint_final(WT_SESSION_IMPL *session,
*/
size = buf->size + WT_INTPACK64_MAXSIZE;
WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ memset(p, 0, WT_INTPACK64_MAXSIZE);
file_size_offset = buf->size;
buf->size = size;
diff --git a/src/third_party/wiredtiger/src/docs/wtperf.dox b/src/third_party/wiredtiger/src/docs/wtperf.dox
index 8aa95c5b635..2525a77d62b 100644
--- a/src/third_party/wiredtiger/src/docs/wtperf.dox
+++ b/src/third_party/wiredtiger/src/docs/wtperf.dox
@@ -202,6 +202,14 @@ total workload seconds
performance logging every interval seconds, 0 to disable
@par sample_rate (unsigned int, default=50)
how often the latency of operations is measured. One for every operation, two for every second operation, three for every third operation etc.
+@par scan_icount (unsigned int, default=0)
+number of records in scan tables to populate
+@par scan_interval (unsigned int, default=0)
+scan tables every interval seconds during the workload phase, 0 to disable
+@par scan_pct (unsigned int, default=10)
+percentage of entire data set scanned, if scan_interval is enabled
+@par scan_table_count (unsigned int, default=0)
+number of separate tables to be used for scanning. Zero indicates that tables are shared with other operations
@par sess_config (string, default="")
session configuration string
@par session_count_idle (unsigned int, default=0)
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 397306e6aa6..931216376b9 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -18,7 +18,7 @@ static int __evict_server(WT_SESSION_IMPL *, bool *);
static void __evict_tune_workers(WT_SESSION_IMPL *session);
static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *);
static int __evict_walk_tree(
- WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *, uint64_t *);
+ WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *);
#define WT_EVICT_HAS_WORKERS(s) \
(S2C(s)->evict_threads.current_threads > 1)
@@ -1399,19 +1399,19 @@ __evict_walk_choose_dhandle(
u_int dh_bucket_count, rnd_bucket, rnd_dh;
conn = S2C(session);
- *dhandle_p = NULL;
WT_ASSERT(session, __wt_rwlock_islocked(session, &conn->dhandle_lock));
- /* Nothing to do if the dhandle list is empty. */
- if (TAILQ_EMPTY(&conn->dhqh))
- return;
+#undef RANDOM_DH_SELECTION_ENABLED
+
+#ifdef RANDOM_DH_SELECTION_ENABLED
+ *dhandle_p = NULL;
/*
- * If we do not have a lot of dhandles, most hash buckets will be empty.
+ * If we don't have many dhandles, most hash buckets will be empty.
* Just pick a random dhandle from the list in that case.
*/
- if (conn->dhandle_count < 10 * WT_HASH_ARRAY_SIZE) {
+ if (conn->dhandle_count < WT_HASH_ARRAY_SIZE / 4) {
rnd_dh = __wt_random(&session->rnd) % conn->dhandle_count;
dhandle = TAILQ_FIRST(&conn->dhqh);
for (; rnd_dh > 0; rnd_dh--)
@@ -1435,6 +1435,18 @@ __evict_walk_choose_dhandle(
dhandle = TAILQ_FIRST(&conn->dhhash[rnd_bucket]);
for (; rnd_dh > 0; rnd_dh--)
dhandle = TAILQ_NEXT(dhandle, hashq);
+#else
+ /* Step to the next dhandle in the list, wrapping around to the first. */
+ dhandle = *dhandle_p;
+ if (dhandle != NULL)
+ dhandle = TAILQ_NEXT(dhandle, q);
+ if (dhandle == NULL)
+ dhandle = TAILQ_FIRST(&conn->dhqh);
+
+ WT_UNUSED(dh_bucket_count);
+ WT_UNUSED(rnd_bucket);
+ WT_UNUSED(rnd_dh);
+#endif
*dhandle_p = dhandle;
}
@@ -1452,9 +1464,8 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
WT_TRACK_OP_DECL;
- uint64_t loop_count;
- uint64_t pages_seen_file, pages_seen_interim, pages_seen_total;
- u_int max_entries, retries, slot, start_slot, total_candidates;
+ u_int loop_count, max_entries, retries, slot, start_slot;
+ u_int total_candidates;
bool dhandle_locked, incr;
WT_TRACK_OP_INIT(session);
@@ -1480,31 +1491,14 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
__wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf);
max_entries = WT_MIN(max_entries, 1 + total_candidates / 2);
- pages_seen_interim = pages_seen_total = 0;
retry: loop_count = 0;
- while (slot < max_entries) {
- loop_count++;
-
+ while (slot < max_entries && loop_count++ < conn->dhandle_count) {
/* We're done if shutting down or reconfiguring. */
if (F_ISSET(conn, WT_CONN_CLOSING) ||
F_ISSET(conn, WT_CONN_RECONFIGURING))
break;
- /* If we have seen enough pages in this walk, we're done. */
- if (pages_seen_total > WT_EVICT_WALK_INCR * 100)
- break;
-
- /*
- * If we are not finding pages at all, we're done.
- * Every 100th iteration, check if we made progress.
- */
- if (loop_count % 100 == 0) {
- if (pages_seen_interim == pages_seen_total)
- break;
- pages_seen_interim = pages_seen_total;
- }
-
/*
* If another thread is waiting on the eviction server to clear
* the walk point in a tree, give up.
@@ -1620,9 +1614,8 @@ retry: loop_count = 0;
*/
cache->walk_tree = dhandle;
WT_WITH_DHANDLE(session, dhandle,
- ret = __evict_walk_tree(session, queue,
- max_entries, &slot, &pages_seen_file));
- pages_seen_total += pages_seen_file;
+ ret = __evict_walk_tree(
+ session, queue, max_entries, &slot));
WT_ASSERT(session, __wt_session_gen(
session, WT_GEN_SPLIT) == 0);
@@ -1713,22 +1706,14 @@ __evict_push_candidate(WT_SESSION_IMPL *session,
* Calculate how many pages to queue for a given tree.
*/
static uint32_t
-__evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
+__evict_walk_target(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
uint64_t btree_inuse, bytes_per_slot, cache_inuse;
uint32_t target_pages_clean, target_pages_dirty, target_pages;
- uint32_t total_slots;
cache = S2C(session)->cache;
target_pages_clean = target_pages_dirty = 0;
- total_slots = max_entries;
-
- /*
- * The number of times we should fill the queue by the end of
- * considering all trees.
- */
-#define QUEUE_FILLS_PER_PASS 10
/*
* The minimum number of pages we should consider per tree.
@@ -1744,7 +1729,7 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
btree_inuse = __wt_btree_bytes_evictable(session);
cache_inuse = __wt_cache_bytes_inuse(cache);
- bytes_per_slot = 1 + cache_inuse / total_slots;
+ bytes_per_slot = 1 + cache_inuse / cache->evict_slots;
target_pages_clean = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
}
@@ -1752,20 +1737,12 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
btree_inuse = __wt_btree_dirty_leaf_inuse(session);
cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
- bytes_per_slot = 1 + cache_inuse / total_slots;
+ bytes_per_slot = 1 + cache_inuse / cache->evict_slots;
target_pages_dirty = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
}
- /*
- * Weight the number of target pages by the number of times we want to
- * fill the cache per pass through all the trees. Note that we don't
- * build this into the calculation above because we don't want to favor
- * small trees, so round to a whole number of slots (zero for small
- * trees) before multiplying.
- */
- target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
- QUEUE_FILLS_PER_PASS;
+ target_pages = WT_MAX(target_pages_clean, target_pages_dirty);
/*
* Walk trees with a small fraction of the cache in case there are so
@@ -1800,8 +1777,8 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
* Get a few page eviction candidates from a single underlying file.
*/
static int
-__evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
- u_int max_entries, u_int *slotp, uint64_t *pages_seen_p)
+__evict_walk_tree(WT_SESSION_IMPL *session,
+ WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -1821,7 +1798,6 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
last_parent = NULL;
restarts = 0;
give_up = urgent_queued = false;
- *pages_seen_p = 0;
/*
* Figure out how many slots to fill from this tree.
@@ -1830,12 +1806,10 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
start = queue->evict_queue + *slotp;
remaining_slots = max_entries - *slotp;
if (btree->evict_walk_progress >= btree->evict_walk_target) {
- btree->evict_walk_target =
- __evict_walk_target(session, max_entries);
+ btree->evict_walk_target = __evict_walk_target(session);
btree->evict_walk_progress = 0;
}
- target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS,
- btree->evict_walk_target - btree->evict_walk_progress);
+ target_pages = btree->evict_walk_target - btree->evict_walk_progress;
if (target_pages > remaining_slots)
target_pages = remaining_slots;
@@ -2194,8 +2168,6 @@ fast: /* If the page can't be evicted, give up. */
btree->evict_ref = ref;
}
- *pages_seen_p = pages_seen;
-
WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen);
WT_STAT_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 51cabeda029..7193e6f2b2c 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -27,6 +27,8 @@ static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __reconcile(WT_SESSION_IMPL *,
+ WT_REF *, WT_SALVAGE_COOKIE *, uint32_t, bool *, bool *);
/*
* __wt_reconcile --
@@ -36,19 +38,15 @@ int
__wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp)
{
- WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- WT_RECONCILE *r;
- uint64_t oldest_id;
+ bool no_reconcile_set, page_locked;
- btree = S2BT(session);
- page = ref->page;
- mod = page->modify;
if (lookaside_retryp != NULL)
*lookaside_retryp = false;
+ page = ref->page;
+
__wt_verbose(session, WT_VERB_RECONCILE,
"%p reconcile %s (%s%s%s)",
(void *)ref, __wt_page_type_string(page->type),
@@ -77,10 +75,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
LF_ISSET(WT_REC_VISIBLE_ALL) ||
F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
- /* We shouldn't get called with a clean page, that's an error. */
+ /* It's an error to be called with a clean page. */
WT_ASSERT(session, __wt_page_is_modified(page));
/*
+ * Reconciliation acquires and releases pages, and in rare cases that
+ * page release triggers eviction. If the page is dirty, eviction can
+ * trigger reconciliation, and we re-enter this code. Reconciliation
+ * isn't re-entrant, so we need to ensure that doesn't happen.
+ */
+ no_reconcile_set = F_ISSET(session, WT_SESSION_NO_RECONCILE);
+ F_SET(session, WT_SESSION_NO_RECONCILE);
+
+ /*
* Reconciliation locks the page for three reasons:
* Reconciliation reads the lists of page updates, obsolete updates
* cannot be discarded while reconciliation is in progress;
@@ -90,6 +97,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* a child page splitting during the reconciliation.
*/
WT_PAGE_LOCK(session, page);
+ page_locked = true;
/*
* Now that the page is locked, if attempting to evict it, check again
@@ -97,20 +105,37 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* while we were waiting to acquire the lock (e.g., the page could have
* split).
*/
- if (LF_ISSET(WT_REC_EVICT) &&
- !__wt_page_can_evict(session, ref, NULL)) {
- WT_PAGE_UNLOCK(session, page);
- return (__wt_set_return(session, EBUSY));
- }
+ if (LF_ISSET(WT_REC_EVICT) && !__wt_page_can_evict(session, ref, NULL))
+ WT_ERR(__wt_set_return(session, EBUSY));
- /* Initialize the reconciliation structure for each new run. */
- if ((ret = __rec_init(
- session, ref, flags, salvage, &session->reconcile)) != 0) {
+ /*
+ * Reconcile the page. The reconciliation code unlocks the page as soon
+ * as possible, and clears page_locked to tell us when it has done so.
+ */
+ ret = __reconcile(session, ref,
+ salvage, flags, lookaside_retryp, &page_locked);
+
+err:
+ if (page_locked)
WT_PAGE_UNLOCK(session, page);
- return (ret);
- }
- r = session->reconcile;
+ if (!no_reconcile_set)
+ F_CLR(session, WT_SESSION_NO_RECONCILE);
+ return (ret);
+}
+/*
+ * __reconcile_save_evict_state --
+ * Save the transaction state that causes history to be pinned, whether
+ * reconciliation succeeds or fails.
+ */
+static void
+__reconcile_save_evict_state(
+ WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_PAGE_MODIFY *mod;
+ uint64_t oldest_id;
+
+ mod = ref->page->modify;
oldest_id = __wt_txn_oldest_id(session);
/*
@@ -136,6 +161,32 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
mod->last_oldest_id = oldest_id;
#endif
+}
+
+/*
+ * __reconcile --
+ * Reconcile an in-memory page into its on-disk format, and write it.
+ */
+static int
+__reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage,
+ uint32_t flags, bool *lookaside_retryp, bool *page_lockedp)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_RECONCILE *r;
+
+ btree = S2BT(session);
+ page = ref->page;
+ mod = page->modify;
+
+ /* Save the eviction state. */
+ __reconcile_save_evict_state(session, ref, flags);
+
+ /* Initialize the reconciliation structure for each new run. */
+ WT_RET(__rec_init(session, ref, flags, salvage, &session->reconcile));
+ r = session->reconcile;
/* Reconcile the page. */
switch (page->type) {
@@ -190,6 +241,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
S2C(session)->txn_global.stable_timestamp;
/* Release the reconciliation lock. */
+ *page_lockedp = false;
WT_PAGE_UNLOCK(session, page);
/* Update statistics. */
@@ -522,7 +574,16 @@ __rec_init(WT_SESSION_IMPL *session,
btree = S2BT(session);
page = ref->page;
- if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+ /*
+ * Reconciliation is not re-entrant; make sure that doesn't happen. Our
+ * caller sets WT_SESSION_IMPL.WT_SESSION_NO_RECONCILE to prevent it, but
+ * re-entry has been a problem in the past, so check to be sure.
+ */
+ r = *(WT_RECONCILE **)reconcilep;
+ if (r != NULL && r->ref != NULL)
+ WT_RET_MSG(session, WT_ERROR, "reconciliation re-entered");
+
+ if (r == NULL) {
WT_RET(__wt_calloc_one(session, &r));
session->reconcile_cleanup = __rec_destroy_session;
@@ -535,9 +596,6 @@ __rec_init(WT_SESSION_IMPL *session,
F_SET(&r->chunkB.image, WT_ITEM_ALIGNED);
}
- /* Reconciliation is not re-entrant, make sure that doesn't happen. */
- WT_ASSERT(session, r->ref == NULL);
-
/* Remember the configuration. */
r->ref = ref;
r->page = page;
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 0469882c08e..53bde4a499b 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -731,11 +731,12 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
} else
upd = op->u.op_upd->next;
/*
- * Skip over any aborted update structures or ones
- * from our own transaction.
+ * Skip over any aborted update structures, internally
+ * created update structures or ones from our own
+ * transaction.
*/
while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
- upd->txnid == txn->id))
+ upd->txnid == WT_TXN_NONE || upd->txnid == txn->id))
upd = upd->next;
/*
diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
index fea9ad0bfe3..40b4c543500 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
@@ -139,8 +139,10 @@ op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp)
/* Close the cursor half the time, otherwise cache it. */
if (__wt_random(rnd) % 2 == 0)
testutil_check(cursor->close(cursor));
- else
+ else {
+ testutil_check(cursor->reset(cursor));
*cpp = cursor;
+ }
(void)__wt_atomic_add64(&worker, 1);
}
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index e9ae4d1e37b..5883b8dc42e 100644..100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -1021,6 +1021,7 @@ tasks:
run_on:
- rhel62-large
commands:
+ - func: "fetch source"
- func: "fetch mongo-tests repo"
- command: shell.exec
params: