Import wiredtiger: d86b3a8a331a1ec478c4ea75ef1b15856b429790 from branch mongodb-4.2

ref: ee1bae2623..d86b3a8a33 for: 4.2.0-rc3 WT-4758 Create a workload that bottlenecks on the eviction server filling eviction queues WT-4821 Update evergreen config to pull correct source for largescale test WT-4875 Fix commit timestamp assert function to consider non transactional tombstones WT-4877 Uninitialized memory being written during checkpoints WT-4878 Disable random dhandle selection and fine tune eviction target calculations WT-4881 Soften the restrictions on re-entering reconciliation
author: Luke Chen <luke.chen@mongodb.com> 2019-07-02 14:01:12 +1000
committer: Luke Chen <luke.chen@mongodb.com> 2019-07-02 14:02:00 +1000
commit: 11a4b5016f134896e088d8c1c96d1c17225a3c86 (patch)
tree: 3385861ec5bc7f5ce74049b8bc6bba3fc91720cf /src/third_party/wiredtiger
parent: cdf7c88be60b287c316beda42b4ff9f197617942 (diff)
download: mongo-11a4b5016f134896e088d8c1c96d1c17225a3c86.tar.gz
14 files changed, 449 insertions, 132 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c
index 549257b5089..18522e3f7e7 100644
--- a/src/third_party/wiredtiger/bench/wtperf/config.c
+++ b/src/third_party/wiredtiger/bench/wtperf/config.c
@@ -735,7 +735,8 @@ config_sanity(WTPERF *wtperf)
 	    ((opts->checkpoint_threads != 0 &&
 	    opts->checkpoint_interval > opts->run_time) ||
 	    opts->report_interval > opts->run_time ||
-	    opts->sample_interval > opts->run_time)) {
+	    opts->sample_interval > opts->run_time ||
+	    opts->scan_interval > opts->run_time)) {
 		fprintf(stderr, "interval value longer than the run-time\n");
 		return (EINVAL);
 	}
@@ -757,6 +758,29 @@ config_sanity(WTPERF *wtperf)
 		return (EINVAL);
 	}
 
+	if (opts->scan_pct > 100) {
+		fprintf(stderr,
+		    "Invalid scan_pct - should be a percentage\n");
+		return (EINVAL);
+	}
+
+	/* If we have separate tables for scanning, we need a separate count. */
+	if ((opts->scan_icount > 0 && opts->scan_table_count == 0) ||
+	    (opts->scan_icount == 0 && opts->scan_table_count > 0)) {
+		fprintf(stderr,
+		    "scan_icount %" PRIu32
+		    " and scan_table_count %" PRIu32
+		    " must both be zero or nonzero.\n",
+		    opts->scan_icount, opts->scan_table_count);
+		return (EINVAL);
+	}
+	if (opts->scan_interval > 0 && opts->icount == 0 &&
+	    opts->scan_icount == 0) {
+		fprintf(stderr,
+		    "Invalid scan_interval - requires icount to be non-zero\n");
+		return (EINVAL);
+	}
+
 	if (opts->value_sz_max < opts->value_sz) {
 		if (F_ISSET(wtperf, CFG_GROW)) {
 			fprintf(stderr, "value_sz_max %" PRIu32
@@ -948,6 +972,7 @@ config_opt_print(WTPERF *wtperf)
 	    opts->checkpoint_threads, opts->checkpoint_interval);
 	printf("\t" "Reporting interval: %" PRIu32 "\n", opts->report_interval);
 	printf("\t" "Sampling interval: %" PRIu32 "\n", opts->sample_interval);
+	printf("\t" "Scan interval: %" PRIu32 "\n", opts->scan_interval);
 
 	printf("\t" "Verbosity: %" PRIu32 "\n", opts->verbose);
 }
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf
new file mode 100644
index 00000000000..9c3dfa10d84
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf
@@ -0,0 +1,24 @@
+# wtperf options file: evict btree configuration
+conn_config="cache_size=40G,checkpoint=(wait=60,log_size=2GB),eviction=(threads_min=12,threads_max=12),log=(enabled=true),session_max=600,eviction_target=60,statistics=(fast),statistics_log=(wait=1,json)"
+# 1B records * (key=12 + value=138) is about 150G total data size
+key_sz=12
+value_sz=138
+log_like_table=true
+table_config="type=file"
+icount=1000000000
+report_interval=5
+run_time=3600
+# Scans every 10 minutes for all the scan specific tables.
+# .4B records * (key=12 + value=138) is about 60G total data size for scan
+# Running on a machine with 64G physical memory, this exhausts both the
+# WT cache and the system cache.
+scan_interval=600
+scan_pct=100
+scan_table_count=20
+scan_icount=400000000
+populate_threads=5
+table_count=100
+threads=((count=400,reads=1),(count=20,inserts=1,throttle=500),(count=10,updates=1,throttle=500))
+# Add throughput/latency monitoring
+max_latency=50000
+sample_interval=5
diff --git a/src/third_party/wiredtiger/bench/wtperf/track.c b/src/third_party/wiredtiger/bench/wtperf/track.c
index ca380703764..3b8832dc6bf 100644
--- a/src/third_party/wiredtiger/bench/wtperf/track.c
+++ b/src/third_party/wiredtiger/bench/wtperf/track.c
@@ -69,6 +69,24 @@ sum_ckpt_ops(WTPERF *wtperf)
 }
 
 /*
+ * Return total scan operations.
+ */
+uint64_t
+sum_scan_ops(WTPERF *wtperf)
+{
+	CONFIG_OPTS *opts;
+	uint64_t total;
+
+	opts = wtperf->opts;
+
+	if (opts->scan_interval > 0)
+		total = wtperf->scanthreads->scan.ops;
+	else
+		total = 0;
+	return (total);
+}
+
+/*
  * Return total operations count for the worker threads.
  */
 static uint64_t
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index ecbd91fe8cc..dc3bd4f891f 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -41,6 +41,7 @@ static WT_THREAD_RET monitor(void *);
 static WT_THREAD_RET populate_thread(void *);
 static void	 randomize_value(WTPERF_THREAD *, char *);
 static void	 recreate_dir(const char *);
+static WT_THREAD_RET scan_worker(void *);
 static int	 start_all_runs(WTPERF *);
 static int	 start_run(WTPERF *);
 static void	 start_threads(WTPERF *, WORKLOAD *,
@@ -119,6 +120,15 @@ randomize_value(WTPERF_THREAD *thread, char *value_buf)
 static uint32_t
 map_key_to_table(CONFIG_OPTS *opts, uint64_t k)
 {
+	/*
+	 * The first part of the key range is reserved for dedicated
+	 * scan tables, if any.  The scan tables do not grow, but the
+	 * rest of the key space may.
+	 */
+	if (k < opts->scan_icount)
+		return ((uint32_t)
+		    (opts->table_count + k % opts->scan_table_count));
+	k -= opts->scan_icount;
 	if (opts->range_partition) {
 		/* Take care to return a result in [0..table_count-1]. */
 		if (k > opts->icount + opts->random_range)
@@ -362,6 +372,7 @@ worker_async(void *arg)
 				continue;
 			break;
 		default:
+			lprintf(wtperf, 0, 0, "invalid op!");
 			goto err;		/* can't happen */
 		}
 
@@ -376,8 +387,10 @@ worker_async(void *arg)
 		    wtperf->uris[map_key_to_table(wtperf->opts, next_val)],
 		    NULL, &cb, &asyncop)) == EBUSY)
 			(void)usleep(10000);
-		if (ret != 0)
+		if (ret != 0) {
+			lprintf(wtperf, ret, 0, "failed async_new_op");
 			goto err;
+		}
 
 		asyncop->app_private = thread;
 		asyncop->set_key(asyncop, key_buf);
@@ -513,6 +526,7 @@ worker(void *arg)
 	WT_CURSOR **cursors, *cursor, *log_table_cursor, *tmp_cursor;
 	WT_SESSION *session;
 	size_t i;
+	uint32_t total_table_count;
 	int64_t ops, ops_per_txn;
 	uint64_t log_id, next_val, usecs;
 	uint8_t *op, *op_end;
@@ -570,8 +584,9 @@ worker(void *arg)
 			goto err;
 		}
 	} else {
-		cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *));
-		for (i = 0; i < opts->table_count; i++) {
+		total_table_count = opts->table_count + opts->scan_table_count;
+		cursors = dcalloc(total_table_count, sizeof(WT_CURSOR *));
+		for (i = 0; i < total_table_count; i++) {
 			if ((ret = session->open_cursor(session,
 			    wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) {
 				lprintf(wtperf, ret, 0,
@@ -669,7 +684,6 @@ worker(void *arg)
 			__wt_epoch(NULL, &start);
 
 		cursor->set_key(cursor, key_buf);
-
 		switch (*op) {
 		case WORKER_READ:
 			/*
@@ -764,7 +778,8 @@ worker(void *arg)
 			if (ret == WT_NOTFOUND)
 				break;
 
-op_err:			if (ret == WT_ROLLBACK && ops_per_txn != 0) {
+op_err:			if (ret == WT_ROLLBACK &&
+			    (ops_per_txn != 0 || opts->log_like_table)) {
 				/*
 				 * If we are running with explicit transactions
 				 * configured and we hit a WT_ROLLBACK, then we
@@ -1031,7 +1046,7 @@ populate_thread(void *arg)
 	WT_SESSION *session;
 	size_t i;
 	uint64_t op, usecs;
-	uint32_t opcount;
+	uint32_t opcount, total_table_count;
 	int intxn, measure_latency, ret, stress_checkpoint_due;
 	char *value_buf, *key_buf;
 	const char *cursor_config;
@@ -1044,6 +1059,7 @@ populate_thread(void *arg)
 	cursors = NULL;
 	ret = stress_checkpoint_due = 0;
 	trk = &thread->insert;
+	total_table_count = opts->table_count + opts->scan_table_count;
 
 	key_buf = thread->key_buf;
 	value_buf = thread->value_buf;
@@ -1058,8 +1074,8 @@ populate_thread(void *arg)
 	cursor_config =
 	    (opts->populate_threads == 1 && !opts->index) ? "bulk" : NULL;
 	/* Create the cursors. */
-	cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *));
-	for (i = 0; i < opts->table_count; i++) {
+	cursors = dcalloc(total_table_count, sizeof(WT_CURSOR *));
+	for (i = 0; i < total_table_count; i++) {
 		if ((ret = session->open_cursor(
 		    session, wtperf->uris[i], NULL,
 		    cursor_config, &cursors[i])) != 0) {
@@ -1073,7 +1089,7 @@ populate_thread(void *arg)
 	/* Populate the databases. */
 	for (intxn = 0, opcount = 0;;) {
 		op = get_next_incr(wtperf);
-		if (op > opts->icount)
+		if (op > (uint64_t)opts->icount + (uint64_t)opts->scan_icount)
 			break;
 
 		if (opts->populate_ops_per_txn != 0 && !intxn) {
@@ -1166,7 +1182,6 @@ populate_thread(void *arg)
 err:		wtperf->error = wtperf->stop = true;
 	}
 	free(cursors);
-
 	return (WT_THREAD_RET_VALUE);
 }
 
@@ -1216,7 +1231,7 @@ populate_async(void *arg)
 	/* Populate the databases. */
 	for (;;) {
 		op = get_next_incr(wtperf);
-		if (op > opts->icount)
+		if (op > (uint64_t)opts->icount + (uint64_t)opts->scan_icount)
 			break;
 		/*
 		 * Allocate an async op for whichever table.
@@ -1225,8 +1240,10 @@ populate_async(void *arg)
 		    conn, wtperf->uris[map_key_to_table(wtperf->opts, op)],
 		    NULL, &cb, &asyncop)) == EBUSY)
 			(void)usleep(10000);
-		if (ret != 0)
+		if (ret != 0) {
+			lprintf(wtperf, ret, 0, "Failed async_new_op");
 			goto err;
+		}
 
 		asyncop->app_private = thread;
 		generate_key(opts, key_buf, op);
@@ -1248,8 +1265,10 @@ populate_async(void *arg)
 	 * async_flush and those calls will convoy.  That is not the
 	 * most efficient way, but we want to flush before measuring latency.
 	 */
-	if (conn->async_flush(conn) != 0)
+	if (conn->async_flush(conn) != 0) {
+		lprintf(wtperf, ret, 0, "Failed async flush");
 		goto err;
+	}
 	if (measure_latency) {
 		__wt_epoch(NULL, &stop);
 		++trk->latency_ops;
@@ -1327,6 +1346,7 @@ monitor(void *arg)
 	    "insert ops per second,"
 	    "update ops per second,"
 	    "checkpoints,"
+	    "scans,"
 	    "read average latency(uS),"
 	    "read minimum latency(uS),"
 	    "read maximum latency(uS),"
@@ -1378,7 +1398,7 @@ monitor(void *arg)
 		(void)fprintf(fp,
 		    "%s,%" PRIu32
 		    ",%" PRIu64 ",%" PRIu64 ",%" PRIu64
-		    ",%c"
+		    ",%c,%c"
 		    ",%" PRIu32 ",%" PRIu32 ",%" PRIu32
 		    ",%" PRIu32 ",%" PRIu32 ",%" PRIu32
 		    ",%" PRIu32 ",%" PRIu32 ",%" PRIu32
@@ -1386,6 +1406,7 @@ monitor(void *arg)
 		    buf, wtperf->totalsec,
 		    cur_reads, cur_inserts, cur_updates,
 		    wtperf->ckpt ? 'Y' : 'N',
+		    wtperf->scan ? 'Y' : 'N',
 		    read_avg, read_min, read_max,
 		    insert_avg, insert_min, insert_max,
 		    update_avg, update_min, update_max);
@@ -1547,6 +1568,141 @@ err:		wtperf->error = wtperf->stop = true;
 	return (WT_THREAD_RET_VALUE);
 }
 
+static WT_THREAD_RET
+scan_worker(void *arg)
+{
+	CONFIG_OPTS *opts;
+	WTPERF *wtperf;
+	WTPERF_THREAD *thread;
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor, **cursors;
+	WT_SESSION *session;
+	const char *uri;
+	char *key_buf;
+	struct timespec e, s;
+	uint32_t i, ntables, pct, table_start;
+	uint64_t cur_id, end_id, incr, items, start_id, tot_items;
+	int ret;
+
+	thread = (WTPERF_THREAD *)arg;
+	key_buf = thread->key_buf;
+	wtperf = thread->wtperf;
+	opts = wtperf->opts;
+	conn = wtperf->conn;
+	session = NULL;
+	cursors = NULL;
+	items = 0;
+	uri = NULL;
+
+	/*
+	 * Figure out how many items we should scan.
+	 * We base the percentage on the icount.
+	 */
+	pct = opts->scan_pct == 0 ? 100 : opts->scan_pct;
+	start_id = cur_id = 1;
+
+	/*
+	 * When we scan the tables, we will increment the key by an amount
+	 * that causes us to visit each table in order, and jump ahead in
+	 * the key space when returning to a table. By doing this, we don't
+	 * repeat keys until we visit them all, but we don't visit keys in
+	 * sequential order.  This might better emulate the access pattern
+	 * to a main table when an index is scanned, or a more complex query
+	 * is performed.
+	 */
+	if (opts->scan_icount != 0) {
+		end_id = opts->scan_icount;
+		tot_items = (opts->scan_icount * pct) / 100;
+		incr = opts->scan_table_count * 1000 + 1;
+		table_start = opts->table_count;
+		ntables = opts->scan_table_count;
+	} else {
+		end_id = opts->icount;
+		tot_items = (opts->icount * pct) / 100;
+		incr = opts->table_count * 1000 + 1;
+		table_start = 0;
+		ntables = opts->table_count;
+	}
+	if ((ret = conn->open_session(
+	    conn, NULL, opts->sess_config, &session)) != 0) {
+		lprintf(wtperf, ret, 0,
+		    "open_session failed in scan thread.");
+		goto err;
+	}
+	cursors = dmalloc(ntables * sizeof(WT_CURSOR *));
+	for (i = 0; i < ntables; i++)
+		if ((ret = session->open_cursor(
+		    session, wtperf->uris[i + table_start], NULL, NULL,
+		    &cursors[i])) != 0) {
+			lprintf(wtperf, ret, 0,
+			    "open_cursor failed in scan thread.");
+			goto err;
+		}
+
+	while (!wtperf->stop) {
+		/* Break the sleep up, so we notice interrupts faster. */
+		for (i = 0; i < opts->scan_interval; i++) {
+			sleep(1);
+			if (wtperf->stop)
+				break;
+		}
+		/* If the workers are done, don't bother with a final call. */
+		if (wtperf->stop)
+			break;
+
+		__wt_epoch(NULL, &s);
+
+		wtperf->scan = true;
+		items = 0;
+		while (items < tot_items && !wtperf->stop) {
+			cursor = cursors[map_key_to_table(opts, cur_id) -
+			    table_start];
+			generate_key(opts, key_buf, cur_id);
+			cursor->set_key(cursor, key_buf);
+			if ((ret = cursor->search(cursor)) != 0) {
+				lprintf(wtperf, ret, 0, "Failed scan search "
+				    "key %s, items %d", key_buf, (int)items);
+				goto err;
+			}
+
+			items++;
+			cur_id += incr;
+			if (cur_id >= end_id) {
+				/*
+				 * Continue with the next slice of the key
+				 * space.
+				 */
+				cur_id = ++start_id;
+				if (cur_id >= end_id)
+					cur_id = start_id = 1;
+			}
+		}
+		wtperf->scan = false;
+		if (ret == 0)
+			++thread->scan.ops;
+		else {
+			lprintf(wtperf, ret, 0, "Scan operation failed for %s.",
+			    uri);
+			goto err;
+		}
+		__wt_epoch(NULL, &e);
+	}
+
+	if (session != NULL &&
+	    ((ret = session->close(session, NULL)) != 0)) {
+		lprintf(wtperf, ret, 0,
+		    "Error closing session in scan worker.");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		wtperf->error = wtperf->stop = true;
+	}
+	free(cursors);
+	return (WT_THREAD_RET_VALUE);
+}
+
 static int
 execute_populate(WTPERF *wtperf)
 {
@@ -1556,18 +1712,19 @@ execute_populate(WTPERF *wtperf)
 	WTPERF_THREAD *popth;
 	WT_THREAD_CALLBACK(*pfunc)(void *);
 	size_t i;
-	uint64_t last_ops, msecs, print_ops_sec;
+	uint64_t last_ops, msecs, print_ops_sec, max_key;
 	uint32_t interval, tables;
 	wt_thread_t idle_table_cycle_thread;
 	double print_secs;
 	int elapsed, ret;
 
 	opts = wtperf->opts;
+	max_key = (uint64_t)opts->icount + (uint64_t)opts->scan_icount;
 
 	lprintf(wtperf, 0, 1,
 	    "Starting %" PRIu32
-	    " populate thread(s) for %" PRIu32 " items",
-	    opts->populate_threads, opts->icount);
+	    " populate thread(s) for %" PRIu64 " items",
+	    opts->populate_threads, max_key);
 
 	/* Start cycling idle tables if configured. */
 	start_idle_table_cycle(wtperf, &idle_table_cycle_thread);
@@ -1587,7 +1744,7 @@ execute_populate(WTPERF *wtperf)
 
 	__wt_epoch(NULL, &start);
 	for (elapsed = 0, interval = 0, last_ops = 0;
-	    wtperf->insert_key < opts->icount && !wtperf->error;) {
+	     wtperf->insert_key < max_key && !wtperf->error;) {
 		/*
 		 * Sleep for 100th of a second, report_interval is in second
 		 * granularity, each 100th increment of elapsed is a single
@@ -1751,8 +1908,8 @@ execute_workload(WTPERF *wtperf)
 	WT_SESSION **sessions;
 	WT_THREAD_CALLBACK(*pfunc)(void *);
 	wt_thread_t idle_table_cycle_thread;
-	uint64_t last_ckpts, last_inserts, last_reads, last_truncates;
-	uint64_t last_updates;
+	uint64_t last_ckpts, last_scans;
+	uint64_t last_inserts, last_reads, last_truncates, last_updates;
 	uint32_t interval, run_ops, run_time;
 	u_int i;
 	int ret;
@@ -1763,8 +1920,8 @@ execute_workload(WTPERF *wtperf)
 	wtperf->insert_ops = wtperf->read_ops = wtperf->truncate_ops = 0;
 	wtperf->update_ops = 0;
 
-	last_ckpts = last_inserts = last_reads = last_truncates = 0;
-	last_updates = 0;
+	last_ckpts = last_scans = 0;
+	last_inserts = last_reads = last_truncates = last_updates = 0;
 	ret = 0;
 
 	sessions = NULL;
@@ -1844,6 +2001,7 @@ execute_workload(WTPERF *wtperf)
 
 		/* Sum the operations we've done. */
 		wtperf->ckpt_ops = sum_ckpt_ops(wtperf);
+		wtperf->scan_ops = sum_scan_ops(wtperf);
 		wtperf->insert_ops = sum_insert_ops(wtperf);
 		wtperf->read_ops = sum_read_ops(wtperf);
 		wtperf->update_ops = sum_update_ops(wtperf);
@@ -1863,18 +2021,21 @@ execute_workload(WTPERF *wtperf)
 		lprintf(wtperf, 0, 1,
 		    "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64
 		    " updates, %" PRIu64 " truncates, %" PRIu64
-		    " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)",
+		    " checkpoints, %" PRIu64 " scans in %" PRIu32
+		    " secs (%" PRIu32 " total secs)",
 		    wtperf->read_ops - last_reads,
 		    wtperf->insert_ops - last_inserts,
 		    wtperf->update_ops - last_updates,
 		    wtperf->truncate_ops - last_truncates,
 		    wtperf->ckpt_ops - last_ckpts,
+		    wtperf->scan_ops - last_scans,
 		    opts->report_interval, wtperf->totalsec);
 		last_reads = wtperf->read_ops;
 		last_inserts = wtperf->insert_ops;
 		last_updates = wtperf->update_ops;
 		last_truncates = wtperf->truncate_ops;
 		last_ckpts = wtperf->ckpt_ops;
+		last_scans = wtperf->scan_ops;
 	}
 
 	/* Notify the worker threads they are done. */
@@ -1971,16 +2132,17 @@ create_uris(WTPERF *wtperf)
 {
 	CONFIG_OPTS *opts;
 	size_t len;
-	uint32_t i;
+	uint32_t i, total_table_count;
 
 	opts = wtperf->opts;
 
-	wtperf->uris = dcalloc(opts->table_count, sizeof(char *));
+	total_table_count = opts->table_count + opts->scan_table_count;
+	wtperf->uris = dcalloc(total_table_count, sizeof(char *));
 	len = strlen("table:") + strlen(opts->table_name) + 20;
-	for (i = 0; i < opts->table_count; i++) {
+	for (i = 0; i < total_table_count; i++) {
 		/* If there is only one table, just use the base name. */
 		wtperf->uris[i] = dmalloc(len);
-		if (opts->table_count == 1)
+		if (total_table_count == 1)
 			testutil_check(__wt_snprintf(wtperf->uris[i],
 			    len, "table:%s", opts->table_name));
 		else
@@ -2003,6 +2165,7 @@ create_tables(WTPERF *wtperf)
 	WT_SESSION *session;
 	size_t i;
 	int ret;
+	uint32_t total_table_count;
 	char buf[512];
 
 	opts = wtperf->opts;
@@ -2030,7 +2193,8 @@ create_tables(WTPERF *wtperf)
 		return (ret);
 	}
 
-	for (i = 0; i < opts->table_count; i++) {
+	total_table_count = opts->table_count + opts->scan_table_count;
+	for (i = 0; i < total_table_count; i++) {
 		if (opts->log_partial && i > 0) {
 			if (((ret = session->create(session,
 			    wtperf->uris[i], wtperf->partial_config)) != 0)) {
@@ -2075,8 +2239,10 @@ wtperf_copy(const WTPERF *src, WTPERF **retp)
 	CONFIG_OPTS *opts;
 	WTPERF *dest;
 	size_t i;
+	uint32_t total_table_count;
 
 	opts = src->opts;
+	total_table_count = opts->table_count + opts->scan_table_count;
 
 	dest = dcalloc(1, sizeof(WTPERF));
 
@@ -2091,8 +2257,8 @@ wtperf_copy(const WTPERF *src, WTPERF **retp)
 		dest->reopen_config = dstrdup(src->reopen_config);
 
 	if (src->uris != NULL) {
-		dest->uris = dcalloc(opts->table_count, sizeof(char *));
-		for (i = 0; i < opts->table_count; i++)
+		dest->uris = dcalloc(total_table_count, sizeof(char *));
+		for (i = 0; i < total_table_count; i++)
 			dest->uris[i] = dstrdup(src->uris[i]);
 	}
 
@@ -2100,6 +2266,7 @@ wtperf_copy(const WTPERF *src, WTPERF **retp)
 		dest->async_config = dstrdup(src->async_config);
 
 	dest->ckptthreads = NULL;
+	dest->scanthreads = NULL;
 	dest->popthreads = NULL;
 
 	dest->workers = NULL;
@@ -2137,7 +2304,7 @@ wtperf_free(WTPERF *wtperf)
 	free(wtperf->log_table_uri);
 
 	if (wtperf->uris != NULL) {
-		for (i = 0; i < opts->table_count; i++)
+		for (i = 0; i < opts->table_count + opts->scan_table_count; i++)
 			free(wtperf->uris[i]);
 		free(wtperf->uris);
 	}
@@ -2145,6 +2312,7 @@ wtperf_free(WTPERF *wtperf)
 	free(wtperf->async_config);
 
 	free(wtperf->ckptthreads);
+	free(wtperf->scanthreads);
 	free(wtperf->popthreads);
 
 	free(wtperf->workers);
@@ -2336,10 +2504,19 @@ start_run(WTPERF *wtperf)
 			    "Starting %" PRIu32 " checkpoint thread(s)",
 			    opts->checkpoint_threads);
 			wtperf->ckptthreads = dcalloc(
-			     opts->checkpoint_threads, sizeof(WTPERF_THREAD));
+			    opts->checkpoint_threads, sizeof(WTPERF_THREAD));
 			start_threads(wtperf, NULL, wtperf->ckptthreads,
 			    opts->checkpoint_threads, checkpoint_worker);
 		}
+		/* Start the scan thread. */
+		if (opts->scan_interval != 0) {
+			lprintf(wtperf, 0, 1,
+			    "Starting 1 scan thread");
+			wtperf->scanthreads = dcalloc(
+			    1, sizeof(WTPERF_THREAD));
+			start_threads(wtperf, NULL, wtperf->scanthreads,
+			    1, scan_worker);
+		}
 		if (opts->pre_load_data)
 			pre_load_data(wtperf);
 
@@ -2353,6 +2530,7 @@ start_run(WTPERF *wtperf)
 		wtperf->truncate_ops = sum_truncate_ops(wtperf);
 		wtperf->update_ops = sum_update_ops(wtperf);
 		wtperf->ckpt_ops = sum_ckpt_ops(wtperf);
+		wtperf->scan_ops = sum_scan_ops(wtperf);
 		total_ops =
 		    wtperf->read_ops + wtperf->insert_ops + wtperf->update_ops;
 
@@ -2381,6 +2559,9 @@ start_run(WTPERF *wtperf)
 		lprintf(wtperf, 0, 1,
 		    "Executed %" PRIu64 " checkpoint operations",
 		    wtperf->ckpt_ops);
+		lprintf(wtperf, 0, 1,
+		    "Executed %" PRIu64 " scan operations",
+		    wtperf->scan_ops);
 
 		latency_print(wtperf);
 	}
@@ -2394,6 +2575,7 @@ err:		if (ret == 0)
 	wtperf->stop = true;
 
 	stop_threads(1, wtperf->ckptthreads);
+	stop_threads(1, wtperf->scanthreads);
 
 	if (monitor_created != 0)
 		testutil_check(__wt_thread_join(NULL, &monitor_thread));
@@ -2794,9 +2976,11 @@ start_threads(WTPERF *wtperf, WORKLOAD *workp,
 		 * for latency measurements, for the same reason.
 		 */
 		thread->ckpt.min_latency =
+		thread->scan.min_latency =
 		thread->insert.min_latency = thread->read.min_latency =
 		thread->update.min_latency = UINT32_MAX;
-		thread->ckpt.max_latency = thread->insert.max_latency =
+		thread->ckpt.max_latency = thread->scan.max_latency =
+		thread->insert.max_latency =
 		thread->read.max_latency = thread->update.max_latency = 0;
 	}
 
@@ -2852,10 +3036,12 @@ drop_all_tables(WTPERF *wtperf)
 	CONFIG_OPTS *opts;
 	WT_SESSION *session;
 	size_t i;
+	uint32_t total_table_count;
 	uint64_t msecs;
 	int ret, t_ret;
 
 	opts = wtperf->opts;
+	total_table_count = opts->table_count + opts->scan_table_count;
 
 	/* Drop any tables. */
 	if ((ret = wtperf->conn->open_session(
@@ -2865,7 +3051,7 @@ drop_all_tables(WTPERF *wtperf)
 		return (ret);
 	}
 	__wt_epoch(NULL, &start);
-	for (i = 0; i < opts->table_count; i++) {
+	for (i = 0; i < total_table_count; i++) {
 		if ((ret =
 		    session->drop(session, wtperf->uris[i], NULL)) != 0) {
 			lprintf(wtperf, ret, 0,
@@ -2877,7 +3063,7 @@ drop_all_tables(WTPERF *wtperf)
 	msecs = WT_TIMEDIFF_MS(stop, start);
 	lprintf(wtperf, 0, 1,
 	    "Executed %" PRIu32 " drop operations average time %" PRIu64 "ms",
-	    opts->table_count, msecs / opts->table_count);
+	    total_table_count, msecs / total_table_count);
 
 err:	if ((t_ret = session->close(session, NULL)) != 0 && ret == 0)
 		ret = t_ret;
@@ -2888,18 +3074,20 @@ static uint64_t
 wtperf_value_range(WTPERF *wtperf)
 {
 	CONFIG_OPTS *opts;
+	uint64_t total_icount;
 
 	opts = wtperf->opts;
+	total_icount = (uint64_t)opts->scan_icount + (uint64_t)opts->icount;
 
 	if (opts->random_range)
-		return (opts->icount + opts->random_range);
+		return (total_icount + opts->random_range);
 	/*
 	 * It is legal to configure a zero size populate phase, hide that
 	 * from other code by pretending the range is 1 in that case.
 	 */
-	if (opts->icount + wtperf->insert_key == 0)
+	if (total_icount + wtperf->insert_key == 0)
 		return (1);
-	return (opts->icount +
+	return (total_icount +
 	    wtperf->insert_key - (u_int)(wtperf->workers_cnt + 1));
 }
 
@@ -2910,12 +3098,15 @@ wtperf_rand(WTPERF_THREAD *thread)
 	WT_CURSOR *rnd_cursor;
 	WTPERF *wtperf;
 	double S1, S2, U;
-	uint64_t rval;
+	uint64_t end_range, range, rval, start_range;
 	int ret;
 	char *key_buf;
 
 	wtperf = thread->wtperf;
 	opts = wtperf->opts;
+	end_range = wtperf_value_range(wtperf);
+	start_range = opts->scan_icount;
+	range = end_range - start_range;
 
 	/*
 	 * If we have a random cursor set up then use it.
@@ -2953,7 +3144,7 @@ wtperf_rand(WTPERF_THREAD *thread)
 	if (opts->pareto != 0) {
 #define	PARETO_SHAPE	1.5
 		S1 = (-1 / PARETO_SHAPE);
-		S2 = wtperf_value_range(wtperf) *
+		S2 = range *
 		    (opts->pareto / 100.0) * (PARETO_SHAPE - 1);
 		U = 1 - (double)rval / (double)UINT32_MAX;
 		rval = (uint64_t)((pow(U, S1) - 1) * S2);
@@ -2962,13 +3153,13 @@ wtperf_rand(WTPERF_THREAD *thread)
 		 * 2% of the time, from my testing. That will lead to the
 		 * first item in the table being "hot".
 		 */
-		if (rval > wtperf_value_range(wtperf))
+		if (rval > end_range)
 			rval = 0;
 	}
 	/*
 	 * Wrap the key to within the expected range and avoid zero: we never
 	 * insert that key.
 	 */
-	rval = (rval % wtperf_value_range(wtperf)) + 1;
-	return (rval);
+	rval = (rval % range) + 1;
+	return (start_range + rval);
 }
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
index 7e43a62459a..e5163409b4e 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
@@ -131,6 +131,7 @@ struct __wtperf {			/* Per-database structure */
 
 	WTPERF_THREAD *ckptthreads;	/* Checkpoint threads */
 	WTPERF_THREAD *popthreads;	/* Populate threads */
+	WTPERF_THREAD *scanthreads;	/* Scan threads */
 
 #define	WORKLOAD_MAX	50
 	WTPERF_THREAD	*workers;	/* Worker threads */
@@ -141,6 +142,7 @@ struct __wtperf {			/* Per-database structure */
 
 	/* State tracking variables. */
 	uint64_t ckpt_ops;		/* checkpoint operations */
+	uint64_t scan_ops;		/* scan operations */
 	uint64_t insert_ops;		/* insert operations */
 	uint64_t read_ops;		/* read operations */
 	uint64_t truncate_ops;		/* truncate operations */
@@ -150,6 +152,7 @@ struct __wtperf {			/* Per-database structure */
 	uint64_t log_like_table_key;	/* used to allocate IDs for log table */
 
 	volatile bool ckpt;		/* checkpoint in progress */
+	volatile bool scan;		/* scan in progress */
 	volatile bool error;		/* thread error */
 	volatile bool stop;		/* notify threads to stop */
 	volatile bool in_warmup;	/* running warmup phase */
@@ -245,6 +248,7 @@ struct __wtperf_thread {		/* Per-thread structure */
 	TRACK ckpt;			/* Checkpoint operations */
 	TRACK insert;			/* Insert operations */
 	TRACK read;			/* Read operations */
+	TRACK scan;			/* Scan operations */
 	TRACK update;			/* Update operations */
 	TRACK truncate;			/* Truncate operations */
 	TRACK truncate_sleep;		/* Truncate sleep operations */
@@ -273,6 +277,7 @@ void	 start_idle_table_cycle(WTPERF *, wt_thread_t *);
 void	 stop_idle_table_cycle(WTPERF *, wt_thread_t);
 void	 worker_throttle(WTPERF_THREAD *);
 uint64_t sum_ckpt_ops(WTPERF *);
+uint64_t sum_scan_ops(WTPERF *);
 uint64_t sum_insert_ops(WTPERF *);
 uint64_t sum_pop_ops(WTPERF *);
 uint64_t sum_read_ops(WTPERF *);
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
index d312ee8526d..079c419908f 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
@@ -172,6 +172,16 @@ DEF_OPT_AS_UINT32(sample_interval, 0,
 DEF_OPT_AS_UINT32(sample_rate, 50,
     "how often the latency of operations is measured. One for every operation,"
     "two for every second operation, three for every third operation etc.")
+DEF_OPT_AS_UINT32(scan_icount, 0,
+    "number of records in scan tables to populate")
+DEF_OPT_AS_UINT32(scan_interval, 0,
+    "scan tables every interval seconds during the workload phase,"
+    " 0 to disable")
+DEF_OPT_AS_UINT32(scan_pct, 10,
+    "percentage of entire data set scanned, if scan_interval is enabled")
+DEF_OPT_AS_UINT32(scan_table_count, 0,
+    "number of separate tables to be used for scanning. Zero indicates "
+    "that tables are shared with other operations")
 DEF_OPT_AS_CONFIG_STRING(sess_config, "", "session configuration string")
 DEF_OPT_AS_UINT32(session_count_idle, 0,
     "number of idle sessions to create. Default 0.")
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 9809bf4e591..786a70c3e62 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
 {
-    "commit": "ee1bae262347285f46b5c56cc0490d20b9ee9c98", 
+    "commit": "d86b3a8a331a1ec478c4ea75ef1b15856b429790", 
     "github": "wiredtiger/wiredtiger.git", 
     "vendor": "wiredtiger", 
     "branch": "mongodb-4.2"
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
index 57934dd0422..0ab4ea72318 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
@@ -91,6 +91,8 @@ __wt_block_checkpoint_final(WT_SESSION_IMPL *session,
 	 */
 	size = buf->size + WT_INTPACK64_MAXSIZE;
 	WT_RET(__wt_buf_extend(session, buf, size));
+	p = (uint8_t *)buf->mem + buf->size;
+	memset(p, 0, WT_INTPACK64_MAXSIZE);
 	file_size_offset = buf->size;
 	buf->size = size;
 
diff --git a/src/third_party/wiredtiger/src/docs/wtperf.dox b/src/third_party/wiredtiger/src/docs/wtperf.dox
index 8aa95c5b635..2525a77d62b 100644
--- a/src/third_party/wiredtiger/src/docs/wtperf.dox
+++ b/src/third_party/wiredtiger/src/docs/wtperf.dox
@@ -202,6 +202,14 @@ total workload seconds
 performance logging every interval seconds, 0 to disable
 @par sample_rate (unsigned int, default=50)
 how often the latency of operations is measured. One for every operation, two for every second operation, three for every third operation etc.
+@par scan_icount (unsigned int, default=0)
+number of records in scan tables to populate
+@par scan_interval (unsigned int, default=0)
+scan tables every interval seconds during the workload phase,  0 to disable
+@par scan_pct (unsigned int, default=10)
+percentage of entire data set scanned, if scan_interval is enabled
+@par scan_table_count (unsigned int, default=0)
+number of separate tables to be used for scanning. Zero indicates  that tables are shared with other operations
 @par sess_config (string, default="")
 session configuration string
 @par session_count_idle (unsigned int, default=0)
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 397306e6aa6..931216376b9 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -18,7 +18,7 @@ static int  __evict_server(WT_SESSION_IMPL *, bool *);
 static void __evict_tune_workers(WT_SESSION_IMPL *session);
 static int  __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *);
 static int  __evict_walk_tree(
-    WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *, uint64_t *);
+    WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *);
 
 #define	WT_EVICT_HAS_WORKERS(s)				\
 	(S2C(s)->evict_threads.current_threads > 1)
@@ -1399,19 +1399,19 @@ __evict_walk_choose_dhandle(
 	u_int dh_bucket_count, rnd_bucket, rnd_dh;
 
 	conn = S2C(session);
-	*dhandle_p = NULL;
 
 	WT_ASSERT(session, __wt_rwlock_islocked(session, &conn->dhandle_lock));
 
-	/* Nothing to do if the dhandle list is empty. */
-	if (TAILQ_EMPTY(&conn->dhqh))
-		return;
+#undef RANDOM_DH_SELECTION_ENABLED
+
+#ifdef RANDOM_DH_SELECTION_ENABLED
+	*dhandle_p = NULL;
 
 	/*
-	 * If we do not have a lot of dhandles, most hash buckets will be empty.
+	 * If we don't have many dhandles, most hash buckets will be empty.
 	 * Just pick a random dhandle from the list in that case.
 	 */
-	if (conn->dhandle_count < 10 * WT_HASH_ARRAY_SIZE) {
+	if (conn->dhandle_count < WT_HASH_ARRAY_SIZE / 4) {
 		rnd_dh = __wt_random(&session->rnd) % conn->dhandle_count;
 		dhandle = TAILQ_FIRST(&conn->dhqh);
 		for (; rnd_dh > 0; rnd_dh--)
@@ -1435,6 +1435,18 @@ __evict_walk_choose_dhandle(
 	dhandle = TAILQ_FIRST(&conn->dhhash[rnd_bucket]);
 	for (; rnd_dh > 0; rnd_dh--)
 		dhandle = TAILQ_NEXT(dhandle, hashq);
+#else
+	/* Just step through dhandles. */
+	dhandle = *dhandle_p;
+	if (dhandle != NULL)
+		dhandle = TAILQ_NEXT(dhandle, q);
+	if (dhandle == NULL)
+		dhandle = TAILQ_FIRST(&conn->dhqh);
+
+	WT_UNUSED(dh_bucket_count);
+	WT_UNUSED(rnd_bucket);
+	WT_UNUSED(rnd_dh);
+#endif
 
 	*dhandle_p = dhandle;
 }
@@ -1452,9 +1464,8 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
 	WT_DATA_HANDLE *dhandle;
 	WT_DECL_RET;
 	WT_TRACK_OP_DECL;
-	uint64_t loop_count;
-	uint64_t pages_seen_file, pages_seen_interim, pages_seen_total;
-	u_int max_entries, retries, slot, start_slot, total_candidates;
+	u_int loop_count, max_entries, retries, slot, start_slot;
+	u_int total_candidates;
 	bool dhandle_locked, incr;
 
 	WT_TRACK_OP_INIT(session);
@@ -1480,31 +1491,14 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
 	total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
 	    __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf);
 	max_entries = WT_MIN(max_entries, 1 + total_candidates / 2);
-	pages_seen_interim = pages_seen_total = 0;
 
 retry:	loop_count = 0;
-	while (slot < max_entries) {
-		loop_count++;
-
+	while (slot < max_entries && loop_count++ < conn->dhandle_count) {
 		/* We're done if shutting down or reconfiguring. */
 		if (F_ISSET(conn, WT_CONN_CLOSING) ||
 		    F_ISSET(conn, WT_CONN_RECONFIGURING))
 			break;
 
-		/* If we have seen enough pages in this walk, we're done. */
-		if (pages_seen_total > WT_EVICT_WALK_INCR * 100)
-			break;
-
-		/*
-		 * If we are not finding pages at all, we're done.
-		 * Every 100th iteration, check if we made progress.
-		 */
-		if (loop_count % 100 == 0) {
-			if (pages_seen_interim == pages_seen_total)
-				break;
-			pages_seen_interim = pages_seen_total;
-		}
-
 		/*
 		 * If another thread is waiting on the eviction server to clear
 		 * the walk point in a tree, give up.
@@ -1620,9 +1614,8 @@ retry:	loop_count = 0;
 				 */
 				cache->walk_tree = dhandle;
 				WT_WITH_DHANDLE(session, dhandle,
-				    ret = __evict_walk_tree(session, queue,
-					max_entries, &slot, &pages_seen_file));
-				pages_seen_total += pages_seen_file;
+				    ret = __evict_walk_tree(
+				    session, queue, max_entries, &slot));
 
 				WT_ASSERT(session, __wt_session_gen(
 				    session, WT_GEN_SPLIT) == 0);
@@ -1713,22 +1706,14 @@ __evict_push_candidate(WT_SESSION_IMPL *session,
  *	Calculate how many pages to queue for a given tree.
  */
 static uint32_t
-__evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
+__evict_walk_target(WT_SESSION_IMPL *session)
 {
 	WT_CACHE *cache;
 	uint64_t btree_inuse, bytes_per_slot, cache_inuse;
 	uint32_t target_pages_clean, target_pages_dirty, target_pages;
-	uint32_t total_slots;
 
 	cache = S2C(session)->cache;
 	target_pages_clean = target_pages_dirty = 0;
-	total_slots = max_entries;
-
-	/*
-	 * The number of times we should fill the queue by the end of
-	 * considering all trees.
-	 */
-#define	QUEUE_FILLS_PER_PASS	10
 
 	/*
 	 * The minimum number of pages we should consider per tree.
@@ -1744,7 +1729,7 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
 	if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
 		btree_inuse = __wt_btree_bytes_evictable(session);
 		cache_inuse = __wt_cache_bytes_inuse(cache);
-		bytes_per_slot = 1 + cache_inuse / total_slots;
+		bytes_per_slot = 1 + cache_inuse / cache->evict_slots;
 		target_pages_clean = (uint32_t)(
 		    (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
 	}
@@ -1752,20 +1737,12 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
 	if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
 		btree_inuse = __wt_btree_dirty_leaf_inuse(session);
 		cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
-		bytes_per_slot = 1 + cache_inuse / total_slots;
+		bytes_per_slot = 1 + cache_inuse / cache->evict_slots;
 		target_pages_dirty = (uint32_t)(
 		    (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
 	}
 
-	/*
-	 * Weight the number of target pages by the number of times we want to
-	 * fill the cache per pass through all the trees.  Note that we don't
-	 * build this into the calculation above because we don't want to favor
-	 * small trees, so round to a whole number of slots (zero for small
-	 * trees) before multiplying.
-	 */
-	target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
-	    QUEUE_FILLS_PER_PASS;
+	target_pages = WT_MAX(target_pages_clean, target_pages_dirty);
 
 	/*
 	 * Walk trees with a small fraction of the cache in case there are so
@@ -1800,8 +1777,8 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
  *	Get a few page eviction candidates from a single underlying file.
  */
 static int
-__evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
-    u_int max_entries, u_int *slotp, uint64_t *pages_seen_p)
+__evict_walk_tree(WT_SESSION_IMPL *session,
+    WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
 {
 	WT_BTREE *btree;
 	WT_CACHE *cache;
@@ -1821,7 +1798,6 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
 	last_parent = NULL;
 	restarts = 0;
 	give_up = urgent_queued = false;
-	*pages_seen_p = 0;
 
 	/*
 	 * Figure out how many slots to fill from this tree.
@@ -1830,12 +1806,10 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
 	start = queue->evict_queue + *slotp;
 	remaining_slots = max_entries - *slotp;
 	if (btree->evict_walk_progress >= btree->evict_walk_target) {
-		btree->evict_walk_target =
-		    __evict_walk_target(session, max_entries);
+		btree->evict_walk_target = __evict_walk_target(session);
 		btree->evict_walk_progress = 0;
 	}
-	target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS,
-	    btree->evict_walk_target - btree->evict_walk_progress);
+	target_pages = btree->evict_walk_target - btree->evict_walk_progress;
 
 	if (target_pages > remaining_slots)
 		target_pages = remaining_slots;
@@ -2194,8 +2168,6 @@ fast:		/* If the page can't be evicted, give up. */
 		btree->evict_ref = ref;
 	}
 
-	*pages_seen_p = pages_seen;
-
 	WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
 	WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen);
 	WT_STAT_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 51cabeda029..7193e6f2b2c 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -27,6 +27,8 @@ static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
 static int  __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
 static int  __rec_write_wrapup_err(
 		WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __reconcile(WT_SESSION_IMPL *,
+		WT_REF *, WT_SALVAGE_COOKIE *, uint32_t, bool *, bool *);
 
 /*
  * __wt_reconcile --
@@ -36,19 +38,15 @@ int
 __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
     WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp)
 {
-	WT_BTREE *btree;
 	WT_DECL_RET;
 	WT_PAGE *page;
-	WT_PAGE_MODIFY *mod;
-	WT_RECONCILE *r;
-	uint64_t oldest_id;
+	bool no_reconcile_set, page_locked;
 
-	btree = S2BT(session);
-	page = ref->page;
-	mod = page->modify;
 	if (lookaside_retryp != NULL)
 		*lookaside_retryp = false;
 
+	page = ref->page;
+
 	__wt_verbose(session, WT_VERB_RECONCILE,
 	    "%p reconcile %s (%s%s%s)",
 	    (void *)ref, __wt_page_type_string(page->type),
@@ -77,10 +75,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
 	    LF_ISSET(WT_REC_VISIBLE_ALL) ||
 	    F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
 
-	/* We shouldn't get called with a clean page, that's an error. */
+	/* It's an error to be called with a clean page. */
 	WT_ASSERT(session, __wt_page_is_modified(page));
 
 	/*
+	 * Reconciliation acquires and releases pages, and in rare cases that
+	 * page release triggers eviction. If the page is dirty, eviction can
+	 * trigger reconciliation, and we re-enter this code. Reconciliation
+	 * isn't re-entrant, so we need to ensure that doesn't happen.
+	 */
+	no_reconcile_set = F_ISSET(session, WT_SESSION_NO_RECONCILE);
+	F_SET(session, WT_SESSION_NO_RECONCILE);
+
+	/*
 	 * Reconciliation locks the page for three reasons:
 	 *    Reconciliation reads the lists of page updates, obsolete updates
 	 * cannot be discarded while reconciliation is in progress;
@@ -90,6 +97,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
 	 * a child page splitting during the reconciliation.
 	 */
 	WT_PAGE_LOCK(session, page);
+	page_locked = true;
 
 	/*
 	 * Now that the page is locked, if attempting to evict it, check again
@@ -97,20 +105,37 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
 	 * while we were waiting to acquire the lock (e.g., the page could have
 	 * split).
 	 */
-	if (LF_ISSET(WT_REC_EVICT) &&
-	    !__wt_page_can_evict(session, ref, NULL)) {
-		WT_PAGE_UNLOCK(session, page);
-		return (__wt_set_return(session, EBUSY));
-	}
+	if (LF_ISSET(WT_REC_EVICT) && !__wt_page_can_evict(session, ref, NULL))
+		WT_ERR(__wt_set_return(session, EBUSY));
 
-	/* Initialize the reconciliation structure for each new run. */
-	if ((ret = __rec_init(
-	    session, ref, flags, salvage, &session->reconcile)) != 0) {
+	/*
+	 * Reconcile the page. The reconciliation code unlocks the page as soon
+	 * as possible, and returns that information.
+	 */
+	ret = __reconcile(session, ref,
+	    salvage, flags, lookaside_retryp, &page_locked);
+
+err:
+	if (page_locked)
 		WT_PAGE_UNLOCK(session, page);
-		return (ret);
-	}
-	r = session->reconcile;
+	if (!no_reconcile_set)
+		F_CLR(session, WT_SESSION_NO_RECONCILE);
+	return (ret);
+}
 
+/*
+ * __reconcile_save_evict_state --
+ *	Save the transaction state that causes history to be pinned, whether
+ * reconciliation succeeds or fails.
+ */
+static void
+__reconcile_save_evict_state(
+    WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+	WT_PAGE_MODIFY *mod;
+	uint64_t oldest_id;
+
+	mod = ref->page->modify;
 	oldest_id = __wt_txn_oldest_id(session);
 
 	/*
@@ -136,6 +161,32 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
 	WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
 	mod->last_oldest_id = oldest_id;
 #endif
+}
+
+/*
+ * __reconcile --
+ *	Reconcile an in-memory page into its on-disk format, and write it.
+ */
+static int
+__reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage,
+    uint32_t flags, bool *lookaside_retryp, bool *page_lockedp)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_PAGE *page;
+	WT_PAGE_MODIFY *mod;
+	WT_RECONCILE *r;
+
+	btree = S2BT(session);
+	page = ref->page;
+	mod = page->modify;
+
+	/* Save the eviction state. */
+	__reconcile_save_evict_state(session, ref, flags);
+
+	/* Initialize the reconciliation structure for each new run. */
+	WT_RET(__rec_init(session, ref, flags, salvage, &session->reconcile));
+	r = session->reconcile;
 
 	/* Reconcile the page. */
 	switch (page->type) {
@@ -190,6 +241,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
 		    S2C(session)->txn_global.stable_timestamp;
 
 	/* Release the reconciliation lock. */
+	*page_lockedp = false;
 	WT_PAGE_UNLOCK(session, page);
 
 	/* Update statistics. */
@@ -522,7 +574,16 @@ __rec_init(WT_SESSION_IMPL *session,
 	btree = S2BT(session);
 	page = ref->page;
 
-	if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+	/*
+	 * Reconciliation is not re-entrant, make sure that doesn't happen. Our
+	 * caller sets WT_SESSION_IMPL.WT_SESSION_NO_RECONCILE to prevent it,
+	 * but it's been a problem in the past, check to be sure.
+	 */
+	r = *(WT_RECONCILE **)reconcilep;
+	if (r != NULL && r->ref != NULL)
+		WT_RET_MSG(session, WT_ERROR, "reconciliation re-entered");
+
+	if (r == NULL) {
 		WT_RET(__wt_calloc_one(session, &r));
 		session->reconcile_cleanup = __rec_destroy_session;
 
@@ -535,9 +596,6 @@ __rec_init(WT_SESSION_IMPL *session,
 		F_SET(&r->chunkB.image, WT_ITEM_ALIGNED);
 	}
 
-	/* Reconciliation is not re-entrant, make sure that doesn't happen. */
-	WT_ASSERT(session, r->ref == NULL);
-
 	/* Remember the configuration. */
 	r->ref = ref;
 	r->page = page;
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 0469882c08e..53bde4a499b 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -731,11 +731,12 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
 			} else
 				upd = op->u.op_upd->next;
 			/*
-			 * Skip over any aborted update structures or ones
-			 * from our own transaction.
+			 * Skip over any aborted update structures, internally
+			 * created update structures or ones from our own
+			 * transaction.
 			 */
 			while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
-			    upd->txnid == txn->id))
+			    upd->txnid == WT_TXN_NONE || upd->txnid == txn->id))
 				upd = upd->next;
 
 			/*
diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
index fea9ad0bfe3..40b4c543500 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
@@ -139,8 +139,10 @@ op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp)
 	/* Close the cursor half the time, otherwise cache it. */
 	if (__wt_random(rnd) % 2 == 0)
 		testutil_check(cursor->close(cursor));
-	else
+	else {
+		testutil_check(cursor->reset(cursor));
 		*cpp = cursor;
+	}
 
 	(void)__wt_atomic_add64(&worker, 1);
 }
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index e9ae4d1e37b..5883b8dc42e 100644..100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -1021,6 +1021,7 @@ tasks:
     run_on:
       - rhel62-large
     commands:
+      - func: "fetch source"
       - func: "fetch mongo-tests repo"
       - command: shell.exec
         params:
author	Luke Chen <luke.chen@mongodb.com>	2019-07-02 14:01:12 +1000
committer	Luke Chen <luke.chen@mongodb.com>	2019-07-02 14:02:00 +1000
commit	11a4b5016f134896e088d8c1c96d1c17225a3c86 (patch)
tree	3385861ec5bc7f5ce74049b8bc6bba3fc91720cf /src/third_party/wiredtiger
parent	cdf7c88be60b287c316beda42b4ff9f197617942 (diff)
download	mongo-11a4b5016f134896e088d8c1c96d1c17225a3c86.tar.gz