Import wiredtiger-wiredtiger-2.6.1-500-g26d1ad2.tar.gz from wiredtiger branch mongodb-3.2

author: Michael Cahill <michael.cahill@mongodb.com> 2015-08-12 20:56:25 +1000
committer: Michael Cahill <michael.cahill@mongodb.com> 2015-08-12 20:56:25 +1000
commit: 7bb09c0377f5160857617c38ab07955f8f4b03f6 (patch)
tree: 2d7041f6e1cc121c743c368406485e39280b551c /src/third_party/wiredtiger
parent: 6d9cbeb53eb9e6d39a24a8ac54b71e105483730b (diff)
download: mongo-7bb09c0377f5160857617c38ab07955f8f4b03f6.tar.gz
51 files changed, 1182 insertions, 551 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/Makefile.am b/src/third_party/wiredtiger/bench/wtperf/Makefile.am
index 0630a27f640..15f151d84b2 100644
--- a/src/third_party/wiredtiger/bench/wtperf/Makefile.am
+++ b/src/third_party/wiredtiger/bench/wtperf/Makefile.am
@@ -5,7 +5,7 @@ LDADD = $(top_builddir)/libwiredtiger.la -lm
 noinst_PROGRAMS = wtperf
 wtperf_LDFLAGS = -static
 wtperf_SOURCES =\
-	config.c misc.c track.c wtperf.c wtperf.h wtperf_opt.i
+	config.c misc.c track.c wtperf.c wtperf_truncate.c wtperf.h wtperf_opt.i
 
 TESTS = smoke.sh
 AM_TESTS_ENVIRONMENT = rm -rf WT_TEST ; mkdir WT_TEST ;
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c
index 47e052d6055..721b41432cb 100644
--- a/src/third_party/wiredtiger/bench/wtperf/config.c
+++ b/src/third_party/wiredtiger/bench/wtperf/config.c
@@ -95,6 +95,8 @@ config_assign(CONFIG *dest, const CONFIG *src)
 				*pstr = newstr;
 			}
 		}
+
+	STAILQ_INIT(&dest->stone_head);
 	return (0);
 }
 
@@ -122,6 +124,7 @@ config_free(CONFIG *cfg)
 		free(cfg->uris);
 	}
 
+	cleanup_truncate_config(cfg);
 	free(cfg->ckptthreads);
 	free(cfg->popthreads);
 	free(cfg->base_uri);
@@ -243,6 +246,28 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
 					goto err;
 				continue;
 			}
+			if (STRING_MATCH("truncate", k.str, k.len)) {
+				if ((workp->truncate = v.val) != 1)
+					goto err;
+				/* There can only be one Truncate thread. */
+				if (cfg->has_truncate != 0) {
+					goto err;
+				}
+				cfg->has_truncate = 1;
+				continue;
+			}
+			if (STRING_MATCH("truncate_pct", k.str, k.len)) {
+				if (v.val <= 0)
+					goto err;
+				workp->truncate_pct = (uint64_t)v.val;
+				continue;
+			}
+			if (STRING_MATCH("truncate_count", k.str, k.len)) {
+				if (v.val <= 0)
+					goto err;
+				workp->truncate_count = (uint64_t)v.val;
+				continue;
+			}
 			goto err;
 		}
 		if (ret == WT_NOTFOUND)
@@ -253,9 +278,21 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
 		scan = NULL;
 		if (ret != 0)
 			goto err;
-
-		if (workp->insert == 0 &&
-		    workp->read == 0 && workp->update == 0)
+		if (workp->insert == 0 && workp->read == 0 &&
+		    workp->update == 0 && workp->truncate == 0)
+			goto err;
+		/* Why run with truncate if we don't want any truncation. */
+		if (workp->truncate != 0 &&
+		    workp->truncate_pct == 0 && workp->truncate_count == 0)
+			goto err;
+		if (workp->truncate != 0 &&
+		    (workp->truncate_pct < 1 || workp->truncate_pct > 99))
+			goto err;
+		/* Truncate should have its own exclusive thread. */
+		if (workp->truncate != 0 && workp->threads > 1)
+			goto err;
+		if (workp->truncate != 0 &&
+		    (workp->insert > 0 || workp->read > 0 || workp->update > 0))
 			goto err;
 		cfg->workers_cnt += (u_int)workp->threads;
 	}
@@ -640,9 +677,11 @@ config_print(CONFIG *cfg)
 		for (i = 0, workp = cfg->workload;
 		    i < cfg->workload_cnt; ++i, ++workp)
 			printf("\t\t%" PRId64 " threads (inserts=%" PRId64
-			    ", reads=%" PRId64 ", updates=%" PRId64 ")\n",
+			    ", reads=%" PRId64 ", updates=%" PRId64 
+			    ", truncates=% " PRId64 ")\n",
 			    workp->threads,
-			    workp->insert, workp->read, workp->update);
+			    workp->insert, workp->read,
+			    workp->update, workp->truncate);
 	}
 
 	printf("\t" "Checkpoint threads, interval: %" PRIu32 ", %" PRIu32 "\n",
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-oplog.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-oplog.wtperf
new file mode 100644
index 00000000000..34235f04518
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-oplog.wtperf
@@ -0,0 +1,11 @@
+# wtperf options file to simulate populating a MongoDB oplog
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=50000
+report_interval=5
+run_time=500
+populate_threads=1
+# Setup three threads to insert into the oplog
+# Setup one thread to be doing truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=50000))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-populate.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-populate.wtperf
new file mode 100644
index 00000000000..4e4ae7500f0
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-populate.wtperf
@@ -0,0 +1,7 @@
+# Truncate workload population phase
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=50000
+report_interval=5
+populate_threads=1
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-workload.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-workload.wtperf
new file mode 100644
index 00000000000..55e01dcd0dc
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-workload.wtperf
@@ -0,0 +1,9 @@
+# truncate workload. work phase
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+create=false
+report_interval=5
+run_time=500
+# Setup three threads to insert into the oplog
+# Setup one thread to be doing truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=50000))
diff --git a/src/third_party/wiredtiger/bench/wtperf/track.c b/src/third_party/wiredtiger/bench/wtperf/track.c
index 8ea4201246a..75f5a012a94 100644
--- a/src/third_party/wiredtiger/bench/wtperf/track.c
+++ b/src/third_party/wiredtiger/bench/wtperf/track.c
@@ -98,6 +98,11 @@ sum_read_ops(CONFIG *cfg)
 	return (sum_ops(cfg, offsetof(CONFIG_THREAD, read)));
 }
 uint64_t
+sum_truncate_ops(CONFIG *cfg)
+{
+	return (sum_ops(cfg, offsetof(CONFIG_THREAD, truncate)));
+}
+uint64_t
 sum_update_ops(CONFIG *cfg)
 {
 	return (sum_ops(cfg, offsetof(CONFIG_THREAD, update)));
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index 1c9ce963c9a..f079d6272d7 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -50,6 +50,7 @@ static const CONFIG default_cfg = {
 	0,				/* checkpoint operations */
 	0,				/* insert operations */
 	0,				/* read operations */
+	0,				/* truncate operations */
 	0,				/* update operations */
 	0,				/* insert key */
 	0,				/* checkpoint in progress */
@@ -57,6 +58,8 @@ static const CONFIG default_cfg = {
 	0,				/* notify threads to stop */
 	0,				/* in warmup phase */
 	0,				/* total seconds running */
+	0,				/* has truncate */
+	{NULL, NULL},			/* the truncate queue */
 
 #define	OPT_DEFINE_DEFAULT
 #include "wtperf_opt.i"
@@ -100,15 +103,6 @@ get_next_incr(CONFIG *cfg)
 	return (WT_ATOMIC_ADD8(cfg->insert_key, 1));
 }
 
-static inline void
-generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno)
-{
-	/*
-	 * Don't change to snprintf, sprintf is faster in some tests.
-	 */
-	sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno);
-}
-
 static void
 randomize_value(CONFIG_THREAD *thread, char *value_buf)
 {
@@ -258,6 +252,8 @@ op_name(uint8_t *op)
 		return ("insert_rmw");
 	case WORKER_READ:
 		return ("read");
+	case WORKER_TRUNCATE:
+		return ("truncate");
 	case WORKER_UPDATE:
 		return ("update");
 	default:
@@ -389,7 +385,7 @@ worker(void *arg)
 	size_t i;
 	uint64_t next_val, usecs;
 	uint8_t *op, *op_end;
-	int measure_latency, ret;
+	int measure_latency, ret, truncated;
 	char *value_buf, *key_buf, *value;
 	char buf[512];
 
@@ -444,6 +440,11 @@ worker(void *arg)
 		goto err;
 	}
 
+	/* Setup for truncate */
+	if (thread->workload->truncate != 0)
+		if ((ret = setup_truncate(cfg, thread, session)) != 0)
+			goto err;
+
 	key_buf = thread->key_buf;
 	value_buf = thread->value_buf;
 
@@ -486,6 +487,10 @@ worker(void *arg)
 			if (wtperf_value_range(cfg) < next_val)
 				continue;
 			break;
+		case WORKER_TRUNCATE:
+			/* Required but not used. */
+			next_val = wtperf_rand(thread);
+			break;
 		default:
 			goto err;		/* can't happen */
 		}
@@ -502,10 +507,9 @@ worker(void *arg)
 		 * is 0, to avoid first time latency spikes.
 		 */
 		measure_latency =
-		    cfg->sample_interval != 0 && trk->ops != 0 && (
-		    trk->ops % cfg->sample_rate == 0);
-		if (measure_latency &&
-		    (ret = __wt_epoch(NULL, &start)) != 0) {
+		    cfg->sample_interval != 0 && trk != NULL &&
+		    trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+		if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
 			lprintf(cfg, ret, 0, "Get time call failed");
 			goto err;
 		}
@@ -548,6 +552,18 @@ worker(void *arg)
 			if ((ret = cursor->insert(cursor)) == 0)
 				break;
 			goto op_err;
+		case WORKER_TRUNCATE:
+			if ((ret = run_truncate(
+			    cfg, thread, cursor, session, &truncated)) == 0) {
+				if (truncated)
+					trk = &thread->truncate;
+				else
+					trk = &thread->truncate_sleep;
+				/* Pause between truncate attempts */
+				(void)usleep(1000);
+				break;
+			}
+			goto op_err;
 		case WORKER_UPDATE:
 			if ((ret = cursor->search(cursor)) == 0) {
 				if ((ret = cursor->get_value(
@@ -711,16 +727,33 @@ run_mix_schedule(CONFIG *cfg, WORKLOAD *workp)
 {
 	int64_t pct;
 
-	/* Confirm reads, inserts and updates cannot all be zero. */
-	if (workp->insert == 0 && workp->read == 0 && workp->update == 0) {
+	/* Confirm reads, inserts, truncates and updates cannot all be zero. */
+	if (workp->insert == 0 && workp->read == 0 &&
+	    workp->truncate == 0 && workp->update == 0) {
 		lprintf(cfg, EINVAL, 0, "no operations scheduled");
 		return (EINVAL);
 	}
 
 	/*
+	 * Handle truncate first - it's a special case that can't be used in
+	 * a mixed workload.
+	 */
+	if (workp->truncate != 0) {
+		if (workp->insert != 0 ||
+		    workp->read != 0 || workp->update != 0) {
+			lprintf(cfg, EINVAL, 0,
+			    "Can't configure truncate in a mixed workload");
+			return (EINVAL);
+		}
+		memset(workp->ops, WORKER_TRUNCATE, sizeof(workp->ops));
+		return (0);
+	}
+
+	/*
 	 * Check for a simple case where the thread is only doing insert or
-	 * update operations (because the default operation for a job-mix is
-	 * read, the subsequent code works fine if only reads are specified).
+	 * update operations (because the default operation for a
+	 * job-mix is read, the subsequent code works fine if only reads are
+	 * specified).
 	 */
 	if (workp->insert != 0 && workp->read == 0 && workp->update == 0) {
 		memset(workp->ops,
@@ -840,10 +873,9 @@ populate_thread(void *arg)
 		cursor = cursors[op % cfg->table_count];
 		generate_key(cfg, key_buf, op);
 		measure_latency =
-		    cfg->sample_interval != 0 && trk->ops != 0 && (
-		    trk->ops % cfg->sample_rate == 0);
-		if (measure_latency &&
-		    (ret = __wt_epoch(NULL, &start)) != 0) {
+		    cfg->sample_interval != 0 &&
+		    trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+		if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
 			lprintf(cfg, ret, 0, "Get time call failed");
 			goto err;
 		}
@@ -961,10 +993,9 @@ populate_async(void *arg)
 	 * the time to process by workers.
 	 */
 	measure_latency =
-	    cfg->sample_interval != 0 && trk->ops != 0 && (
-	    trk->ops % cfg->sample_rate == 0);
-	if (measure_latency &&
-	    (ret = __wt_epoch(NULL, &start)) != 0) {
+	    cfg->sample_interval != 0 &&
+	    trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+	if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
 		lprintf(cfg, ret, 0, "Get time call failed");
 		goto err;
 	}
@@ -1006,8 +1037,7 @@ populate_async(void *arg)
 		goto err;
 	if (measure_latency) {
 		if ((ret = __wt_epoch(NULL, &stop)) != 0) {
-			lprintf(cfg, ret, 0,
-			    "Get time call failed");
+			lprintf(cfg, ret, 0, "Get time call failed");
 			goto err;
 		}
 		++trk->latency_ops;
@@ -1246,8 +1276,9 @@ execute_populate(CONFIG *cfg)
 	CONFIG_THREAD *popth;
 	WT_ASYNC_OP *asyncop;
 	size_t i;
-	uint64_t last_ops, msecs;
+	uint64_t last_ops, msecs, print_ops_sec;
 	uint32_t interval, tables;
+	double print_secs;
 	int elapsed, ret;
 	void *(*pfunc)(void *);
 
@@ -1325,10 +1356,22 @@ execute_populate(CONFIG *cfg)
 
 	lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount);
 	msecs = ns_to_ms(WT_TIMEDIFF(stop, start));
+
+	/*
+	 * This is needed as the divisions will fail if the insert takes no time
+	 * which will only be the case when there is no data to insert.
+	 */
+	if (msecs == 0) {
+		print_secs = 0;
+		print_ops_sec = 0;
+	} else {
+		print_secs = (double)msecs / (double)MSEC_PER_SEC;
+		print_ops_sec =
+		    (uint64_t)((cfg->icount / msecs) / MSEC_PER_SEC);
+	}
 	lprintf(cfg, 0, 1,
 	    "Load time: %.2f\n" "load ops/sec: %" PRIu64,
-	    (double)msecs / (double)MSEC_PER_SEC,
-	    (uint64_t)((cfg->icount / msecs) / MSEC_PER_SEC));
+	    print_secs, print_ops_sec);
 
 	/*
 	 * If configured, compact to allow LSM merging to complete.  We
@@ -1420,16 +1463,19 @@ execute_workload(CONFIG *cfg)
 {
 	CONFIG_THREAD *threads;
 	WORKLOAD *workp;
-	uint64_t last_ckpts, last_inserts, last_reads, last_updates;
+	uint64_t last_ckpts, last_inserts, last_reads, last_truncates;
+	uint64_t last_updates;
 	uint32_t interval, run_ops, run_time;
 	u_int i;
 	int ret, t_ret;
 	void *(*pfunc)(void *);
 
 	cfg->insert_key = 0;
-	cfg->insert_ops = cfg->read_ops = cfg->update_ops = 0;
+	cfg->insert_ops = cfg->read_ops = cfg->truncate_ops = 0;
+	cfg->update_ops = 0;
 
-	last_ckpts = last_inserts = last_reads = last_updates = 0;
+	last_ckpts = last_inserts = last_reads = last_truncates = 0;
+	last_updates = 0;
 	ret = 0;
 
 	if (cfg->warmup != 0)
@@ -1454,9 +1500,9 @@ execute_workload(CONFIG *cfg)
 	    workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) {
 		lprintf(cfg, 0, 1,
 		    "Starting workload #%d: %" PRId64 " threads, inserts=%"
-		    PRId64 ", reads=%" PRId64 ", updates=%" PRId64,
-		    i + 1,
-		    workp->threads, workp->insert, workp->read, workp->update);
+		    PRId64 ", reads=%" PRId64 ", updates=%" PRId64
+		    ", truncate=%" PRId64, i + 1, workp->threads, workp->insert,
+		    workp->read, workp->update, workp->truncate);
 
 		/* Figure out the workload's schedule. */
 		if ((ret = run_mix_schedule(cfg, workp)) != 0)
@@ -1496,6 +1542,7 @@ execute_workload(CONFIG *cfg)
 		cfg->insert_ops = sum_insert_ops(cfg);
 		cfg->read_ops = sum_read_ops(cfg);
 		cfg->update_ops = sum_update_ops(cfg);
+		cfg->truncate_ops = sum_truncate_ops(cfg);
 
 		/* If we're checking total operations, see if we're done. */
 		if (run_ops != 0 && run_ops <=
@@ -1510,16 +1557,18 @@ execute_workload(CONFIG *cfg)
 
 		lprintf(cfg, 0, 1,
 		    "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64
-		    " updates, %" PRIu64 " checkpoints in %" PRIu32
-		    " secs (%" PRIu32 " total secs)",
+		    " updates, %" PRIu64 " truncates, %" PRIu64
+		    " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)",
 		    cfg->read_ops - last_reads,
 		    cfg->insert_ops - last_inserts,
 		    cfg->update_ops - last_updates,
+		    cfg->truncate_ops - last_truncates,
 		    cfg->ckpt_ops - last_ckpts,
 		    cfg->report_interval, cfg->totalsec);
 		last_reads = cfg->read_ops;
 		last_inserts = cfg->insert_ops;
 		last_updates = cfg->update_ops;
+		last_truncates = cfg->truncate_ops;
 		last_ckpts = cfg->ckpt_ops;
 	}
 
@@ -1902,6 +1951,7 @@ start_run(CONFIG *cfg)
 		/* One final summation of the operations we've completed. */
 		cfg->read_ops = sum_read_ops(cfg);
 		cfg->insert_ops = sum_insert_ops(cfg);
+		cfg->truncate_ops = sum_truncate_ops(cfg);
 		cfg->update_ops = sum_update_ops(cfg);
 		cfg->ckpt_ops = sum_ckpt_ops(cfg);
 		total_ops = cfg->read_ops + cfg->insert_ops + cfg->update_ops;
@@ -1917,6 +1967,11 @@ start_run(CONFIG *cfg)
 		    cfg->insert_ops, (cfg->insert_ops * 100) / total_ops,
 		    cfg->insert_ops / cfg->run_time);
 		lprintf(cfg, 0, 1,
+		    "Executed %" PRIu64 " truncate operations (%" PRIu64
+		    "%%) %" PRIu64 " ops/sec",
+		    cfg->truncate_ops, (cfg->truncate_ops * 100) / total_ops,
+		    cfg->truncate_ops / cfg->run_time);
+		lprintf(cfg, 0, 1,
 		    "Executed %" PRIu64 " update operations (%" PRIu64
 		    "%%) %" PRIu64 " ops/sec",
 		    cfg->update_ops, (cfg->update_ops * 100) / total_ops,
@@ -2062,14 +2117,25 @@ main(int argc, char *argv[])
 			break;
 		}
 
+	if (cfg->populate_threads == 0 && cfg->icount != 0) {
+		lprintf(cfg, 1, 0,
+		    "Cannot have 0 populate threads when icount is set\n");
+		goto err;
+	}
+
 	cfg->async_config = NULL;
 	/*
 	 * If the user specified async_threads we use async for all ops.
 	 * If the user wants compaction, then we also enable async for
 	 * the compact operation, but not for the workloads.
 	 */
-	if (cfg->async_threads > 0)
+	if (cfg->async_threads > 0) {
+		if (cfg->has_truncate > 0) {
+			lprintf(cfg, 1, 0, "Cannot run truncate and async\n");
+			goto err;
+		}
 		cfg->use_asyncops = 1;
+	}
 	if (cfg->compact && cfg->async_threads == 0)
 		cfg->async_threads = 2;
 	if (cfg->async_threads > 0) {
@@ -2091,6 +2157,18 @@ main(int argc, char *argv[])
 	if ((ret = config_compress(cfg)) != 0)
 		goto err;
 
+	/* You can't have truncate on a random collection. */
+	if (cfg->has_truncate && cfg->random_range) {
+		lprintf(cfg, 1, 0, "Cannot run truncate and random_range\n");
+		goto err;
+	}
+
+	/* We can't run truncate with more than one table. */
+	if (cfg->has_truncate && cfg->table_count > 1) {
+		lprintf(cfg, 1, 0, "Cannot truncate more than 1 table\n");
+		goto err;
+	}
+
 	/* Build the URI from the table name. */
 	req_len = strlen("table:") +
 	    strlen(HELIUM_NAME) + strlen(cfg->table_name) + 2;
@@ -2361,7 +2439,12 @@ wtperf_value_range(CONFIG *cfg)
 {
 	if (cfg->random_range)
 		return (cfg->icount + cfg->random_range);
-
+	/*
+	 * It is legal to configure a zero size populate phase, hide that
+	 * from other code by pretending the range is 1 in that case.
+	 */
+	if (cfg->icount + cfg->insert_key == 0)
+		return (1);
 	return (cfg->icount + cfg->insert_key - (u_int)(cfg->workers_cnt + 1));
 }
 
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
index 874cdc499b1..58dc65388ae 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
@@ -26,6 +26,9 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#ifndef	HAVE_WTPERF_H
+#define	HAVE_WTPERF_H
+
 #ifndef _WIN32
 #include <sys/time.h>
 #endif
@@ -90,14 +93,39 @@ typedef struct {
 	int64_t throttle;		/* Maximum operations/second */
 		/* Number of operations per transaction. Zero for autocommit */
 	int64_t ops_per_txn;
+	int64_t truncate;		/* Truncate ratio */
+	uint64_t truncate_pct;		/* Truncate Percent */
+	uint64_t truncate_count;	/* Truncate Count */
 
 #define	WORKER_INSERT		1	/* Insert */
 #define	WORKER_INSERT_RMW	2	/* Insert with read-modify-write */
 #define	WORKER_READ		3	/* Read */
-#define	WORKER_UPDATE		4	/* Update */
+#define	WORKER_TRUNCATE		4	/* Truncate */
+#define	WORKER_UPDATE		5	/* Update */
 	uint8_t ops[100];		/* Operation schedule */
 } WORKLOAD;
 
+/* Steering items for the truncate workload */
+typedef struct __truncate_struct TRUNCATE_CONFIG;
+struct __truncate_struct {
+	uint64_t stone_gap;
+	uint64_t needed_stones;
+	uint64_t final_stone_gap;
+	uint64_t expected_total;
+	uint64_t total_inserts;
+	uint64_t last_total_inserts;
+	uint64_t num_stones;
+	uint64_t last_key;
+};
+
+/* Queue entry for use with the Truncate Logic */
+struct __truncate_queue_entry {
+	char *key;			/* Truncation point */
+	uint64_t diff;			/* Number of items to be truncated*/
+	STAILQ_ENTRY(__truncate_queue_entry) q;
+};
+typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY;
+
 #define	LOG_PARTIAL_CONFIG	",log=(enabled=false)"
 /*
  * NOTE:  If you add any fields to this structure here, you must also add
@@ -135,6 +163,7 @@ struct __config {			/* Configuration structure */
 	uint64_t ckpt_ops;		/* checkpoint operations */
 	uint64_t insert_ops;		/* insert operations */
 	uint64_t read_ops;		/* read operations */
+	uint64_t truncate_ops;		/* truncate operations */
 	uint64_t update_ops;		/* update operations */
 
 	uint64_t insert_key;		/* insert key */
@@ -146,6 +175,11 @@ struct __config {			/* Configuration structure */
 
 	volatile uint32_t totalsec;	/* total seconds running */
 
+	u_int		 has_truncate;  /* if there is a truncate workload */
+
+	/* Queue head for use with the Truncate Logic */
+	STAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
+
 	/* Fields changeable on command line are listed in wtperf_opt.i */
 #define	OPT_DECLARE_STRUCT
 #include "wtperf_opt.i"
@@ -211,7 +245,7 @@ typedef struct {
 struct __config_thread {		/* Per-thread structure */
 	CONFIG *cfg;			/* Enclosing configuration */
 
-	uint64_t rnd;			/* Random number generation state */
+	WT_RAND_STATE rnd;		/* Random number generation state */
 
 	pthread_t handle;		/* Handle */
 
@@ -223,8 +257,13 @@ struct __config_thread {		/* Per-thread structure */
 	TRACK insert;			/* Insert operations */
 	TRACK read;			/* Read operations */
 	TRACK update;			/* Update operations */
+	TRACK truncate;			/* Truncate operations */
+	TRACK truncate_sleep;		/* Truncate sleep operations */
+	TRUNCATE_CONFIG trunc_cfg;	/* Truncate configuration */
+
 };
 
+void	 cleanup_truncate_config(CONFIG *);
 int	 config_assign(CONFIG *, const CONFIG *);
 int	 config_compress(CONFIG *);
 void	 config_free(CONFIG *);
@@ -238,11 +277,15 @@ void	 latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
 void	 latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
 void	 latency_print(CONFIG *);
 int	 enomem(const CONFIG *);
+int	 run_truncate(
+    CONFIG *, CONFIG_THREAD *, WT_CURSOR *, WT_SESSION *, int *);
 int	 setup_log_file(CONFIG *);
+int	 setup_truncate(CONFIG *, CONFIG_THREAD *, WT_SESSION *);
 uint64_t sum_ckpt_ops(CONFIG *);
 uint64_t sum_insert_ops(CONFIG *);
 uint64_t sum_pop_ops(CONFIG *);
 uint64_t sum_read_ops(CONFIG *);
+uint64_t sum_truncate_ops(CONFIG *);
 uint64_t sum_update_ops(CONFIG *);
 void	 usage(void);
 
@@ -251,3 +294,14 @@ void	 lprintf(const CONFIG *, int err, uint32_t, const char *, ...)
 __attribute__((format (printf, 4, 5)))
 #endif
 ;
+
+static inline void
+generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno)
+{
+	/*
+	 * Don't change to snprintf, sprintf is faster in some tests.
+	 */
+	sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno);
+}
+
+#endif
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
index 6cb39ac3cc4..7e29aa0f3c2 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
@@ -167,7 +167,8 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' "
     "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' "
     "which would create 2 threads doing nothing but reads and 8 threads "
     "each doing 50% inserts and 25% reads and updates.  Allowed configuration "
-    "values are 'count', 'throttle', 'reads', 'inserts', 'updates'. There are "
+    "values are 'count', 'throttle', 'reads', 'inserts', 'updates', 'truncate',"
+    " 'truncate_pct' and 'truncate_count'. There are "
     "also behavior modifiers, supported modifiers are 'ops_per_txn'")
 DEF_OPT_AS_CONFIG_STRING(transaction_config, "",
     "transaction configuration string, relevant when populate_opts_per_txn "
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c b/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c
new file mode 100644
index 00000000000..0d5d1045e1e
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c
@@ -0,0 +1,216 @@
+/*-
+ * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wtperf.h"
+
+static inline uint64_t
+decode_key(char *key_buf)
+{
+	return (strtoull(key_buf, NULL, 10));
+}
+
+int
+setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
+
+	TRUNCATE_CONFIG *trunc_cfg;
+	TRUNCATE_QUEUE_ENTRY *truncate_item;
+	WORKLOAD *workload;
+	WT_CURSOR *cursor;
+	char *key, *truncate_key;
+	int ret;
+	uint64_t end_point, final_stone_gap, i, start_point;
+
+	end_point = final_stone_gap = start_point = 0;
+	trunc_cfg = &thread->trunc_cfg;
+	workload = thread->workload;
+
+	/* We are limited to only one table when running truncate. */
+	if ((ret = session->open_cursor(
+	    session, cfg->uris[0], NULL, NULL, &cursor)) != 0)
+		goto err;
+
+	/* How many entries between each stone. */
+	trunc_cfg->stone_gap =
+	    (workload->truncate_count * workload->truncate_pct) / 100;
+	/* How many stones we need. */
+	trunc_cfg->needed_stones =
+	    workload->truncate_count / trunc_cfg->stone_gap;
+
+	final_stone_gap = trunc_cfg->stone_gap;
+
+	/* Reset this value for use again. */
+	trunc_cfg->stone_gap = 0;
+
+	/*
+	 * Here we check if there is data in the collection. If there is
+	 * data available, then we need to setup some initial truncation
+	 * stones.
+	 */
+	if ((ret = cursor->next(cursor)) != 0 ||
+	    (ret = cursor->get_key(cursor, &key)) != 0) {
+		lprintf(cfg, ret, 0, "truncate setup start: failed");
+		goto err;
+	}
+
+	start_point = decode_key(key);
+	if ((cursor->reset(cursor)) != 0 || (ret = cursor->prev(cursor)) != 0 ||
+	    (ret = cursor->get_key(cursor, &key)) != 0) {
+		lprintf(cfg, ret, 0, "truncate setup end: failed");
+		goto err;
+	}
+	end_point = decode_key(key);
+
+	/* Assign stones if there are enough documents. */
+	if (start_point + trunc_cfg->needed_stones > end_point)
+		trunc_cfg->stone_gap = 0;
+	else
+		trunc_cfg->stone_gap =
+		    (end_point - start_point) / trunc_cfg->needed_stones;
+
+	/* If we have enough data allocate some stones. */
+	if (trunc_cfg->stone_gap != 0) {
+		trunc_cfg->expected_total = (end_point - start_point);
+		for (i = 1; i <= trunc_cfg->needed_stones; i++) {
+			truncate_key = calloc(cfg->key_sz, 1);
+			if (truncate_key == NULL) {
+				ret = enomem(cfg);
+				goto err;
+			}
+			truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
+			if (truncate_item == NULL) {
+				free(truncate_key);
+				ret = enomem(cfg);
+				goto err;
+			}
+			generate_key(
+			    cfg, truncate_key, trunc_cfg->stone_gap * i);
+			truncate_item->key = truncate_key;
+			truncate_item->diff =
+			    (trunc_cfg->stone_gap * i) - trunc_cfg->last_key;
+			STAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
+			trunc_cfg->last_key = trunc_cfg->stone_gap * i;
+			trunc_cfg->num_stones++;
+		}
+	}
+	trunc_cfg->stone_gap = final_stone_gap;
+
+err:	if ((ret = cursor->close(cursor)) != 0) {
+		lprintf(cfg, ret, 0, "truncate setup: cursor close failed");
+	}
+	return (ret);
+}
+
+int
+run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
+    WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp) {
+
+	TRUNCATE_CONFIG *trunc_cfg;
+	TRUNCATE_QUEUE_ENTRY *truncate_item;
+	char *truncate_key;
+	int ret, t_ret;
+
+	ret = 0;
+	trunc_cfg = &thread->trunc_cfg;
+
+	*truncatedp = 0;
+	/* Update the total inserts */
+	trunc_cfg->total_inserts = sum_insert_ops(cfg);
+	trunc_cfg->expected_total +=
+	    (trunc_cfg->total_inserts - trunc_cfg->last_total_inserts);
+	trunc_cfg->last_total_inserts = trunc_cfg->total_inserts;
+
+	/* We are done if there isn't enough data to trigger a new milestone. */
+	if (trunc_cfg->expected_total <= trunc_cfg->needed_stones)
+		return (0);
+
+	while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
+		trunc_cfg->last_key += trunc_cfg->stone_gap;
+		truncate_key = calloc(cfg->key_sz, 1);
+		if (truncate_key == NULL) {
+			lprintf(cfg, ENOMEM, 0,
+			    "truncate: couldn't allocate key array");
+			return (ENOMEM);
+		}
+		truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
+		if (truncate_item == NULL) {
+			free(truncate_key);
+			lprintf(cfg, ENOMEM, 0,
+			    "truncate: couldn't allocate item");
+			return (ENOMEM);
+		}
+		generate_key(cfg, truncate_key, trunc_cfg->last_key);
+		truncate_item->key = truncate_key;
+		truncate_item->diff = trunc_cfg->stone_gap;
+		STAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
+		trunc_cfg->num_stones++;
+	}
+
+	/* We are done if there isn't enough data to trigger a truncate. */
+	if (trunc_cfg->num_stones == 0 ||
+	    trunc_cfg->expected_total <= thread->workload->truncate_count)
+		return (0);
+
+	truncate_item = STAILQ_FIRST(&cfg->stone_head);
+	trunc_cfg->num_stones--;
+	STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+	cursor->set_key(cursor,truncate_item->key);
+	if ((ret = cursor->search(cursor)) != 0) {
+		lprintf(cfg, ret, 0, "Truncate search: failed");
+		goto err;
+	}
+
+	if ((ret = session->truncate(session, NULL, NULL, cursor, NULL)) != 0) {
+		lprintf(cfg, ret, 0, "Truncate: failed");
+		goto err;
+	}
+
+
+	*truncatedp = 1;
+	trunc_cfg->expected_total -= truncate_item->diff;
+
+err:	free(truncate_item->key);
+	free(truncate_item);
+	t_ret = cursor->reset(cursor);
+	if (t_ret != 0)
+		lprintf(cfg, t_ret, 0, "Cursor reset failed");
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+	return (ret);
+}
+
+void
+cleanup_truncate_config(CONFIG *cfg) {
+	TRUNCATE_QUEUE_ENTRY *truncate_item;
+
+	while (!STAILQ_EMPTY(&cfg->stone_head)) {
+		truncate_item = STAILQ_FIRST(&cfg->stone_head);
+		STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+		free(truncate_item->key);
+		free(truncate_item);
+	}
+}
diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py
index d39f076656f..936c7bb11a7 100644
--- a/src/third_party/wiredtiger/dist/api_err.py
+++ b/src/third_party/wiredtiger/dist/api_err.py
@@ -100,7 +100,7 @@ tfile.write('''/* DO NOT EDIT: automatically built by dist/api_err.py. */
 
 /*
  * __wt_wiredtiger_error --
- *\tReturn a constant string for WiredTiger POSIX-standard and errors.
+ *\tReturn a constant string for POSIX-standard and WiredTiger errors.
  */
 const char *
 __wt_wiredtiger_error(int error)
@@ -119,8 +119,8 @@ for err in errors:
 tfile.write('''\t}
 
 \t/*
-\t * POSIX errors are non-negative integers; check for 0 explicitly
-\t * in-case the underlying strerror doesn't handle 0, some don't.
+\t * POSIX errors are non-negative integers; check for 0 explicitly incase
+\t * the underlying strerror doesn't handle 0, some historically didn't.
 \t */
 \tif (error == 0)
 \t\treturn ("Successful return: 0");
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index 623a34447a8..a9ae2a10006 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -21,8 +21,8 @@ WT_ATOMIC_ADD2
 WT_ATOMIC_CAS1
 WT_ATOMIC_CAS2
 WT_ATOMIC_FETCH_ADD1
-WT_ATOMIC_FETCH_ADD2
 WT_ATOMIC_FETCH_ADD4
+WT_ATOMIC_FETCH_ADD8
 WT_ATOMIC_STORE1
 WT_ATOMIC_STORE2
 WT_ATOMIC_SUB1
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 1ed92b79ba8..a104bb011da 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -609,6 +609,7 @@ idx
 ifdef's
 ikey
 impl
+incase
 incr
 incrementing
 indices
@@ -743,6 +744,7 @@ nop
 noraw
 notfound
 notsup
+notused
 nset
 nsnap
 nul
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index caf68364696..77061b36dcb 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -208,10 +208,12 @@ connection_stats = [
     ##########################################
     # Dhandle statistics
     ##########################################
-    DhandleStat('dh_conn_handles', 'connection dhandles swept'),
-    DhandleStat('dh_conn_ref', 'connection candidate referenced'),
-    DhandleStat('dh_conn_sweeps', 'connection sweeps'),
-    DhandleStat('dh_conn_tod', 'connection time-of-death sets'),
+    DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'),
+    DhandleStat('dh_sweep_remove',
+        'connection sweep dhandles removed from hash list'),
+    DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'),
+    DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'),
+    DhandleStat('dh_sweeps', 'connection sweeps'),
     DhandleStat('dh_session_handles', 'session dhandles swept'),
     DhandleStat('dh_session_sweeps', 'session sweep attempts'),
 
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index c88c44fb9c3..abcad392e33 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -403,7 +403,8 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
 	 */
 	if (block->ckpt_inprogress) {
 		__wt_errx(session,
-		    "%s: checkpointed without the checkpoint being resolved",
+		    "%s: checkpointed without first resolving the previous "
+		    "checkpoint",
 		    block->name);
 
 		WT_RET(__wt_block_checkpoint_resolve(session, block));
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index df42a14816f..fd00e0c7deb 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -132,8 +132,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
 	bucket = block->name_hash % WT_HASH_ARRAY_SIZE;
 	WT_CONN_BLOCK_REMOVE(conn, block, bucket);
 
-	if (block->name != NULL)
-		__wt_free(session, block->name);
+	__wt_free(session, block->name);
 
 	if (block->fh != NULL)
 		WT_TRET(__wt_close(session, &block->fh));
@@ -195,14 +194,20 @@ __wt_block_open(WT_SESSION_IMPL *session,
 		}
 	}
 
-	/* Basic structure allocation, initialization. */
+	/*
+	 * Basic structure allocation, initialization.
+	 *
+	 * Note: set the block's name-hash value before any work that can fail
+	 * because cleanup calls the block destroy code which uses that hash
+	 * value to remove the block from the underlying linked lists.
+	 */
 	WT_ERR(__wt_calloc_one(session, &block));
 	block->ref = 1;
+	block->name_hash = hash;
+	block->allocsize = allocsize;
 	WT_CONN_BLOCK_INSERT(conn, block, bucket);
 
 	WT_ERR(__wt_strdup(session, filename, &block->name));
-	block->name_hash = hash;
-	block->allocsize = allocsize;
 
 	WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
 	block->allocfirst =
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
index ef944fcb152..0d631396b41 100644
--- a/src/third_party/wiredtiger/src/block/block_read.c
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -192,21 +192,29 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	buf->size = size;
 
 	blk = WT_BLOCK_HEADER_REF(buf->mem);
-	page_cksum = blk->cksum;
-	if (page_cksum == cksum) {
+	if (blk->cksum == cksum) {
 		blk->cksum = 0;
 		page_cksum = __wt_cksum(buf->mem,
 		    F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ?
 		    size : WT_BLOCK_COMPRESS_SKIP);
 		if (page_cksum == cksum)
 			return (0);
-	}
 
-	if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
-		__wt_errx(session,
-		    "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %"
-		    PRIu32 " != %" PRIu32 "]",
-		    size, (uintmax_t)offset, cksum, page_cksum);
+		if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+			__wt_errx(session,
+			    "read checksum error for %" PRIu32 "B block at "
+			    "offset %" PRIuMAX ": calculated block checksum "
+			    "of %" PRIu32 " doesn't match expected checksum "
+			    "of %" PRIu32,
+			    size, (uintmax_t)offset, page_cksum, cksum);
+	} else
+		if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+			__wt_errx(session,
+			    "read checksum error for %" PRIu32 "B block at "
+			    "offset %" PRIuMAX ": block header checksum "
+			    "of %" PRIu32 " doesn't match expected checksum "
+			    "of %" PRIu32,
+			    size, (uintmax_t)offset, blk->cksum, cksum);
 
 	/* Panic if a checksum fails during an ordinary read. */
 	return (block->verify ||
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index f257a955801..2705f371fb5 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -81,10 +81,11 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
 	WT_PAGE *page;
 	WT_PAGE_INDEX *pindex;
 	WT_REF *couple, *couple_orig, *ref;
-	int prev, skip;
+	int empty_internal, prev, skip;
 	uint32_t slot;
 
 	btree = S2BT(session);
+	empty_internal = 0;
 
 	/*
 	 * Tree walks are special: they look inside page structures that splits
@@ -171,6 +172,15 @@ ascend:	/*
 		    (!prev && slot == pindex->entries - 1)) {
 			ref = ref->home->pg_intl_parent_ref;
 
+			/*
+			 * If we got all the way through an internal page and
+			 * all of the child pages were deleted, evict it.
+			 */
+			if (empty_internal) {
+				__wt_page_evict_soon(ref->page);
+				empty_internal = 0;
+			}
+
 			/* Optionally skip internal pages. */
 			if (LF_ISSET(WT_READ_SKIP_INTL))
 				goto ascend;
@@ -226,6 +236,13 @@ ascend:	/*
 			if (ref->pindex_hint != slot)
 				ref->pindex_hint = slot;
 
+			/*
+			 * If we see any child states other than deleted, the
+			 * page isn't empty.
+			 */
+			if (ref->state != WT_REF_DELETED)
+				empty_internal = 0;
+
 			if (LF_ISSET(WT_READ_CACHE)) {
 				/*
 				 * Only look at unlocked pages in memory:
@@ -338,10 +355,10 @@ ascend:	/*
 			 */
 descend:		couple = ref;
 			page = ref->page;
-			if (page->type == WT_PAGE_ROW_INT ||
-			    page->type == WT_PAGE_COL_INT) {
+			if (WT_PAGE_IS_INTERNAL(page)) {
 				WT_INTL_INDEX_GET(session, page, pindex);
 				slot = prev ? pindex->entries - 1 : 0;
+				empty_internal = 1;
 			} else {
 				*refp = ref;
 				goto done;
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
index 2fe09681090..fb7c9a1ce90 100644
--- a/src/third_party/wiredtiger/src/btree/col_modify.c
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -160,7 +160,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
 		 * The serial mutex acts as our memory barrier to flush these
 		 * writes before inserting them into the list.
 		 */
-		if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
+		if (cbt->ins_stack[0] == NULL || recno == 0)
 			for (i = 0; i < skipdepth; i++) {
 				cbt->ins_stack[i] = &ins_head->head[i];
 				ins->next[i] = cbt->next_stack[i] = NULL;
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index f0a10cdf528..3331632b725 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -192,7 +192,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
 		 * The serial mutex acts as our memory barrier to flush these
 		 * writes before inserting them into the list.
 		 */
-		if (WT_SKIP_FIRST(ins_head) == NULL)
+		if (cbt->ins_stack[0] == NULL)
 			for (i = 0; i < skipdepth; i++) {
 				cbt->ins_stack[i] = &ins_head->head[i];
 				ins->next[i] = cbt->next_stack[i] = NULL;
@@ -263,7 +263,6 @@ int
 __wt_update_alloc(
     WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
 {
-	WT_UPDATE *upd;
 	size_t size;
 
 	/*
@@ -271,16 +270,15 @@ __wt_update_alloc(
 	 * the value into place.
 	 */
 	size = value == NULL ? 0 : value->size;
-	WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, updp));
 	if (value == NULL)
-		WT_UPDATE_DELETED_SET(upd);
+		WT_UPDATE_DELETED_SET(*updp);
 	else {
-		upd->size = WT_STORE_SIZE(size);
-		memcpy(WT_UPDATE_DATA(upd), value->data, size);
+		(*updp)->size = WT_STORE_SIZE(size);
+		memcpy(WT_UPDATE_DATA(*updp), value->data, size);
 	}
 
-	*updp = upd;
-	*sizep = WT_UPDATE_MEMSIZE(upd);
+	*sizep = WT_UPDATE_MEMSIZE(*updp);
 	return (0);
 }
 
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 9803b924355..d83d3253c44 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -471,6 +471,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
 	WT_PAGE *page;
 	WT_PAGE_INDEX *pindex;
 	WT_REF *current, *descent;
+	uint32_t cnt;
 
 	btree = S2BT(session);
 
@@ -528,18 +529,22 @@ restart:
 
 	/*
 	 * If the tree is new (and not empty), it might have a large insert
-	 * list, pick the key in the middle of that insert list.
+	 * list. Count how many records are in the list.
 	 */
 	F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
 	if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
 		WT_ERR(WT_NOTFOUND);
-	for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) {
+	for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
 		if ((p = WT_SKIP_NEXT(p)) == NULL)
 			break;
-		if ((p = WT_SKIP_NEXT(p)) == NULL)
+
+	/*
+	 * Select a random number from 0 to (N - 1), return that record.
+	 */
+	cnt = __wt_random(&session->rnd) % cnt;
+	for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
+		if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
 			break;
-		t = WT_SKIP_NEXT(t);
-	}
 	cbt->ref = current;
 	cbt->compare = 0;
 	cbt->ins = t;
diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c
index e41e402a1fd..92f12402537 100644
--- a/src/third_party/wiredtiger/src/conn/api_strerror.c
+++ b/src/third_party/wiredtiger/src/conn/api_strerror.c
@@ -13,7 +13,7 @@
 
 /*
  * __wt_wiredtiger_error --
- *	Return a constant string for WiredTiger POSIX-standard and errors.
+ *	Return a constant string for POSIX-standard and WiredTiger errors.
  */
 const char *
 __wt_wiredtiger_error(int error)
@@ -41,8 +41,8 @@ __wt_wiredtiger_error(int error)
 	}
 
 	/*
-	 * POSIX errors are non-negative integers; check for 0 explicitly
-	 * in-case the underlying strerror doesn't handle 0, some don't.
+	 * POSIX errors are non-negative integers; check for 0 explicitly incase
+	 * the underlying strerror doesn't handle 0, some historically didn't.
 	 */
 	if (error == 0)
 		return ("Successful return: 0");
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 067ad00560e..bc96ddd117a 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1904,7 +1904,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
 	conn->hazard_max = (uint32_t)cval.val;
 
 	WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
-	conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+	conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS;
 
 	WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval));
 	conn->session_scratch_max = (size_t)cval.val;
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 8ff54ec2a6d..1ea609f6578 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -591,6 +591,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final)
 	bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
 
 	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+	WT_ASSERT(session, dhandle != conn->cache->evict_file_next);
 
 	/* Check if the handle was reacquired by a session while we waited. */
 	if (!final &&
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index de4bf7268ed..dae0293d790 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -500,7 +500,7 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
 			WT_RET(__wt_log_slot_free(session, slot));
 			if (free_i != NULL && *free_i == WT_SLOT_POOL &&
 			    slot->slot_state == WT_LOG_SLOT_FREE)
-				*free_i = save_i;
+				*free_i = written[i].slot_index;
 		}
 	}
 	return (0);
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index c4350d90adb..3f3808579a9 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -128,7 +128,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
 	 * conditional because we allocate the log path so that printlog can
 	 * run without running logging or recovery.
 	 */
-	if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+	if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
+	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
 		WT_TRET(__wt_txn_checkpoint_log(
 		    session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
 	F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index ec6f628a02e..492b89bb8a8 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -14,49 +14,44 @@
  *	handles.
  */
 static int
-__sweep_mark(WT_SESSION_IMPL *session, int *dead_handlesp)
+__sweep_mark(WT_SESSION_IMPL *session, time_t now)
 {
 	WT_CONNECTION_IMPL *conn;
 	WT_DATA_HANDLE *dhandle;
-	time_t now;
 
 	conn = S2C(session);
-	*dead_handlesp = 0;
 
-	/* Don't discard handles that have been open recently. */
-	WT_RET(__wt_seconds(session, &now));
-
-	WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
 	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
 		if (WT_IS_METADATA(dhandle))
 			continue;
-		if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
-			++*dead_handlesp;
-			continue;
-		}
-		if (dhandle->session_inuse != 0 ||
-		    now <= dhandle->timeofdeath + conn->sweep_idle_time ||
-		    conn->sweep_idle_time == 0)
-			continue;
-		if (dhandle->timeofdeath == 0) {
-			dhandle->timeofdeath = now;
-			WT_STAT_FAST_CONN_INCR(session, dh_conn_tod);
+
+		/*
+		 * There are some internal increments of the in-use count such
+		 * as eviction.  Don't keep handles alive because of those
+		 * cases, but if we see multiple cursors open, clear the time
+		 * of death.
+		 */
+		if (dhandle->session_inuse > 1)
+			dhandle->timeofdeath = 0;
+
+		if (F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
+		    dhandle->session_inuse != 0 ||
+		    dhandle->timeofdeath != 0)
 			continue;
-		}
 
-		/* We now have a candidate to close. */
-		++*dead_handlesp;
+		dhandle->timeofdeath = now;
+		WT_STAT_FAST_CONN_INCR(session, dh_sweep_tod);
 	}
 
 	return (0);
 }
 
 /*
- * __sweep_expire_handle --
+ * __sweep_expire_one --
  *	Mark a single handle dead.
  */
 static int
-__sweep_expire_handle(WT_SESSION_IMPL *session)
+__sweep_expire_one(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
 	WT_DATA_HANDLE *dhandle;
@@ -113,42 +108,31 @@ err:	WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
  *	until we have reached the configured minimum number of handles.
  */
 static int
-__sweep_expire(WT_SESSION_IMPL *session)
+__sweep_expire(WT_SESSION_IMPL *session, time_t now)
 {
 	WT_CONNECTION_IMPL *conn;
 	WT_DATA_HANDLE *dhandle;
 	WT_DECL_RET;
-	time_t now;
 
 	conn = S2C(session);
 
-	/* If sweep_idle_time is 0, then we won't expire any cursors */
-	if (conn->sweep_idle_time == 0)
-		return (0);
-
-	/* Don't discard handles that have been open recently. */
-	WT_RET(__wt_seconds(session, &now));
-
-	WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
 	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
 		/*
-		 * Ignore open files once the open file count reaches the
+		 * Ignore open files once the btree file count is below the
 		 * minimum number of handles.
 		 */
-		if (conn->open_file_count < conn->sweep_handles_min)
+		if (conn->open_btree_count < conn->sweep_handles_min)
 			break;
 
-		if (WT_IS_METADATA(dhandle))
-			continue;
-		if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
-		    F_ISSET(dhandle, WT_DHANDLE_DEAD))
-			continue;
-		if (dhandle->session_inuse != 0 ||
+		if (WT_IS_METADATA(dhandle) ||
+		    F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
+		    dhandle->session_inuse != 0 ||
+		    dhandle->timeofdeath == 0 ||
 		    now <= dhandle->timeofdeath + conn->sweep_idle_time)
 			continue;
 
 		WT_WITH_DHANDLE(session, dhandle,
-		    ret = __sweep_expire_handle(session));
+		    ret = __sweep_expire_one(session));
 		WT_RET_BUSY_OK(ret);
 	}
 
@@ -156,11 +140,12 @@ __sweep_expire(WT_SESSION_IMPL *session)
 }
 
 /*
- * __sweep_flush --
- *	Flush pages from dead trees.
+ * __sweep_discard_trees --
+ *	Discard pages from dead trees.
  */
 static int
-__sweep_flush(WT_SESSION_IMPL *session)
+__sweep_discard_trees(
+    WT_SESSION_IMPL *session, time_t now, u_int *dead_handlesp)
 {
 	WT_CONNECTION_IMPL *conn;
 	WT_DATA_HANDLE *dhandle;
@@ -168,8 +153,14 @@ __sweep_flush(WT_SESSION_IMPL *session)
 
 	conn = S2C(session);
 
-	WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
+	*dead_handlesp = 0;
+
 	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+		if (!F_ISSET(dhandle, WT_DHANDLE_OPEN | WT_DHANDLE_EXCLUSIVE) &&
+		    (dhandle->timeofdiscard == 0 ||
+		    now <= dhandle->timeofdiscard + conn->sweep_idle_time))
+			++*dead_handlesp;
+
 		if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
 		    !F_ISSET(dhandle, WT_DHANDLE_DEAD))
 			continue;
@@ -178,9 +169,12 @@ __sweep_flush(WT_SESSION_IMPL *session)
 		WT_WITH_DHANDLE(session, dhandle, ret =
 		    __wt_conn_btree_sync_and_close(session, 0, 0));
 
-		/* We closed the btree handle, bump the statistic. */
-		if (ret == 0)
-			WT_STAT_FAST_CONN_INCR(session, dh_conn_handles);
+		/* We closed the btree handle. */
+		if (ret == 0) {
+			WT_STAT_FAST_CONN_INCR(session, dh_sweep_close);
+			++*dead_handlesp;
+		} else
+			WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref);
 
 		WT_RET_BUSY_OK(ret);
 	}
@@ -189,52 +183,75 @@ __sweep_flush(WT_SESSION_IMPL *session)
 }
 
 /*
+ * __sweep_remove_one --
+ *	Remove a closed handle from the connection list.
+ */
+static int
+__sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle)
+{
+	WT_DECL_RET;
+
+	/* Try to get exclusive access. */
+	WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+
+	/*
+	 * If there are no longer any references to the handle in any
+	 * sessions, attempt to discard it.
+	 */
+	if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) ||
+	    dhandle->session_inuse != 0 || dhandle->session_ref != 0)
+		WT_ERR(EBUSY);
+
+	WT_WITH_DHANDLE(session, dhandle,
+	    ret = __wt_conn_dhandle_discard_single(session, 0, 1));
+
+	/*
+	 * If the handle was not successfully discarded, unlock it and
+	 * don't retry the discard until it times out again.
+	 */
+	if (ret != 0) {
+err:		WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+	}
+
+	return (ret);
+}
+
+/*
  * __sweep_remove_handles --
- *	Remove closed dhandles from the connection list.
+ *	Remove closed handles from the connection list.
  */
 static int
-__sweep_remove_handles(WT_SESSION_IMPL *session)
+__sweep_remove_handles(WT_SESSION_IMPL *session, time_t now)
 {
 	WT_CONNECTION_IMPL *conn;
 	WT_DATA_HANDLE *dhandle, *dhandle_next;
 	WT_DECL_RET;
 
 	conn = S2C(session);
-	dhandle = SLIST_FIRST(&conn->dhlh);
 
-	for (; dhandle != NULL; dhandle = dhandle_next) {
+	for (dhandle = SLIST_FIRST(&conn->dhlh);
+	    dhandle != NULL;
+	    dhandle = dhandle_next) {
 		dhandle_next = SLIST_NEXT(dhandle, l);
 		if (WT_IS_METADATA(dhandle))
 			continue;
-		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
-		    dhandle->session_inuse != 0 ||
-		    dhandle->session_ref != 0)
+		if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) ||
+		    dhandle->session_inuse != 0 || dhandle->session_ref != 0)
 			continue;
-
-		/* Make sure we get exclusive access. */
-		if ((ret =
-		    __wt_try_writelock(session, dhandle->rwlock)) == EBUSY)
+		if (dhandle->timeofdiscard != 0 &&
+		    now <= dhandle->timeofdiscard + conn->sweep_idle_time)
 			continue;
-		WT_RET(ret);
 
-		/*
-		 * If there are no longer any references to the handle in any
-		 * sessions, attempt to discard it.
-		 */
-		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
-		    dhandle->session_inuse != 0 || dhandle->session_ref != 0) {
-			WT_RET(__wt_writeunlock(session, dhandle->rwlock));
-			continue;
+		WT_WITH_HANDLE_LIST_LOCK(session,
+		    ret = __sweep_remove_one(session, dhandle));
+		if (ret == 0)
+			WT_STAT_FAST_CONN_INCR(
+			    session, dh_sweep_remove);
+		else {
+			WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref);
+			dhandle->timeofdiscard = now;
 		}
-
-		WT_WITH_DHANDLE(session, dhandle,
-		    ret = __wt_conn_dhandle_discard_single(session, 0, 1));
-
-		/* If the handle was not successfully discarded, unlock it. */
-		if (ret != 0)
-			WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
 		WT_RET_BUSY_OK(ret);
-		WT_STAT_FAST_CONN_INCR(session, dh_conn_ref);
 	}
 
 	return (ret == EBUSY ? 0 : ret);
@@ -250,7 +267,8 @@ __sweep_server(void *arg)
 	WT_CONNECTION_IMPL *conn;
 	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int dead_handles;
+	time_t now;
+	u_int dead_handles;
 
 	session = arg;
 	conn = S2C(session);
@@ -263,35 +281,30 @@ __sweep_server(void *arg)
 		/* Wait until the next event. */
 		WT_ERR(__wt_cond_wait(session, conn->sweep_cond,
 		    (uint64_t)conn->sweep_interval * WT_MILLION));
+		WT_ERR(__wt_seconds(session, &now));
+
+		WT_STAT_FAST_CONN_INCR(session, dh_sweeps);
 
 		/*
 		 * Mark handles with a time of death, and report whether any
-		 * handles are marked dead.
+		 * handles are marked dead.  If sweep_idle_time is 0, handles
+		 * never become idle.
 		 */
-		WT_ERR(__sweep_mark(session, &dead_handles));
+		if (conn->sweep_idle_time != 0)
+			WT_ERR(__sweep_mark(session, now));
 
 		/*
-		 * We only want to flush and expire if there are no dead handles
-		 * and if either the sweep_idle_time is not 0, or if we have
-		 * reached the configured limit of handles.
+		 * Close handles if we have reached the configured limit.
+		 * If sweep_idle_time is 0, handles never become idle.
 		 */
-		if (dead_handles == 0 &&
-		    (conn->open_file_count < conn->sweep_handles_min ||
-		    conn->sweep_idle_time != 0))
-			continue;
+		if (conn->sweep_idle_time != 0 &&
+		    conn->open_btree_count >= conn->sweep_handles_min)
+			WT_ERR(__sweep_expire(session, now));
 
-		/* Close handles if we have reached the configured limit */
-		if (conn->open_file_count >= conn->sweep_handles_min) {
-			WT_WITH_HANDLE_LIST_LOCK(session,
-			    ret = __sweep_expire(session));
-			WT_ERR(ret);
-		}
-
-		WT_ERR(__sweep_flush(session));
+		WT_ERR(__sweep_discard_trees(session, now, &dead_handles));
 
-		WT_WITH_HANDLE_LIST_LOCK(session,
-		    ret = __sweep_remove_handles(session));
-		WT_ERR(ret);
+		if (dead_handles > 0)
+			WT_ERR(__sweep_remove_handles(session, now));
 	}
 
 	if (0) {
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index 6f4d5e85f5a..7dad85e9d38 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -427,7 +427,11 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
 	else
 		namesize = (size_t)(columns - idxname);
 
-	WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx));
+	if ((ret = __wt_schema_open_index(
+	    session, table, idxname, namesize, &idx)) != 0) {
+		__wt_schema_release_table(session, table);
+		return (ret);
+	}
 	WT_RET(__wt_calloc_one(session, &cindex));
 
 	cursor = &cindex->iface;
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 513da401ae6..a03d8b9147d 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -158,7 +158,6 @@ __evict_server(void *arg)
 	WT_CACHE *cache;
 	WT_CONNECTION_IMPL *conn;
 	WT_DECL_RET;
-	WT_EVICT_WORKER *worker;
 	WT_SESSION_IMPL *session;
 
 	session = arg;
@@ -173,30 +172,6 @@ __evict_server(void *arg)
 			break;
 
 		/*
-		 * If we have caught up and there are more than the minimum
-		 * number of eviction workers running, shut one down.
-		 */
-		if (conn->evict_workers > conn->evict_workers_min) {
-			WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER,
-			    "Stopping evict worker: %"PRIu32"\n",
-			    conn->evict_workers));
-			worker = &conn->evict_workctx[--conn->evict_workers];
-			F_CLR(worker, WT_EVICT_WORKER_RUN);
-			WT_TRET(__wt_cond_signal(
-			    session, cache->evict_waiter_cond));
-			WT_TRET(__wt_thread_join(session, worker->tid));
-			/*
-			 * Flag errors here with a message, but don't shut down
-			 * the eviction server - that's fatal.
-			 */
-			WT_ASSERT(session, ret == 0);
-			if (ret != 0) {
-				(void)__wt_msg(session,
-				    "Error stopping eviction worker: %d", ret);
-				ret = 0;
-			}
-		}
-		/*
 		 * Clear the walks so we don't pin pages while asleep,
 		 * otherwise we can block applications evicting large pages.
 		 */
@@ -571,9 +546,14 @@ static int
 __evict_clear_walk(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
+	WT_CACHE *cache;
 	WT_REF *ref;
 
 	btree = S2BT(session);
+	cache = S2C(session)->cache;
+
+	if (session->dhandle == cache->evict_file_next)
+		cache->evict_file_next = NULL;
 
 	if ((ref = btree->evict_ref) == NULL)
 		return (0);
@@ -593,21 +573,17 @@ __evict_clear_walk(WT_SESSION_IMPL *session)
 static int
 __evict_clear_walks(WT_SESSION_IMPL *session)
 {
-	WT_CACHE *cache;
 	WT_CONNECTION_IMPL *conn;
 	WT_DECL_RET;
 	WT_SESSION_IMPL *s;
 	u_int i, session_cnt;
 
 	conn = S2C(session);
-	cache = conn->cache;
 
 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
 	for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
 		if (!s->active || !F_ISSET(s, WT_SESSION_CLEAR_EVICT_WALK))
 			continue;
-		if (s->dhandle == cache->evict_file_next)
-			cache->evict_file_next = NULL;
 		WT_WITH_DHANDLE(
 		    session, s->dhandle, WT_TRET(__evict_clear_walk(session)));
 	}
@@ -631,7 +607,8 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session)
 
 	F_SET(session, WT_SESSION_CLEAR_EVICT_WALK);
 
-	while (btree->evict_ref != NULL && ret == 0) {
+	while (ret == 0 && (btree->evict_ref != NULL ||
+	    cache->evict_file_next == session->dhandle)) {
 		F_SET(cache, WT_CACHE_CLEAR_WALKS);
 		ret = __wt_cond_wait(
 		    session, cache->evict_waiter_cond, 100000);
@@ -982,9 +959,17 @@ retry:	while (slot < max_entries && ret == 0) {
 			dhandle_locked = 1;
 		}
 
-		if (dhandle == NULL)
-			dhandle = SLIST_FIRST(&conn->dhlh);
-		else {
+		if (dhandle == NULL) {
+			/*
+			 * On entry, continue from wherever we got to in the
+			 * scan last time through.  If we don't have a saved
+			 * handle, start from the beginning of the list.
+			 */
+			if ((dhandle = cache->evict_file_next) != NULL)
+				cache->evict_file_next = NULL;
+			else
+				dhandle = SLIST_FIRST(&conn->dhlh);
+		} else {
 			if (incr) {
 				WT_ASSERT(session, dhandle->session_inuse > 0);
 				(void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1);
@@ -1002,15 +987,6 @@ retry:	while (slot < max_entries && ret == 0) {
 		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
 			continue;
 
-		/*
-		 * Each time we reenter this function, start at the next handle
-		 * on the list.
-		 */
-		if (cache->evict_file_next != NULL &&
-		    cache->evict_file_next != dhandle)
-			continue;
-		cache->evict_file_next = NULL;
-
 		/* Skip files that don't allow eviction. */
 		btree = dhandle->handle;
 		if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
@@ -1071,6 +1047,9 @@ retry:	while (slot < max_entries && ret == 0) {
 	}
 
 	if (incr) {
+		/* Remember the file we should visit first, next loop. */
+		cache->evict_file_next = dhandle;
+
 		WT_ASSERT(session, dhandle->session_inuse > 0);
 		(void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1);
 		incr = 0;
@@ -1084,21 +1063,17 @@ retry:	while (slot < max_entries && ret == 0) {
 	/*
 	 * Walk the list of files a few times if we don't find enough pages.
 	 * Try two passes through all the files, give up when we have some
-	 * candidates and we aren't finding more.  Take care not to skip files
-	 * on subsequent passes.
+	 * candidates and we aren't finding more.
 	 */
 	if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 &&
 	    slot < max_entries && (retries < 2 ||
 	    (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 &&
 	    (slot == cache->evict_entries || slot > start_slot)))) {
-		cache->evict_file_next = NULL;
 		start_slot = slot;
 		++retries;
 		goto retry;
 	}
 
-	/* Remember the file we should visit first, next loop. */
-	cache->evict_file_next = dhandle;
 	cache->evict_entries = slot;
 	return (ret);
 }
@@ -1270,6 +1245,9 @@ fast:		/* If the page can't be evicted, give up. */
 		WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
 		    "select: %p, size %" PRIu64, page, page->memory_footprint));
 	}
+	WT_RET_NOTFOUND_OK(ret);
+
+	*slotp += (u_int)(evict - start);
 
 	/*
 	 * If we happen to end up on the root page, clear it.  We have to track
@@ -1282,16 +1260,12 @@ fast:		/* If the page can't be evicted, give up. */
 	if ((ref = btree->evict_ref) != NULL && (__wt_ref_is_root(ref) ||
 	    ref->page->read_gen == WT_READGEN_OLDEST)) {
 		btree->evict_ref = NULL;
-		__wt_page_release(session, ref, WT_READ_NO_EVICT);
+		WT_RET(__wt_page_release(session, ref, WT_READ_NO_EVICT));
 	}
 
-	/* If the walk was interrupted by a locked page, that's okay. */
-	if (ret == WT_NOTFOUND)
-		ret = 0;
-
-	*slotp += (u_int)(evict - start);
 	WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
-	return (ret);
+
+	return (0);
 }
 
 /*
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index d13ec1972fb..da014a14e35 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1027,29 +1027,6 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
 	}
 
 	/*
-	 * If we aren't (potentially) doing eviction that can restore updates
-	 * and the updates on this page are too recent, give up.
-	 *
-	 * Don't rely on new updates being skipped by the transaction used
-	 * for transaction reads: (1) there are paths that dirty pages for
-	 * artificial reasons; (2) internal pages aren't transactional; and
-	 * (3) if an update was skipped during the checkpoint (leaving the page
-	 * dirty), then rolled back, we could still successfully overwrite a
-	 * page and corrupt the checkpoint.
-	 *
-	 * Further, we can't race with the checkpoint's reconciliation of
-	 * an internal page as we evict a clean child from the page's subtree.
-	 * This works in the usual way: eviction locks the page and then checks
-	 * for existing hazard pointers, the checkpoint thread reconciling an
-	 * internal page acquires hazard pointers on child pages it reads, and
-	 * is blocked by the exclusive lock.
-	 */
-	if (page->read_gen != WT_READGEN_OLDEST &&
-	    !__wt_txn_visible_all(session, __wt_page_is_modified(page) ?
-	    mod->update_txn : mod->rec_max_txn))
-		return (0);
-
-	/*
 	 * If the page was recently split in-memory, don't force it out: we
 	 * hope an eviction thread will find it first.  The check here is
 	 * similar to __wt_txn_visible_all, but ignores the checkpoint's
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index cd55aadfc07..06a020b80e8 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -100,10 +100,10 @@ struct __wt_named_extractor {
 };
 
 /*
- * Allocate some additional slots for internal sessions.  There is a default
- * session for each connection, plus a session for each server thread.
+ * Allocate some additional slots for internal sessions so the user cannot
+ * configure too few sessions for us to run.
  */
-#define	WT_NUM_INTERNAL_SESSIONS	10
+#define	WT_EXTRA_INTERNAL_SESSIONS	10
 
 /*
  * WT_CONN_CHECK_PANIC --
@@ -325,7 +325,8 @@ struct __wt_connection_impl {
 #define	WT_CONN_LOG_ENABLED	0x02	/* Logging is enabled */
 #define	WT_CONN_LOG_EXISTED	0x04	/* Log files found */
 #define	WT_CONN_LOG_PREALLOC	0x08	/* Pre-allocation is enabled */
-#define	WT_CONN_LOG_RECOVER_ERR	0x10	/* Error if recovery required */
+#define	WT_CONN_LOG_RECOVER_DONE	0x10	/* Recovery completed */
+#define	WT_CONN_LOG_RECOVER_ERR	0x20	/* Error if recovery required */
 	uint32_t	 log_flags;	/* Global logging configuration */
 	WT_CONDVAR	*log_cond;	/* Log server wait mutex */
 	WT_SESSION_IMPL *log_session;	/* Log server session */
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index 9e592ede450..143a8e87449 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -187,6 +187,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
 	if (reenter)
 		WT_RET(__curfile_leave(cbt));
 
+	/*
+	 * Any old insert position is now invalid.  We rely on this being
+	 * cleared to detect if a new skiplist is installed after a search.
+	 */
+	cbt->ins_stack[0] = NULL;
+
 	/* If the transaction is idle, check that the cache isn't full. */
 	WT_RET(__wt_txn_idle_cache_check(session));
 
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
index 22a0a2c1dd4..8bb649513c7 100644
--- a/src/third_party/wiredtiger/src/include/dhandle.h
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -45,6 +45,7 @@ struct __wt_data_handle {
 	uint32_t session_ref;		/* Sessions referencing this handle */
 	int32_t	 session_inuse;		/* Sessions using this handle */
 	time_t	 timeofdeath;		/* Use count went to 0 */
+	time_t	 timeofdiscard;		/* Time of last failed discard */
 
 	uint64_t name_hash;		/* Hash of name */
 	const char *name;		/* Object name as a URI */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 0826fa7b10b..e98545c3466 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -640,8 +640,8 @@ extern uint32_t __wt_nlpo2(uint32_t v);
 extern uint32_t __wt_log2_int(uint32_t n);
 extern int __wt_ispo2(uint32_t v);
 extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
-extern void __wt_random_init(uint64_t volatile *rnd_state);
-extern uint32_t __wt_random(uint64_t volatile *rnd_state);
+extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
+extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
 extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
 extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4)));
 extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4)));
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 95c43f6772d..7fb6ae13d38 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -255,3 +255,11 @@
 #define	__wt_page_swap(session, held, want, flags)			\
 	__wt_page_swap_func(session, held, want, flags)
 #endif
+
+/* Random number generator state. */
+union __wt_rand_state {
+	uint64_t v;
+	struct {
+		uint32_t w, z;
+	} x;
+};
diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h
index 7a5028d6a28..7d901a38d0d 100644
--- a/src/third_party/wiredtiger/src/include/mutex.h
+++ b/src/third_party/wiredtiger/src/include/mutex.h
@@ -24,24 +24,20 @@ struct __wt_condvar {
 
 /*
  * !!!
- * Don't touch this structure without understanding the read/write
- * locking functions.
+ * Don't modify this structure without understanding the read/write locking
+ * functions.
  */
-typedef union {			/* Read/write lock */
-#ifdef WORDS_BIGENDIAN
-	WiredTiger read/write locks require modification for big-endian systems.
-#else
+typedef union {				/* Read/write lock */
 	uint64_t u;
 	struct {
-		uint32_t us;
+		uint32_t wr;		/* Writers and readers */
 	} i;
 	struct {
-		uint16_t writers;
-		uint16_t readers;
-		uint16_t users;
-		uint16_t pad;
+		uint16_t writers;	/* Now serving for writers */
+		uint16_t readers;	/* Now serving for readers */
+		uint16_t users;		/* Next available ticket number */
+		uint16_t __notused;	/* Padding */
 	} s;
-#endif
 } wt_rwlock_t;
 
 /*
diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h
index ba5d95657d5..edb59b0f521 100644
--- a/src/third_party/wiredtiger/src/include/os.h
+++ b/src/third_party/wiredtiger/src/include/os.h
@@ -56,7 +56,7 @@ typedef enum {
 		case EMFILE:						\
 		case ENFILE:						\
 		case ENOSPC:						\
-			__wt_sleep(0L, 500000L);			\
+			__wt_sleep(0L, 50000L);				\
 			continue;					\
 		default:						\
 			break;						\
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 9e6b0f7916c..0fc23348800 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -30,11 +30,11 @@ __page_write_gen_wrapped_check(WT_PAGE *page)
 }
 
 /*
- * __insert_serial_func --
- *	Worker function to add a WT_INSERT entry to a skiplist.
+ * __insert_simple_func --
+ *	Worker function to add a WT_INSERT entry to the middle of a skiplist.
  */
 static inline int
-__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
+__insert_simple_func(WT_SESSION_IMPL *session,
     WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
 {
 	u_int i;
@@ -42,31 +42,62 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
 	WT_UNUSED(session);
 
 	/*
-	 * Confirm we are still in the expected position, and no item has been
-	 * added where our insert belongs.  Take extra care at the beginning
-	 * and end of the list (at each level): retry if we race there.
+	 * Update the skiplist elements referencing the new WT_INSERT item.
+	 * If we fail connecting one of the upper levels in the skiplist,
+	 * return success: the levels we updated are correct and sufficient.
+	 * Even though we don't get the benefit of the memory we allocated,
+	 * we can't roll back.
 	 *
-	 * !!!
-	 * Note the test for ins_stack[0] == NULL: that's the test for an
-	 * uninitialized cursor, ins_stack[0] is cleared as part of
-	 * initializing a cursor for a search.
+	 * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here, our callers depend on
+	 * it.  Don't pass complex arguments to the macro, some implementations
+	 * read the old value multiple times.
 	 */
 	for (i = 0; i < skipdepth; i++) {
-		if (ins_stack[i] == NULL ||
-		    *ins_stack[i] != new_ins->next[i])
-			return (WT_RESTART);
-		if (new_ins->next[i] == NULL &&
-		    ins_head->tail[i] != NULL &&
-		    ins_stack[i] != &ins_head->tail[i]->next[i])
-			return (WT_RESTART);
+		WT_INSERT *old_ins = *ins_stack[i];
+		if (old_ins != new_ins->next[i] ||
+		    !WT_ATOMIC_CAS8(*ins_stack[i], old_ins, new_ins))
+			return (i == 0 ? WT_RESTART : 0);
 	}
 
-	/* Update the skiplist elements referencing the new WT_INSERT item. */
+	return (0);
+}
+
+/*
+ * __insert_serial_func --
+ *	Worker function to add a WT_INSERT entry to a skiplist.
+ */
+static inline int
+__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
+    WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
+{
+	u_int i;
+
+	/* The cursor should be positioned. */
+	WT_ASSERT(session, ins_stack[0] != NULL);
+
+	/*
+	 * Update the skiplist elements referencing the new WT_INSERT item.
+	 *
+	 * Confirm we are still in the expected position, and no item has been
+	 * added where our insert belongs.  If we fail connecting one of the
+	 * upper levels in the skiplist, return success: the levels we updated
+	 * are correct and sufficient. Even though we don't get the benefit of
+	 * the memory we allocated, we can't roll back.
+	 *
+	 * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here, our callers depend on
+	 * it.  Don't pass complex arguments to the macro, some implementations
+	 * read the old value multiple times.
+	 */
 	for (i = 0; i < skipdepth; i++) {
+		WT_INSERT *old_ins = *ins_stack[i];
+		if (old_ins != new_ins->next[i] ||
+		    !WT_ATOMIC_CAS8(*ins_stack[i], old_ins, new_ins))
+			return (i == 0 ? WT_RESTART : 0);
 		if (ins_head->tail[i] == NULL ||
 		    ins_stack[i] == &ins_head->tail[i]->next[i])
 			ins_head->tail[i] = new_ins;
-		*ins_stack[i] = new_ins;
 	}
 
 	return (0);
@@ -128,20 +159,20 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 	WT_INSERT *new_ins = *new_insp;
 	WT_DECL_RET;
 
-	/* Clear references to memory we now own. */
-	*new_insp = NULL;
-
 	/* Check for page write generation wrap. */
 	WT_RET(__page_write_gen_wrapped_check(page));
 
+	/* Clear references to memory we now own and must free on error. */
+	*new_insp = NULL;
+
 	/* Acquire the page's spinlock, call the worker function. */
 	WT_PAGE_LOCK(session, page);
 	ret = __col_append_serial_func(
 	    session, ins_head, ins_stack, new_ins, recnop, skipdepth);
 	WT_PAGE_UNLOCK(session, page);
 
-	/* Free unused memory on error. */
 	if (ret != 0) {
+		/* Free unused memory on error. */
 		__wt_free(session, new_ins);
 		return (ret);
 	}
@@ -171,21 +202,32 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 {
 	WT_INSERT *new_ins = *new_insp;
 	WT_DECL_RET;
-
-	/* Clear references to memory we now own. */
-	*new_insp = NULL;
+	int simple;
+	u_int i;
 
 	/* Check for page write generation wrap. */
 	WT_RET(__page_write_gen_wrapped_check(page));
 
-	/* Acquire the page's spinlock, call the worker function. */
-	WT_PAGE_LOCK(session, page);
-	ret = __insert_serial_func(
-	    session, ins_head, ins_stack, new_ins, skipdepth);
-	WT_PAGE_UNLOCK(session, page);
+	/* Clear references to memory we now own and must free on error. */
+	*new_insp = NULL;
+
+	simple = 1;
+	for (i = 0; i < skipdepth; i++)
+		if (new_ins->next[i] == NULL)
+			simple = 0;
+
+	if (simple)
+		ret = __insert_simple_func(
+		    session, ins_stack, new_ins, skipdepth);
+	else {
+		WT_PAGE_LOCK(session, page);
+		ret = __insert_serial_func(
+		    session, ins_head, ins_stack, new_ins, skipdepth);
+		WT_PAGE_UNLOCK(session, page);
+	}
 
-	/* Free unused memory on error. */
 	if (ret != 0) {
+		/* Free unused memory on error. */
 		__wt_free(session, new_ins);
 		return (ret);
 	}
@@ -215,17 +257,19 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 	WT_DECL_RET;
 	WT_UPDATE *obsolete, *upd = *updp;
 
-	/* Clear references to memory we now own. */
-	*updp = NULL;
-
 	/* Check for page write generation wrap. */
 	WT_RET(__page_write_gen_wrapped_check(page));
 
+	/* Clear references to memory we now own and must free on error. */
+	*updp = NULL;
+
 	/*
+	 * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here, our callers depend on
+	 * it.
+	 *
 	 * Swap the update into place.  If that fails, a new update was added
-	 * after our search, we raced.  Check if our update is still permitted,
-	 * and if it is, do a full-barrier to ensure the update's next pointer
-	 * is set before we update the linked list and try again.
+	 * after our search, we raced.  Check if our update is still permitted.
 	 */
 	while (!WT_ATOMIC_CAS8(*srch_upd, upd->next, upd)) {
 		if ((ret = __wt_txn_update_check(
@@ -234,7 +278,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 			__wt_free(session, upd);
 			return (ret);
 		}
-		WT_WRITE_BARRIER();
 	}
 
 	/*
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index bf1aa98d8d3..f32da177bf9 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -148,7 +148,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
 #define	WT_SESSION_CLEAR_SIZE(s)					\
 	(WT_PTRDIFF(&(s)->rnd, s))
 
-	uint64_t rnd;			/* Random number generation state */
+	WT_RAND_STATE rnd;		/* Random number generation state */
 
 					/* Hashed handle reference list array */
 	SLIST_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash;
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 6dc9282a613..d99d70b6d23 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -196,12 +196,13 @@ struct __wt_connection_stats {
 	WT_STATS cursor_search;
 	WT_STATS cursor_search_near;
 	WT_STATS cursor_update;
-	WT_STATS dh_conn_handles;
-	WT_STATS dh_conn_ref;
-	WT_STATS dh_conn_sweeps;
-	WT_STATS dh_conn_tod;
 	WT_STATS dh_session_handles;
 	WT_STATS dh_session_sweeps;
+	WT_STATS dh_sweep_close;
+	WT_STATS dh_sweep_ref;
+	WT_STATS dh_sweep_remove;
+	WT_STATS dh_sweep_tod;
+	WT_STATS dh_sweeps;
 	WT_STATS file_open;
 	WT_STATS log_buffer_size;
 	WT_STATS log_bytes_payload;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index a9b54d26e47..a8e052ec5eb 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -377,7 +377,8 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
 		do {
 			txn_state->id = txn->id = txn_global->current;
 		} while (!WT_ATOMIC_CAS8(
-		    txn_global->current, txn->id, txn->id + 1));
+		    txn_global->current, txn->id, txn->id + 1) ||
+			WT_TXNID_LT(txn->id, txn_global->last_running));
 
 		/*
 		 * If we have used 64-bits of transaction IDs, there is nothing
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index e8f3b9958ce..80e7d0fcacc 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -3670,162 +3670,164 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
 #define	WT_STAT_CONN_CURSOR_SEARCH_NEAR			1062
 /*! cursor: cursor update calls */
 #define	WT_STAT_CONN_CURSOR_UPDATE			1063
-/*! data-handle: connection dhandles swept */
-#define	WT_STAT_CONN_DH_CONN_HANDLES			1064
-/*! data-handle: connection candidate referenced */
-#define	WT_STAT_CONN_DH_CONN_REF			1065
-/*! data-handle: connection sweeps */
-#define	WT_STAT_CONN_DH_CONN_SWEEPS			1066
-/*! data-handle: connection time-of-death sets */
-#define	WT_STAT_CONN_DH_CONN_TOD			1067
 /*! data-handle: session dhandles swept */
-#define	WT_STAT_CONN_DH_SESSION_HANDLES			1068
+#define	WT_STAT_CONN_DH_SESSION_HANDLES			1064
 /*! data-handle: session sweep attempts */
-#define	WT_STAT_CONN_DH_SESSION_SWEEPS			1069
+#define	WT_STAT_CONN_DH_SESSION_SWEEPS			1065
+/*! data-handle: connection sweep dhandles closed */
+#define	WT_STAT_CONN_DH_SWEEP_CLOSE			1066
+/*! data-handle: connection sweep candidate became referenced */
+#define	WT_STAT_CONN_DH_SWEEP_REF			1067
+/*! data-handle: connection sweep dhandles removed from hash list */
+#define	WT_STAT_CONN_DH_SWEEP_REMOVE			1068
+/*! data-handle: connection sweep time-of-death sets */
+#define	WT_STAT_CONN_DH_SWEEP_TOD			1069
+/*! data-handle: connection sweeps */
+#define	WT_STAT_CONN_DH_SWEEPS				1070
 /*! connection: files currently open */
-#define	WT_STAT_CONN_FILE_OPEN				1070
+#define	WT_STAT_CONN_FILE_OPEN				1071
 /*! log: total log buffer size */
-#define	WT_STAT_CONN_LOG_BUFFER_SIZE			1071
+#define	WT_STAT_CONN_LOG_BUFFER_SIZE			1072
 /*! log: log bytes of payload data */
-#define	WT_STAT_CONN_LOG_BYTES_PAYLOAD			1072
+#define	WT_STAT_CONN_LOG_BYTES_PAYLOAD			1073
 /*! log: log bytes written */
-#define	WT_STAT_CONN_LOG_BYTES_WRITTEN			1073
+#define	WT_STAT_CONN_LOG_BYTES_WRITTEN			1074
 /*! log: yields waiting for previous log file close */
-#define	WT_STAT_CONN_LOG_CLOSE_YIELDS			1074
+#define	WT_STAT_CONN_LOG_CLOSE_YIELDS			1075
 /*! log: total size of compressed records */
-#define	WT_STAT_CONN_LOG_COMPRESS_LEN			1075
+#define	WT_STAT_CONN_LOG_COMPRESS_LEN			1076
 /*! log: total in-memory size of compressed records */
-#define	WT_STAT_CONN_LOG_COMPRESS_MEM			1076
+#define	WT_STAT_CONN_LOG_COMPRESS_MEM			1077
 /*! log: log records too small to compress */
-#define	WT_STAT_CONN_LOG_COMPRESS_SMALL			1077
+#define	WT_STAT_CONN_LOG_COMPRESS_SMALL			1078
 /*! log: log records not compressed */
-#define	WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS		1078
+#define	WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS		1079
 /*! log: log records compressed */
-#define	WT_STAT_CONN_LOG_COMPRESS_WRITES		1079
+#define	WT_STAT_CONN_LOG_COMPRESS_WRITES		1080
 /*! log: maximum log file size */
-#define	WT_STAT_CONN_LOG_MAX_FILESIZE			1080
+#define	WT_STAT_CONN_LOG_MAX_FILESIZE			1081
 /*! log: pre-allocated log files prepared */
-#define	WT_STAT_CONN_LOG_PREALLOC_FILES			1081
+#define	WT_STAT_CONN_LOG_PREALLOC_FILES			1082
 /*! log: number of pre-allocated log files to create */
-#define	WT_STAT_CONN_LOG_PREALLOC_MAX			1082
+#define	WT_STAT_CONN_LOG_PREALLOC_MAX			1083
 /*! log: pre-allocated log files used */
-#define	WT_STAT_CONN_LOG_PREALLOC_USED			1083
+#define	WT_STAT_CONN_LOG_PREALLOC_USED			1084
 /*! log: log release advances write LSN */
-#define	WT_STAT_CONN_LOG_RELEASE_WRITE_LSN		1084
+#define	WT_STAT_CONN_LOG_RELEASE_WRITE_LSN		1085
 /*! log: records processed by log scan */
-#define	WT_STAT_CONN_LOG_SCAN_RECORDS			1085
+#define	WT_STAT_CONN_LOG_SCAN_RECORDS			1086
 /*! log: log scan records requiring two reads */
-#define	WT_STAT_CONN_LOG_SCAN_REREADS			1086
+#define	WT_STAT_CONN_LOG_SCAN_REREADS			1087
 /*! log: log scan operations */
-#define	WT_STAT_CONN_LOG_SCANS				1087
+#define	WT_STAT_CONN_LOG_SCANS				1088
 /*! log: consolidated slot closures */
-#define	WT_STAT_CONN_LOG_SLOT_CLOSES			1088
+#define	WT_STAT_CONN_LOG_SLOT_CLOSES			1089
 /*! log: written slots coalesced */
-#define	WT_STAT_CONN_LOG_SLOT_COALESCED			1089
+#define	WT_STAT_CONN_LOG_SLOT_COALESCED			1090
 /*! log: logging bytes consolidated */
-#define	WT_STAT_CONN_LOG_SLOT_CONSOLIDATED		1090
+#define	WT_STAT_CONN_LOG_SLOT_CONSOLIDATED		1091
 /*! log: consolidated slot joins */
-#define	WT_STAT_CONN_LOG_SLOT_JOINS			1091
+#define	WT_STAT_CONN_LOG_SLOT_JOINS			1092
 /*! log: consolidated slot join races */
-#define	WT_STAT_CONN_LOG_SLOT_RACES			1092
+#define	WT_STAT_CONN_LOG_SLOT_RACES			1093
 /*! log: record size exceeded maximum */
-#define	WT_STAT_CONN_LOG_SLOT_TOOBIG			1093
+#define	WT_STAT_CONN_LOG_SLOT_TOOBIG			1094
 /*! log: failed to find a slot large enough for record */
-#define	WT_STAT_CONN_LOG_SLOT_TOOSMALL			1094
+#define	WT_STAT_CONN_LOG_SLOT_TOOSMALL			1095
 /*! log: consolidated slot join transitions */
-#define	WT_STAT_CONN_LOG_SLOT_TRANSITIONS		1095
+#define	WT_STAT_CONN_LOG_SLOT_TRANSITIONS		1096
 /*! log: log sync operations */
-#define	WT_STAT_CONN_LOG_SYNC				1096
+#define	WT_STAT_CONN_LOG_SYNC				1097
 /*! log: log sync_dir operations */
-#define	WT_STAT_CONN_LOG_SYNC_DIR			1097
+#define	WT_STAT_CONN_LOG_SYNC_DIR			1098
 /*! log: log server thread advances write LSN */
-#define	WT_STAT_CONN_LOG_WRITE_LSN			1098
+#define	WT_STAT_CONN_LOG_WRITE_LSN			1099
 /*! log: log write operations */
-#define	WT_STAT_CONN_LOG_WRITES				1099
+#define	WT_STAT_CONN_LOG_WRITES				1100
 /*! LSM: sleep for LSM checkpoint throttle */
-#define	WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE		1100
+#define	WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE		1101
 /*! LSM: sleep for LSM merge throttle */
-#define	WT_STAT_CONN_LSM_MERGE_THROTTLE			1101
+#define	WT_STAT_CONN_LSM_MERGE_THROTTLE			1102
 /*! LSM: rows merged in an LSM tree */
-#define	WT_STAT_CONN_LSM_ROWS_MERGED			1102
+#define	WT_STAT_CONN_LSM_ROWS_MERGED			1103
 /*! LSM: application work units currently queued */
-#define	WT_STAT_CONN_LSM_WORK_QUEUE_APP			1103
+#define	WT_STAT_CONN_LSM_WORK_QUEUE_APP			1104
 /*! LSM: merge work units currently queued */
-#define	WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER		1104
+#define	WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER		1105
 /*! LSM: tree queue hit maximum */
-#define	WT_STAT_CONN_LSM_WORK_QUEUE_MAX			1105
+#define	WT_STAT_CONN_LSM_WORK_QUEUE_MAX			1106
 /*! LSM: switch work units currently queued */
-#define	WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH		1106
+#define	WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH		1107
 /*! LSM: tree maintenance operations scheduled */
-#define	WT_STAT_CONN_LSM_WORK_UNITS_CREATED		1107
+#define	WT_STAT_CONN_LSM_WORK_UNITS_CREATED		1108
 /*! LSM: tree maintenance operations discarded */
-#define	WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED		1108
+#define	WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED		1109
 /*! LSM: tree maintenance operations executed */
-#define	WT_STAT_CONN_LSM_WORK_UNITS_DONE		1109
+#define	WT_STAT_CONN_LSM_WORK_UNITS_DONE		1110
 /*! connection: memory allocations */
-#define	WT_STAT_CONN_MEMORY_ALLOCATION			1110
+#define	WT_STAT_CONN_MEMORY_ALLOCATION			1111
 /*! connection: memory frees */
-#define	WT_STAT_CONN_MEMORY_FREE			1111
+#define	WT_STAT_CONN_MEMORY_FREE			1112
 /*! connection: memory re-allocations */
-#define	WT_STAT_CONN_MEMORY_GROW			1112
+#define	WT_STAT_CONN_MEMORY_GROW			1113
 /*! thread-yield: page acquire busy blocked */
-#define	WT_STAT_CONN_PAGE_BUSY_BLOCKED			1113
+#define	WT_STAT_CONN_PAGE_BUSY_BLOCKED			1114
 /*! thread-yield: page acquire eviction blocked */
-#define	WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED	1114
+#define	WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED	1115
 /*! thread-yield: page acquire locked blocked */
-#define	WT_STAT_CONN_PAGE_LOCKED_BLOCKED		1115
+#define	WT_STAT_CONN_PAGE_LOCKED_BLOCKED		1116
 /*! thread-yield: page acquire read blocked */
-#define	WT_STAT_CONN_PAGE_READ_BLOCKED			1116
+#define	WT_STAT_CONN_PAGE_READ_BLOCKED			1117
 /*! thread-yield: page acquire time sleeping (usecs) */
-#define	WT_STAT_CONN_PAGE_SLEEP				1117
+#define	WT_STAT_CONN_PAGE_SLEEP				1118
 /*! connection: total read I/Os */
-#define	WT_STAT_CONN_READ_IO				1118
+#define	WT_STAT_CONN_READ_IO				1119
 /*! reconciliation: page reconciliation calls */
-#define	WT_STAT_CONN_REC_PAGES				1119
+#define	WT_STAT_CONN_REC_PAGES				1120
 /*! reconciliation: page reconciliation calls for eviction */
-#define	WT_STAT_CONN_REC_PAGES_EVICTION			1120
+#define	WT_STAT_CONN_REC_PAGES_EVICTION			1121
 /*! reconciliation: split bytes currently awaiting free */
-#define	WT_STAT_CONN_REC_SPLIT_STASHED_BYTES		1121
+#define	WT_STAT_CONN_REC_SPLIT_STASHED_BYTES		1122
 /*! reconciliation: split objects currently awaiting free */
-#define	WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS		1122
+#define	WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS		1123
 /*! connection: pthread mutex shared lock read-lock calls */
-#define	WT_STAT_CONN_RWLOCK_READ			1123
+#define	WT_STAT_CONN_RWLOCK_READ			1124
 /*! connection: pthread mutex shared lock write-lock calls */
-#define	WT_STAT_CONN_RWLOCK_WRITE			1124
+#define	WT_STAT_CONN_RWLOCK_WRITE			1125
 /*! session: open cursor count */
-#define	WT_STAT_CONN_SESSION_CURSOR_OPEN		1125
+#define	WT_STAT_CONN_SESSION_CURSOR_OPEN		1126
 /*! session: open session count */
-#define	WT_STAT_CONN_SESSION_OPEN			1126
+#define	WT_STAT_CONN_SESSION_OPEN			1127
 /*! transaction: transaction begins */
-#define	WT_STAT_CONN_TXN_BEGIN				1127
+#define	WT_STAT_CONN_TXN_BEGIN				1128
 /*! transaction: transaction checkpoints */
-#define	WT_STAT_CONN_TXN_CHECKPOINT			1128
+#define	WT_STAT_CONN_TXN_CHECKPOINT			1129
 /*! transaction: transaction checkpoint generation */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_GENERATION		1129
+#define	WT_STAT_CONN_TXN_CHECKPOINT_GENERATION		1130
 /*! transaction: transaction checkpoint currently running */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_RUNNING		1130
+#define	WT_STAT_CONN_TXN_CHECKPOINT_RUNNING		1131
 /*! transaction: transaction checkpoint max time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX		1131
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX		1132
 /*! transaction: transaction checkpoint min time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN		1132
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN		1133
 /*! transaction: transaction checkpoint most recent time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT		1133
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT		1134
 /*! transaction: transaction checkpoint total time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL		1134
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL		1135
 /*! transaction: transactions committed */
-#define	WT_STAT_CONN_TXN_COMMIT				1135
+#define	WT_STAT_CONN_TXN_COMMIT				1136
 /*! transaction: transaction failures due to cache overflow */
-#define	WT_STAT_CONN_TXN_FAIL_CACHE			1136
+#define	WT_STAT_CONN_TXN_FAIL_CACHE			1137
 /*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define	WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE	1137
+#define	WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE	1138
 /*! transaction: transaction range of IDs currently pinned */
-#define	WT_STAT_CONN_TXN_PINNED_RANGE			1138
+#define	WT_STAT_CONN_TXN_PINNED_RANGE			1139
 /*! transaction: transactions rolled back */
-#define	WT_STAT_CONN_TXN_ROLLBACK			1139
+#define	WT_STAT_CONN_TXN_ROLLBACK			1140
 /*! transaction: transaction sync calls */
-#define	WT_STAT_CONN_TXN_SYNC				1140
+#define	WT_STAT_CONN_TXN_SYNC				1141
 /*! connection: total write I/Os */
-#define	WT_STAT_CONN_WRITE_IO				1141
+#define	WT_STAT_CONN_WRITE_IO				1142
 
 /*!
  * @}
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index b876a2d032d..64e29e104bc 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -266,6 +266,8 @@ struct __wt_upd_skipped;
     typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
 struct __wt_update;
     typedef struct __wt_update WT_UPDATE;
+union __wt_rand_state;
+    typedef union __wt_rand_state WT_RAND_STATE;
 /*
  * Forward type declarations for internal types: END
  * DO NOT EDIT: automatically built by dist/s_typedef.
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
index cdd4f8a24e1..df558b12bef 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -38,6 +38,78 @@
  * Joseph Seigh. Note that a similar (but not identical) algorithm was published
  * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
  * Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ *
+ * The following is an explanation of this code. First, the underlying lock
+ * structure.
+ *
+ *	struct {
+ *		uint16_t writers;	Now serving for writers
+ *		uint16_t readers;	Now serving for readers
+ *		uint16_t users;		Next available ticket number
+ *		uint16_t __notused;	Padding
+ *	}
+ *
+ * First, imagine a store's 'take a number' ticket algorithm. A customer takes
+ * a unique ticket number and customers are served in ticket order. In the data
+ * structure, 'writers' is the next writer to be served, 'readers' is the next
+ * reader to be served, and 'users' is the next available ticket number.
+ *
+ * Next, consider exclusive (write) locks. The 'now serving' number for writers
+ * is 'writers'. To lock, 'take a number' and wait until that number is being
+ * served; more specifically, atomically copy and increment the current value of
+ * 'users', and then wait until 'writers' equals that copied number.
+ *
+ * Shared (read) locks are similar. Like writers, readers atomically get the
+ * next number available. However, instead of waiting for 'writers' to equal
+ * their number, they wait for 'readers' to equal their number.
+ *
+ * This has the effect of queuing lock requests in the order they arrive
+ * (incidentally avoiding starvation).
+ *
+ * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
+ * In the case of a reader, the 'readers' increment happens when the reader
+ * acquires the lock (to allow read-lock sharing), and the 'writers' increment
+ * happens when the reader releases the lock. In the case of a writer, both
+ * 'readers' and 'writers' are incremented when the writer releases the lock.
+ *
+ * For example, consider the following read (R) and write (W) lock requests:
+ *
+ *						writers	readers	users
+ *						0	0	0
+ *	R: ticket 0, readers match	OK	0	1	1
+ *	R: ticket 1, readers match	OK	0	2	2
+ *	R: ticket 2, readers match	OK	0	3	3
+ *	W: ticket 3, writers no match	block	0	3	4
+ *	R: ticket 2, unlock			1	3	4
+ *	R: ticket 0, unlock			2	3	4
+ *	R: ticket 1, unlock			3	3	4
+ *	W: ticket 3, writers match	OK	3	3	4
+ *
+ * Note the writer blocks until 'writers' equals its ticket number and it does
+ * not matter if readers unlock in order or not.
+ *
+ * Readers or writers entering the system after the write lock is queued block,
+ * and the next ticket holder (reader or writer) will unblock when the writer
+ * unlocks. An example, continuing from the last line of the above example:
+ *
+ *						writers	readers	users
+ *	W: ticket 3, writers match	OK	3	3	4
+ *	R: ticket 4, readers no match	block	3	3	5
+ *	R: ticket 5, readers no match	block	3	3	6
+ *	W: ticket 6, writers no match	block	3	3	7
+ *	W: ticket 3, unlock			4	4	7
+ *	R: ticket 4, readers match	OK	4	5	7
+ *	R: ticket 5, readers match	OK	4	6	7
+ *
+ * The 'users' field is a 2-byte value so the available ticket number wraps at
+ * 64K requests. If a thread's lock request is not granted until the 'users'
+ * field cycles and the same ticket is taken by another thread, we could grant
+ * a lock to two separate threads at the same time, and bad things happen: two
+ * writer threads or a reader thread and a writer thread would run in parallel,
+ * and lock waiters could be skipped if the unlocks race. This is unlikely, it
+ * only happens if a lock request is blocked by 64K other requests. The fix is
+ * to grow the lock structure fields, but the largest atomic instruction we have
+ * is 8 bytes, the structure has no room to grow.
  */
 
 #include "wt_internal.h"
@@ -69,20 +141,31 @@ __wt_rwlock_alloc(
 int
 __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	wt_rwlock_t *l;
-	uint64_t old, new, pad, users, writers;
+	wt_rwlock_t *l, new, old;
 
 	WT_RET(__wt_verbose(
 	    session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
 	WT_STAT_FAST_CONN_INCR(session, rwlock_read);
 
 	l = &rwlock->rwlock;
-	pad = l->s.pad;
-	users = l->s.users;
-	writers = l->s.writers;
-	old = (pad << 48) + (users << 32) + (users << 16) + writers;
-	new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
-	return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+	new = old = *l;
+
+	/*
+	 * This read lock can only be granted if the lock was last granted to
+	 * a reader and there are no readers or writers blocked on the lock,
+	 * that is, if this thread's ticket would be the next ticket granted.
+	 * Do the cheap test to see if this can possibly succeed (and confirm
+	 * the lock is in the correct state to grant this read lock).
+	 */
+	if (old.s.readers != old.s.users)
+		return (EBUSY);
+
+	/*
+	 * The replacement lock value is a result of allocating a new ticket and
+	 * incrementing the reader value to match it.
+	 */
+	new.s.readers = new.s.users = old.s.users + 1;
+	return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY);
 }
 
 /*
@@ -93,8 +176,7 @@ int
 __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
 	wt_rwlock_t *l;
-	uint64_t me;
-	uint16_t val;
+	uint16_t ticket;
 	int pause_cnt;
 
 	WT_RET(__wt_verbose(
@@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 	WT_STAT_FAST_CONN_INCR(session, rwlock_read);
 
 	l = &rwlock->rwlock;
-	me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
-	val = (uint16_t)(me >> 32);
-	for (pause_cnt = 0; val != l->s.readers;) {
+
+	/*
+	 * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+	 * value will wrap and two lockers will simultaneously be granted the
+	 * lock.
+	 */
+	ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1);
+	for (pause_cnt = 0; ticket != l->s.readers;) {
 		/*
 		 * We failed to get the lock; pause before retrying and if we've
 		 * paused enough, sleep so we don't burn CPU to no purpose. This
 		 * situation happens if there are more threads than cores in the
-		 * system and we're thrashing on shared resources. Regardless,
-		 * don't sleep long, all we need is to schedule the other reader
-		 * threads to complete a few more instructions and increment the
-		 * reader count.
+		 * system and we're thrashing on shared resources.
+		 *
+		 * Don't sleep long when waiting on a read lock, hopefully we're
+		 * waiting on another read thread to increment the reader count.
 		 */
 		if (++pause_cnt < 1000)
 			WT_PAUSE();
@@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 			__wt_sleep(0, 10);
 	}
 
+	/*
+	 * We're the only writer of the readers field, so the update does not
+	 * need to be atomic.
+	 */
 	++l->s.readers;
 
 	return (0);
@@ -138,6 +229,11 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 	    session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
 
 	l = &rwlock->rwlock;
+
+	/*
+	 * Increment the writers value (other readers are doing the same, make
+	 * sure we don't race).
+	 */
 	WT_ATOMIC_ADD2(l->s.writers, 1);
 
 	return (0);
@@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 int
 __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	wt_rwlock_t *l;
-	uint64_t old, new, pad, readers, users;
+	wt_rwlock_t *l, new, old;
 
 	WT_RET(__wt_verbose(
 	    session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
 	WT_STAT_FAST_CONN_INCR(session, rwlock_write);
 
 	l = &rwlock->rwlock;
-	pad = l->s.pad;
-	readers = l->s.readers;
-	users = l->s.users;
-	old = (pad << 48) + (users << 32) + (readers << 16) + users;
-	new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
-	return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+	old = new = *l;
+
+	/*
+	 * This write lock can only be granted if the lock was last granted to
+	 * a writer and there are no readers or writers blocked on the lock,
+	 * that is, if this thread's ticket would be the next ticket granted.
+	 * Do the cheap test to see if this can possibly succeed (and confirm
+	 * the lock is in the correct state to grant this write lock).
+	 */
+	if (old.s.writers != old.s.users)
+		return (EBUSY);
+
+	/* The replacement lock value is a result of allocating a new ticket. */
+	++new.s.users;
+	return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY);
 }
 
 /*
@@ -174,23 +278,33 @@ int
 __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
 	wt_rwlock_t *l;
-	uint64_t me;
-	uint16_t val;
+	uint16_t ticket;
+	int pause_cnt;
 
 	WT_RET(__wt_verbose(
 	    session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
 	WT_STAT_FAST_CONN_INCR(session, rwlock_write);
 
+	l = &rwlock->rwlock;
+
 	/*
-	 * Possibly wrap: if we have more than 64K lockers waiting, the count
-	 * of writers will wrap and two lockers will simultaneously be granted
-	 * the write lock.
+	 * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+	 * value will wrap and two lockers will simultaneously be granted the
+	 * lock.
 	 */
-	l = &rwlock->rwlock;
-	me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
-	val = (uint16_t)(me >> 32);
-	while (val != l->s.writers)
-		WT_PAUSE();
+	ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1);
+	for (pause_cnt = 0; ticket != l->s.writers;) {
+		/*
+		 * We failed to get the lock; pause before retrying and if we've
+		 * paused enough, sleep so we don't burn CPU to no purpose. This
+		 * situation happens if there are more threads than cores in the
+		 * system and we're thrashing on shared resources.
+		 */
+		if (++pause_cnt < 1000)
+			WT_PAUSE();
+		else
+			__wt_sleep(0, 10);
+	}
 
 	return (0);
 }
@@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 
 	copy = *l;
 
+	/*
+	 * We're the only writer of the writers/readers fields, so the update
+	 * does not need to be atomic; we have to update both values at the
+	 * same time though, otherwise we'd potentially race with the thread
+	 * next granted the lock.
+	 *
+	 * Use a memory barrier to ensure the compiler doesn't mess with these
+	 * instructions and rework the code in a way that avoids the update as
+	 * a unit.
+	 */
 	WT_BARRIER();
 
 	++copy.s.writers;
 	++copy.s.readers;
 
-	l->i.us = copy.i.us;
+	l->i.wr = copy.i.wr;
+
 	return (0);
 }
 
diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c
index e4f24cdb44e..c7222aac6c4 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_thread.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c
@@ -19,7 +19,8 @@ __wt_thread_create(WT_SESSION_IMPL *session,
 	WT_DECL_RET;
 
 	/* Spawn a new thread of control. */
-	if ((ret = pthread_create(tidret, NULL, func, arg)) == 0)
+	WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret);
+	if (ret == 0)
 		return (0);
 	WT_RET_MSG(session, ret, "pthread_create");
 }
@@ -33,7 +34,8 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
 {
 	WT_DECL_RET;
 
-	if ((ret = pthread_join(tid, NULL)) == 0)
+	WT_SYSCALL_RETRY(pthread_join(tid, NULL), ret);
+	if (ret == 0)
 		return (0);
 
 	WT_RET_MSG(session, ret, "pthread_join");
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 53a73b44feb..37acb28a00b 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -343,11 +343,12 @@ __wt_reconcile(WT_SESSION_IMPL *session,
 	WT_PAGE *page;
 	WT_PAGE_MODIFY *mod;
 	WT_RECONCILE *r;
-	int locked;
+	int page_lock, scan_lock, split_lock;
 
 	conn = S2C(session);
 	page = ref->page;
 	mod = page->modify;
+	page_lock = scan_lock = split_lock = 0;
 
 	/* We're shouldn't get called with a clean page, that's an error. */
 	if (!__wt_page_is_modified(page))
@@ -386,22 +387,38 @@ __wt_reconcile(WT_SESSION_IMPL *session,
 
 	/*
 	 * The compaction process looks at the page's modification information;
-	 * if compaction is running, lock the page down.
-	 *
-	 * Otherwise, flip on the scanning flag: obsolete updates cannot be
-	 * freed while reconciliation is in progress.
+	 * if compaction is running, acquire the page's lock.
 	 */
-	locked = 0;
 	if (conn->compact_in_memory_pass) {
-		locked = 1;
 		WT_PAGE_LOCK(session, page);
-	} else
+		page_lock = 1;
+	}
+
+	/*
+	 * Reconciliation reads the lists of updates, so obsolete updates cannot
+	 * be discarded while reconciliation is in progress.
+	 */
+	for (;;) {
+		F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+		if (ret == 0)
+			break;
+		__wt_yield();
+	}
+	scan_lock = 1;
+
+	/*
+	 * Mark internal pages as splitting to ensure we don't deadlock when
+	 * performing an in-memory split during a checkpoint.
+	 */
+	if (WT_PAGE_IS_INTERNAL(page)) {
 		for (;;) {
-			F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+			F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
 			if (ret == 0)
 				break;
 			__wt_yield();
 		}
+		split_lock = 1;
+	}
 
 	/* Reconcile the page. */
 	switch (page->type) {
@@ -434,11 +451,13 @@ __wt_reconcile(WT_SESSION_IMPL *session,
 	else
 		WT_TRET(__rec_write_wrapup_err(session, r, page));
 
-	/* Release the page lock if we're holding one. */
-	if (locked)
-		WT_PAGE_UNLOCK(session, page);
-	else
+	/* Release the locks we're holding. */
+	if (split_lock)
+		F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
+	if (scan_lock)
 		F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+	if (page_lock)
+		WT_PAGE_UNLOCK(session, page);
 
 	/*
 	 * Clean up the boundary structures: some workloads result in millions
@@ -3266,18 +3285,6 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 	WT_RET(__rec_split_init(
 	    session, r, page, page->pg_intl_recno, btree->maxintlpage));
 
-	/*
-	 * We need to mark this page as splitting, as this may be an in-memory
-	 * split during a checkpoint.
-	 */
-	for (;;) {
-		F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
-		if (ret == 0) {
-			break;
-		}
-		__wt_yield();
-	}
-
 	/* For each entry in the in-memory page... */
 	WT_INTL_FOREACH_BEGIN(session, page, ref) {
 		/* Update the starting record number in case we split. */
@@ -3360,8 +3367,6 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 		__rec_copy_incr(session, r, val);
 	} WT_INTL_FOREACH_END;
 
-	F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
-
 	/* Write the remnant page. */
 	return (__rec_split_finish(session, r));
 
@@ -4094,18 +4099,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 	 */
 	r->cell_zero = 1;
 
-	/*
-	 * We need to mark this page as splitting in order to ensure we don't
-	 * deadlock when performing an in-memory split during a checkpoint.
-	 */
-	for (;;) {
-		F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
-		if (ret == 0) {
-			break;
-		}
-		__wt_yield();
-	}
-
 	/* For each entry in the in-memory page... */
 	WT_INTL_FOREACH_BEGIN(session, page, ref) {
 		/*
@@ -4264,8 +4257,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 		__rec_key_state_update(r, ovfl_key);
 	} WT_INTL_FOREACH_END;
 
-	F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
-
 	/* Write the remnant page. */
 	return (__rec_split_finish(session, r));
 
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index ef9735a8b98..1103dba7409 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -800,7 +800,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config)
 	WT_STAT_FAST_CONN_INCR(session, txn_commit);
 
 	txn = &session->txn;
-	if (F_ISSET(txn, WT_TXN_ERROR)) {
+	if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) {
 		__wt_errx(session, "failed transaction requires rollback");
 		ret = EINVAL;
 	}
@@ -1166,8 +1166,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
 	if (i == conn->session_size)
 		WT_ERR_MSG(session, ENOMEM,
 		    "only configured to support %" PRIu32 " sessions"
-		    " (including %" PRIu32 " internal)",
-		    conn->session_size, WT_NUM_INTERNAL_SESSIONS);
+		    " (including %d additional internal sessions)",
+		    conn->session_size, WT_EXTRA_INTERNAL_SESSIONS);
 
 	/*
 	 * If the active session count is increasing, update it.  We don't worry
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
index 7dfb98c5ca4..caac04d3529 100644
--- a/src/third_party/wiredtiger/src/support/rand.c
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -41,18 +41,18 @@
  * of the values to avoid that, and read/write in atomic, 8B chunks.
  */
 #undef	M_W
-#define	M_W(p)	((uint32_t *)&(p))[0]
+#define	M_W(r)	r.x.w
 #undef	M_Z
-#define	M_Z(p)	((uint32_t *)&(p))[1]
+#define	M_Z(r)	r.x.z
 
 /*
  * __wt_random_init --
  *	Initialize return of a 32-bit pseudo-random number.
  */
 void
-__wt_random_init(uint64_t volatile * rnd_state)
+__wt_random_init(WT_RAND_STATE volatile * rnd_state)
 {
-	uint64_t rnd;
+	WT_RAND_STATE rnd;
 
 	M_W(rnd) = 521288629;
 	M_Z(rnd) = 362436069;
@@ -64,9 +64,9 @@ __wt_random_init(uint64_t volatile * rnd_state)
  *	Return a 32-bit pseudo-random number.
  */
 uint32_t
-__wt_random(uint64_t volatile * rnd_state)
+__wt_random(WT_RAND_STATE volatile * rnd_state)
 {
-	uint64_t rnd;
+	WT_RAND_STATE rnd;
 	uint32_t w, z;
 
 	/*
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index b0e7d660587..b706263d1ce 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -444,11 +444,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
 	stats->cursor_search.desc = "cursor: cursor search calls";
 	stats->cursor_search_near.desc = "cursor: cursor search near calls";
 	stats->cursor_update.desc = "cursor: cursor update calls";
-	stats->dh_conn_ref.desc =
-	    "data-handle: connection candidate referenced";
-	stats->dh_conn_handles.desc = "data-handle: connection dhandles swept";
-	stats->dh_conn_sweeps.desc = "data-handle: connection sweeps";
-	stats->dh_conn_tod.desc = "data-handle: connection time-of-death sets";
+	stats->dh_sweep_ref.desc =
+	    "data-handle: connection sweep candidate became referenced";
+	stats->dh_sweep_close.desc =
+	    "data-handle: connection sweep dhandles closed";
+	stats->dh_sweep_remove.desc =
+	    "data-handle: connection sweep dhandles removed from hash list";
+	stats->dh_sweep_tod.desc =
+	    "data-handle: connection sweep time-of-death sets";
+	stats->dh_sweeps.desc = "data-handle: connection sweeps";
 	stats->dh_session_handles.desc = "data-handle: session dhandles swept";
 	stats->dh_session_sweeps.desc = "data-handle: session sweep attempts";
 	stats->log_slot_closes.desc = "log: consolidated slot closures";
@@ -618,10 +622,11 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
 	stats->cursor_search.v = 0;
 	stats->cursor_search_near.v = 0;
 	stats->cursor_update.v = 0;
-	stats->dh_conn_ref.v = 0;
-	stats->dh_conn_handles.v = 0;
-	stats->dh_conn_sweeps.v = 0;
-	stats->dh_conn_tod.v = 0;
+	stats->dh_sweep_ref.v = 0;
+	stats->dh_sweep_close.v = 0;
+	stats->dh_sweep_remove.v = 0;
+	stats->dh_sweep_tod.v = 0;
+	stats->dh_sweeps.v = 0;
 	stats->dh_session_handles.v = 0;
 	stats->dh_session_sweeps.v = 0;
 	stats->log_slot_closes.v = 0;
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index c9924056e91..210c5dde5d0 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -125,20 +125,6 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
 	txn_global = &conn->txn_global;
 	txn_state = WT_SESSION_TXN_STATE(session);
 
-	current_id = snap_min = txn_global->current;
-	prev_oldest_id = txn_global->oldest_id;
-
-	/* For pure read-only workloads, avoid scanning. */
-	if (prev_oldest_id == current_id) {
-		txn_state->snap_min = current_id;
-		__txn_sort_snapshot(session, 0, current_id);
-
-		/* Check that the oldest ID has not moved in the meantime. */
-		if (prev_oldest_id == txn_global->oldest_id &&
-		    txn_global->scan_count == 0)
-			return;
-	}
-
 	/*
 	 * We're going to scan.  Increment the count of scanners to prevent the
 	 * oldest ID from moving forwards.  Spin if the count is negative,
@@ -150,9 +136,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
 	} while (count < 0 ||
 	    !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
 
-	/* The oldest ID cannot change until the scan count goes to zero. */
-	prev_oldest_id = txn_global->oldest_id;
 	current_id = snap_min = txn_global->current;
+	prev_oldest_id = txn_global->oldest_id;
+
+	/* For pure read-only workloads, avoid scanning. */
+	if (prev_oldest_id == current_id) {
+		txn_state->snap_min = current_id;
+		__txn_sort_snapshot(session, 0, current_id);
+
+		/* Check that the oldest ID has not moved in the meantime. */
+		if (prev_oldest_id == txn_global->oldest_id) {
+			WT_ASSERT(session, txn_global->scan_count > 0);
+			(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+			return;
+		}
+	}
 
 	/* Walk the array of concurrent transactions. */
 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -184,10 +182,6 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
 	WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
 	txn_state->snap_min = snap_min;
 
-	/* Update the last running ID if we have a much newer value. */
-	if (snap_min > txn_global->last_running + 100)
-		txn_global->last_running = snap_min;
-
 	WT_ASSERT(session, txn_global->scan_count > 0);
 	(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
 
@@ -214,7 +208,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
 	WT_SESSION_IMPL *oldest_session;
 	WT_TXN_GLOBAL *txn_global;
 	WT_TXN_STATE *s;
-	uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min;
+	uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
 	uint32_t i, session_cnt;
 	int32_t count;
 	int last_running_moved;
@@ -222,7 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
 	conn = S2C(session);
 	txn_global = &conn->txn_global;
 
-	current_id = snap_min = txn_global->current;
+	current_id = last_running = txn_global->current;
 	oldest_session = NULL;
 	prev_oldest_id = txn_global->oldest_id;
 
@@ -247,7 +241,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
 
 	/* The oldest ID cannot change until the scan count goes to zero. */
 	prev_oldest_id = txn_global->oldest_id;
-	current_id = oldest_id = snap_min = txn_global->current;
+	current_id = oldest_id = last_running = txn_global->current;
 
 	/* Walk the array of concurrent transactions. */
 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -262,8 +256,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
 		 */
 		if ((id = s->id) != WT_TXN_NONE &&
 		    WT_TXNID_LE(prev_oldest_id, id) &&
-		    WT_TXNID_LT(id, snap_min))
-			snap_min = id;
+		    WT_TXNID_LT(id, last_running))
+			last_running = id;
 
 		/*
 		 * !!!
@@ -280,8 +274,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
 		}
 	}
 
-	if (WT_TXNID_LT(snap_min, oldest_id))
-		oldest_id = snap_min;
+	if (WT_TXNID_LT(last_running, oldest_id))
+		oldest_id = last_running;
 
 	/* The oldest ID can't move past any named snapshots. */
 	if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
@@ -289,26 +283,42 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
 		oldest_id = id;
 
 	/* Update the last running ID. */
-	if (WT_TXNID_LT(txn_global->last_running, snap_min)) {
-		txn_global->last_running = snap_min;
-		last_running_moved = 1;
-	} else
-		last_running_moved = 0;
+	last_running_moved =
+	    WT_TXNID_LT(txn_global->last_running, last_running);
 
 	/* Update the oldest ID. */
-	if (WT_TXNID_LT(prev_oldest_id, oldest_id) &&
+	if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
 	    WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
 		WT_ORDERED_READ(session_cnt, conn->session_cnt);
 		for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
 			if ((id = s->id) != WT_TXN_NONE &&
-			    WT_TXNID_LT(id, oldest_id))
-				oldest_id = id;
+			    WT_TXNID_LT(id, last_running))
+				last_running = id;
 			if ((id = s->snap_min) != WT_TXN_NONE &&
 			    WT_TXNID_LT(id, oldest_id))
 				oldest_id = id;
 		}
+
+		if (WT_TXNID_LT(last_running, oldest_id))
+			oldest_id = last_running;
+
+#ifdef HAVE_DIAGNOSTIC
+		/*
+		 * Make sure the ID doesn't move past any named snapshots.
+		 *
+		 * Don't include the read/assignment in the assert statement.
+		 * Coverity complains if there are assignments only done in
+		 * diagnostic builds, and when the read is from a volatile.
+		 */
+		id = txn_global->nsnap_oldest_id;
+		WT_ASSERT(session,
+		    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+#endif
+		if (WT_TXNID_LT(txn_global->last_running, last_running))
+			txn_global->last_running = last_running;
 		if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
 			txn_global->oldest_id = oldest_id;
+		WT_ASSERT(session, txn_global->scan_count == -1);
 		txn_global->scan_count = 0;
 	} else {
 		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
@@ -408,6 +418,9 @@ __wt_txn_release(WT_SESSION_IMPL *session)
 		txn_global->checkpoint_id = 0;
 		txn_global->checkpoint_pinned = WT_TXN_NONE;
 	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
+		WT_ASSERT(session,
+		    !WT_TXNID_LT(txn->id, txn_global->last_running));
+
 		WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
 		    txn->id != WT_TXN_NONE);
 		WT_PUBLISH(txn_state->id, WT_TXN_NONE);
@@ -458,7 +471,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
 
 	txn = &session->txn;
 	conn = S2C(session);
-	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR));
+	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
 
 	if (!F_ISSET(txn, WT_TXN_RUNNING))
 		WT_RET_MSG(session, EINVAL, "No transaction is active");
@@ -582,6 +595,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
 		switch (op->type) {
 		case WT_TXN_OP_BASIC:
 		case WT_TXN_OP_INMEM:
+		       WT_ASSERT(session, op->u.upd->txnid == txn->id);
 			op->u.upd->txnid = WT_TXN_ABORTED;
 			break;
 		case WT_TXN_OP_REF:
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 0eadcbf3b01..f321da303d7 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -522,7 +522,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
 	 */
 	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
 
-done:
+done:	FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
 err:	WT_TRET(__recovery_free(&r));
 	__wt_free(session, config);
 	WT_TRET(session->iface.close(&session->iface, NULL));
author	Michael Cahill <michael.cahill@mongodb.com>	2015-08-12 20:56:25 +1000
committer	Michael Cahill <michael.cahill@mongodb.com>	2015-08-12 20:56:25 +1000
commit	7bb09c0377f5160857617c38ab07955f8f4b03f6 (patch)
tree	2d7041f6e1cc121c743c368406485e39280b551c /src/third_party/wiredtiger
parent	6d9cbeb53eb9e6d39a24a8ac54b71e105483730b (diff)
download	mongo-7bb09c0377f5160857617c38ab07955f8f4b03f6.tar.gz