author     Don Anderson <dda@ddanderson.com>    2015-09-13 21:57:42 -0400
committer  Don Anderson <dda@ddanderson.com>    2015-09-13 21:57:42 -0400
commit     0225351bb6d937309f0bccb800c46e72e1aa4b82 (patch)
tree       95998c9b83fe07fef1642cc281f814e4f25831f0
parent     4c663725867d2f9434298d30883c58a0d96deaa9 (diff)
parent     e1d6886824058b333495236b776b10fcd8fb74ae (diff)
download   mongo-0225351bb6d937309f0bccb800c46e72e1aa4b82.tar.gz
Merge branch 'develop' into index-create-lsm3
-rw-r--r--  SConstruct | 1
-rw-r--r--  bench/wtperf/config.c | 8
-rw-r--r--  bench/wtperf/runners/mongodb-large-oplog.wtperf | 13
-rw-r--r--  bench/wtperf/runners/mongodb-small-oplog.wtperf | 13
-rwxr-xr-x  bench/wtperf/runners/wtperf_run.sh | 14
-rw-r--r--  bench/wtperf/wtperf.c | 36
-rw-r--r--  bench/wtperf/wtperf.h | 10
-rw-r--r--  bench/wtperf/wtperf_truncate.c | 45
-rw-r--r--  build_win/filelist.win | 1
-rw-r--r--  dist/api_data.py | 22
-rw-r--r--  dist/filelist | 1
-rw-r--r--  dist/flags.py | 15
-rwxr-xr-x  dist/s_all | 2
-rwxr-xr-x  dist/s_define | 15
-rw-r--r--  dist/s_define.list | 100
-rwxr-xr-x  dist/s_stat | 2
-rw-r--r--  dist/s_string.ok | 18
-rwxr-xr-x  dist/s_style | 12
-rwxr-xr-x  dist/s_typedef | 2
-rwxr-xr-x  dist/s_whitespace | 23
-rw-r--r--  dist/stat.py | 128
-rw-r--r--  dist/stat_data.py | 102
-rw-r--r--  examples/c/ex_all.c | 3
-rw-r--r--  examples/c/ex_log.c | 31
-rwxr-xr-x  examples/python/ex_access.py | 2
-rwxr-xr-x  examples/python/ex_stat.py | 30
-rw-r--r--  ext/encryptors/rotn/rotn_encrypt.c | 97
-rw-r--r--  ext/extractors/csv/csv_extractor.c | 2
-rw-r--r--  src/async/async_api.c | 29
-rw-r--r--  src/async/async_op.c | 6
-rw-r--r--  src/async/async_worker.c | 16
-rw-r--r--  src/block/block_ext.c | 16
-rw-r--r--  src/block/block_open.c | 32
-rw-r--r--  src/block/block_read.c | 6
-rw-r--r--  src/block/block_slvg.c | 12
-rw-r--r--  src/btree/bt_compact.c | 35
-rw-r--r--  src/btree/bt_cursor.c | 156
-rw-r--r--  src/btree/bt_debug.c | 8
-rw-r--r--  src/btree/bt_delete.c | 28
-rw-r--r--  src/btree/bt_discard.c | 28
-rw-r--r--  src/btree/bt_handle.c | 86
-rw-r--r--  src/btree/bt_io.c | 71
-rw-r--r--  src/btree/bt_ovfl.c | 12
-rw-r--r--  src/btree/bt_page.c | 212
-rw-r--r--  src/btree/bt_read.c | 563
-rw-r--r--  src/btree/bt_slvg.c | 16
-rw-r--r--  src/btree/bt_split.c | 151
-rw-r--r--  src/btree/bt_stat.c | 91
-rw-r--r--  src/btree/bt_sync.c | 1
-rw-r--r--  src/btree/bt_vrfy.c | 8
-rw-r--r--  src/btree/bt_vrfy_dsk.c | 141
-rw-r--r--  src/btree/col_modify.c | 18
-rw-r--r--  src/btree/row_key.c | 5
-rw-r--r--  src/btree/row_modify.c | 9
-rw-r--r--  src/btree/row_srch.c | 15
-rw-r--r--  src/cache/cache_las.c | 391
-rw-r--r--  src/config/config_def.c | 39
-rw-r--r--  src/conn/conn_api.c | 37
-rw-r--r--  src/conn/conn_cache.c | 33
-rw-r--r--  src/conn/conn_cache_pool.c | 219
-rw-r--r--  src/conn/conn_dhandle.c | 19
-rw-r--r--  src/conn/conn_handle.c | 24
-rw-r--r--  src/conn/conn_log.c | 322
-rw-r--r--  src/conn/conn_open.c | 17
-rw-r--r--  src/conn/conn_stat.c | 37
-rw-r--r--  src/conn/conn_sweep.c | 235
-rw-r--r--  src/cursor/cur_backup.c | 8
-rw-r--r--  src/cursor/cur_ds.c | 2
-rw-r--r--  src/cursor/cur_file.c | 15
-rw-r--r--  src/cursor/cur_index.c | 11
-rw-r--r--  src/cursor/cur_log.c | 8
-rw-r--r--  src/cursor/cur_stat.c | 36
-rw-r--r--  src/cursor/cur_std.c | 13
-rw-r--r--  src/docs/cursor-random.dox | 16
-rw-r--r--  src/docs/upgrading.dox | 9
-rw-r--r--  src/evict/evict_file.c | 25
-rw-r--r--  src/evict/evict_lru.c | 456
-rw-r--r--  src/evict/evict_page.c | 81
-rw-r--r--  src/include/async.h | 42
-rw-r--r--  src/include/bitstring.i | 4
-rw-r--r--  src/include/block.h | 4
-rw-r--r--  src/include/btmem.h | 66
-rw-r--r--  src/include/btree.h | 14
-rw-r--r--  src/include/btree.i | 370
-rw-r--r--  src/include/cache.h | 18
-rw-r--r--  src/include/cache.i | 100
-rw-r--r--  src/include/cell.i | 28
-rw-r--r--  src/include/connection.h | 108
-rw-r--r--  src/include/cursor.h | 15
-rw-r--r--  src/include/cursor.i | 12
-rw-r--r--  src/include/dhandle.h | 13
-rw-r--r--  src/include/error.h | 3
-rw-r--r--  src/include/extern.h | 79
-rw-r--r--  src/include/flags.h | 31
-rw-r--r--  src/include/gcc.h | 103
-rw-r--r--  src/include/hardware.h | 26
-rw-r--r--  src/include/lint.h | 99
-rw-r--r--  src/include/log.h | 177
-rw-r--r--  src/include/log.i | 40
-rw-r--r--  src/include/lsm.h | 31
-rw-r--r--  src/include/meta.h | 4
-rw-r--r--  src/include/misc.h | 1
-rw-r--r--  src/include/misc.i | 12
-rw-r--r--  src/include/msvc.h | 92
-rw-r--r--  src/include/mutex.h | 29
-rw-r--r--  src/include/mutex.i | 14
-rw-r--r--  src/include/os.h | 6
-rw-r--r--  src/include/queue.h | 345
-rw-r--r--  src/include/schema.h | 4
-rw-r--r--  src/include/serial.i | 167
-rw-r--r--  src/include/session.h | 17
-rw-r--r--  src/include/stat.h | 704
-rw-r--r--  src/include/txn.h | 7
-rw-r--r--  src/include/txn.i | 52
-rw-r--r--  src/include/wiredtiger.in | 385
-rw-r--r--  src/include/wt_internal.h | 25
-rw-r--r--  src/log/log.c | 598
-rw-r--r--  src/log/log_slot.c | 579
-rw-r--r--  src/lsm/lsm_cursor.c | 26
-rw-r--r--  src/lsm/lsm_manager.c | 22
-rw-r--r--  src/lsm/lsm_merge.c | 12
-rw-r--r--  src/lsm/lsm_stat.c | 63
-rw-r--r--  src/lsm/lsm_tree.c | 41
-rw-r--r--  src/lsm/lsm_work_unit.c | 30
-rw-r--r--  src/lsm/lsm_worker.c | 2
-rw-r--r--  src/meta/meta_apply.c | 2
-rw-r--r--  src/meta/meta_table.c | 33
-rw-r--r--  src/os_posix/os_alloc.c | 12
-rw-r--r--  src/os_posix/os_mtx_cond.c | 19
-rw-r--r--  src/os_posix/os_mtx_rw.c | 199
-rw-r--r--  src/os_posix/os_open.c | 10
-rw-r--r--  src/os_posix/os_path.c | 4
-rw-r--r--  src/os_posix/os_remove.c | 2
-rw-r--r--  src/os_posix/os_thread.c | 6
-rw-r--r--  src/os_win/os_errno.c | 4
-rw-r--r--  src/os_win/os_mtx_cond.c | 35
-rw-r--r--  src/os_win/os_open.c | 8
-rw-r--r--  src/os_win/os_path.c | 4
-rw-r--r--  src/os_win/os_remove.c | 2
-rw-r--r--  src/reconcile/rec_write.c | 1187
-rw-r--r--  src/schema/schema_list.c | 12
-rw-r--r--  src/schema/schema_stat.c | 4
-rw-r--r--  src/session/session_api.c | 59
-rw-r--r--  src/session/session_dhandle.c | 137
-rw-r--r--  src/support/pow.c | 2
-rw-r--r--  src/support/rand.c | 7
-rw-r--r--  src/support/stat.c | 1630
-rw-r--r--  src/txn/txn.c | 90
-rw-r--r--  src/txn/txn_ckpt.c | 33
-rw-r--r--  src/txn/txn_log.c | 31
-rw-r--r--  src/txn/txn_nsnap.c | 36
-rw-r--r--  src/txn/txn_recover.c | 8
-rw-r--r--  src/utilities/util_list.c | 13
-rw-r--r--  test/checkpoint/checkpointer.c | 52
-rw-r--r--  test/checkpoint/workers.c | 6
-rw-r--r--  test/format/backup.c | 3
-rw-r--r--  test/format/bulk.c | 1
-rw-r--r--  test/format/config.c | 37
-rw-r--r--  test/format/format.h | 3
-rw-r--r--  test/format/lrt.c | 108
-rw-r--r--  test/format/ops.c | 35
-rwxr-xr-x  test/format/smoke.sh | 2
-rw-r--r--  test/format/t.c | 25
-rw-r--r--  test/format/util.c | 64
-rw-r--r--  test/format/wts.c | 7
-rw-r--r--  test/suite/run.py | 2
-rw-r--r--  test/suite/test_async01.py | 2
-rw-r--r--  test/suite/test_async02.py | 2
-rw-r--r--  test/suite/test_autoclose.py | 4
-rw-r--r--  test/suite/test_backup04.py | 2
-rw-r--r--  test/suite/test_backup05.py | 2
-rw-r--r--  test/suite/test_base05.py | 4
-rw-r--r--  test/suite/test_baseconfig.py | 54
-rw-r--r--  test/suite/test_bug005.py | 2
-rw-r--r--  test/suite/test_bug008.py | 4
-rw-r--r--  test/suite/test_bug011.py | 2
-rw-r--r--  test/suite/test_checkpoint01.py | 2
-rw-r--r--  test/suite/test_cursor01.py | 4
-rw-r--r--  test/suite/test_cursor04.py | 4
-rw-r--r--  test/suite/test_cursor06.py | 4
-rw-r--r--  test/suite/test_cursor_random.py | 14
-rw-r--r--  test/suite/test_cursor_tracker.py | 2
-rw-r--r--  test/suite/test_durability01.py | 2
-rw-r--r--  test/suite/test_encrypt03.py | 2
-rw-r--r--  test/suite/test_encrypt04.py | 48
-rw-r--r--  test/suite/test_encrypt05.py | 2
-rw-r--r--  test/suite/test_encrypt06.py | 4
-rw-r--r--  test/suite/test_jsondump02.py | 24
-rw-r--r--  test/suite/test_metadata_cursor01.py | 4
-rw-r--r--  test/suite/test_pack.py | 2
-rw-r--r--  test/suite/test_priv01.py | 2
-rw-r--r--  test/suite/test_schema02.py | 2
-rw-r--r--  test/suite/test_schema04.py | 2
-rw-r--r--  test/suite/test_schema05.py | 2
-rw-r--r--  test/suite/test_sweep01.py | 49
-rw-r--r--  test/suite/test_sweep03.py | 20
-rw-r--r--  test/suite/test_txn02.py | 2
-rw-r--r--  test/suite/test_txn03.py | 2
-rw-r--r--  test/suite/test_txn04.py | 6
-rw-r--r--  test/suite/test_txn05.py | 2
-rw-r--r--  test/suite/test_txn07.py | 2
-rw-r--r--  test/suite/test_txn09.py | 2
-rw-r--r--  test/suite/test_txn10.py | 10
-rw-r--r--  test/suite/test_txn12.py | 70
-rw-r--r--  test/suite/test_util01.py | 6
-rw-r--r--  test/suite/wtscenario.py | 2
-rw-r--r--  test/suite/wttest.py | 10
-rw-r--r--  test/suite/wtthread.py | 6
-rw-r--r--  tools/wtstats/stat_data.py | 3
209 files changed, 8245 insertions, 5277 deletions
diff --git a/SConstruct b/SConstruct
index 49e4417133f..70ed6e0220b 100644
--- a/SConstruct
+++ b/SConstruct
@@ -454,6 +454,7 @@ t = env.Program("wtperf", [
"bench/wtperf/misc.c",
"bench/wtperf/track.c",
"bench/wtperf/wtperf.c",
+ "bench/wtperf/wtperf_truncate.c",
],
LIBS=[wtlib, shim] + wtlibs)
Default(t)
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c
index 4445de3296d..6b0ce47ef3f 100644
--- a/bench/wtperf/config.c
+++ b/bench/wtperf/config.c
@@ -96,7 +96,7 @@ config_assign(CONFIG *dest, const CONFIG *src)
}
}
- STAILQ_INIT(&dest->stone_head);
+ TAILQ_INIT(&dest->stone_head);
return (0);
}
@@ -257,13 +257,15 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
continue;
}
if (STRING_MATCH("truncate_pct", k.str, k.len)) {
- if ((workp->truncate_pct = v.val) <= 0)
+ if (v.val <= 0)
goto err;
+ workp->truncate_pct = (uint64_t)v.val;
continue;
}
if (STRING_MATCH("truncate_count", k.str, k.len)) {
- if ((workp->truncate_count = v.val) <= 0)
+ if (v.val <= 0)
goto err;
+ workp->truncate_count = (uint64_t)v.val;
continue;
}
	goto err;
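
Both hunks above follow the same validate-then-assign pattern: the parser's signed value is range-checked before it is stored into the unsigned workload field, so a negative truncate_pct or truncate_count can no longer wrap around to a huge uint64_t. A minimal sketch of the pattern (the function name is illustrative, not part of wtperf):

    #include <stdint.h>

    /* Reject non-positive input before storing it into an unsigned field. */
    static int
    set_positive_u64(int64_t val, uint64_t *out)
    {
        if (val <= 0)
            return (1);        /* caller takes its error path */
        *out = (uint64_t)val;  /* safe: val is known positive here */
        return (0);
    }
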
diff --git a/bench/wtperf/runners/mongodb-large-oplog.wtperf b/bench/wtperf/runners/mongodb-large-oplog.wtperf
new file mode 100644
index 00000000000..1e203a34cc3
--- /dev/null
+++ b/bench/wtperf/runners/mongodb-large-oplog.wtperf
@@ -0,0 +1,13 @@
+# wtperf options file to simulate populating a MongoDB oplog
+# This creates a test database of 7.8GB
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=300000
+report_interval=5
+run_time=3600
+populate_threads=1
+key_sz=8192
+# Set up three threads to insert into the oplog
+# Set up one thread to do truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=300000))
diff --git a/bench/wtperf/runners/mongodb-small-oplog.wtperf b/bench/wtperf/runners/mongodb-small-oplog.wtperf
new file mode 100644
index 00000000000..4f2ae5359cd
--- /dev/null
+++ b/bench/wtperf/runners/mongodb-small-oplog.wtperf
@@ -0,0 +1,13 @@
+# wtperf options file to simulate populating a MongoDB oplog
+# This creates an oplog of 6.1GB
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=750000
+report_interval=5
+run_time=3600
+populate_threads=1
+key_sz=512
+# Set up three threads to insert into the oplog
+# Set up one thread to do truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=750000))
diff --git a/bench/wtperf/runners/wtperf_run.sh b/bench/wtperf/runners/wtperf_run.sh
index d5de7c4abdb..ac31c2a2e78 100755
--- a/bench/wtperf/runners/wtperf_run.sh
+++ b/bench/wtperf/runners/wtperf_run.sh
@@ -24,18 +24,18 @@ outfile=./wtperf.out
rm -f $outfile
# Each of these has an entry for each op in ops below.
-avg=(0 0 0)
-max=(0 0 0)
-min=(0 0 0)
-sum=(0 0 0)
+avg=(0 0 0 0)
+max=(0 0 0 0)
+min=(0 0 0 0)
+sum=(0 0 0 0)
# Load needs floating point and bc, handle separately.
-loadindex=4
+loadindex=5
avg[$loadindex]=0
max[$loadindex]=0
min[$loadindex]=0
sum[$loadindex]=0
-ops=(read insert update)
-outp=("Read count:" "Insert count:" "Update count:")
+ops=(read insert update truncate)
+outp=("Read count:" "Insert count:" "Update count:" "Truncate count:")
outp[$loadindex]="Load time:"
# getval min/max val cur
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 148aa0e4e84..5d3b334785d 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -96,17 +96,11 @@ static uint64_t wtperf_value_range(CONFIG *);
#define HELIUM_CONFIG ",type=helium"
#define INDEX_COL_NAMES ",columns=(key,val)"
-inline uint64_t
-decode_key(char *key_buf)
-{
- return (strtoull(key_buf, NULL, 10));
-}
-
/* Retrieve an ID for the next insert operation. */
static inline uint64_t
get_next_incr(CONFIG *cfg)
{
- return (WT_ATOMIC_ADD8(cfg->insert_key, 1));
+ return (__wt_atomic_add64(&cfg->insert_key, 1));
}
static void
@@ -157,7 +151,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
switch (type) {
case WT_AOP_COMPACT:
tables = (uint32_t *)op->app_private;
- WT_ATOMIC_ADD4(*tables, (uint32_t)-1);
+ (void)__wt_atomic_add32(tables, (uint32_t)-1);
break;
case WT_AOP_INSERT:
trk = &thread->insert;
@@ -192,7 +186,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
return (0);
if (ret == 0 || (ret == WT_NOTFOUND && type != WT_AOP_INSERT)) {
if (!cfg->in_warmup)
- (void)WT_ATOMIC_ADD8(trk->ops, 1);
+ (void)__wt_atomic_add64(&trk->ops, 1);
return (0);
}
err:
@@ -513,10 +507,9 @@ worker(void *arg)
* is 0, to avoid first time latency spikes.
*/
measure_latency =
- cfg->sample_interval != 0 && trk->ops != 0 && (
- trk->ops % cfg->sample_rate == 0);
- if (measure_latency &&
- (ret = __wt_epoch(NULL, &start)) != 0) {
+ cfg->sample_interval != 0 && trk != NULL &&
+ trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+ if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
@@ -880,10 +873,9 @@ populate_thread(void *arg)
cursor = cursors[op % cfg->table_count];
generate_key(cfg, key_buf, op);
measure_latency =
- cfg->sample_interval != 0 && trk->ops != 0 && (
- trk->ops % cfg->sample_rate == 0);
- if (measure_latency &&
- (ret = __wt_epoch(NULL, &start)) != 0) {
+ cfg->sample_interval != 0 &&
+ trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+ if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
@@ -1001,10 +993,9 @@ populate_async(void *arg)
* the time to process by workers.
*/
measure_latency =
- cfg->sample_interval != 0 && trk->ops != 0 && (
- trk->ops % cfg->sample_rate == 0);
- if (measure_latency &&
- (ret = __wt_epoch(NULL, &start)) != 0) {
+ cfg->sample_interval != 0 &&
+ trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+ if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
@@ -1046,8 +1037,7 @@ populate_async(void *arg)
goto err;
if (measure_latency) {
if ((ret = __wt_epoch(NULL, &stop)) != 0) {
- lprintf(cfg, ret, 0,
- "Get time call failed");
+ lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
++trk->latency_ops;
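
The atomic changes in this file track a WiredTiger API rename: the WT_ATOMIC_ADD8/WT_ATOMIC_ADD4 macros (suffixed with the operand size in bytes) become the __wt_atomic_add64/__wt_atomic_add32 functions, which take a pointer to the target rather than the lvalue itself. A rough stand-in in portable C11, assuming the add-and-return-new-value semantics that get_next_incr() relies on:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Approximation of __wt_atomic_add64: add v, return the new value. */
    static inline uint64_t
    atomic_add64(_Atomic uint64_t *vp, uint64_t v)
    {
        /* atomic_fetch_add returns the old value, so add v back. */
        return (atomic_fetch_add(vp, v) + v);
    }
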
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index 991c09138e3..e4b9fc00798 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -95,7 +95,7 @@ typedef struct {
int64_t ops_per_txn;
int64_t truncate; /* Truncate ratio */
uint64_t truncate_pct; /* Truncate Percent */
- uint64_t truncate_count; /* Truncate Percent */
+ uint64_t truncate_count; /* Truncate Count */
#define WORKER_INSERT 1 /* Insert */
#define WORKER_INSERT_RMW 2 /* Insert with read-modify-write */
@@ -108,7 +108,6 @@ typedef struct {
/* Steering items for the truncate workload */
typedef struct __truncate_struct TRUNCATE_CONFIG;
struct __truncate_struct {
- double truncation_percentage;
uint64_t stone_gap;
uint64_t needed_stones;
uint64_t final_stone_gap;
@@ -122,8 +121,8 @@ struct __truncate_struct {
/* Queue entry for use with the Truncate Logic */
struct __truncate_queue_entry {
char *key; /* Truncation point */
- u_int diff; /* Number of items to be truncated*/
- STAILQ_ENTRY(__truncate_queue_entry) q;
+	uint64_t diff; /* Number of items to be truncated */
+ TAILQ_ENTRY(__truncate_queue_entry) q;
};
typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY;
@@ -179,7 +178,7 @@ struct __config { /* Configuration structure */
u_int has_truncate; /* if there is a truncate workload */
/* Queue head for use with the Truncate Logic */
- STAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
+ TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
/* Fields changeable on command line are listed in wtperf_opt.i */
#define OPT_DECLARE_STRUCT
@@ -273,7 +272,6 @@ int config_opt_line(CONFIG *, const char *);
int config_opt_str(CONFIG *, const char *, const char *);
void config_print(CONFIG *);
int config_sanity(CONFIG *);
-uint64_t decode_key(char *);
void latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
void latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
void latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
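
The queue changes above move the truncate stone list from a singly-linked STAILQ to a doubly-linked TAILQ; a later hunk in this merge adds an s_style check requiring TAILQ for all lists. A self-contained sketch of the <sys/queue.h> TAILQ pattern the header now uses:

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        int value;
        TAILQ_ENTRY(entry) q;    /* forward and backward links */
    };
    TAILQ_HEAD(entry_qh, entry);

    int
    main(void)
    {
        struct entry_qh head;
        struct entry *e;
        int i;

        TAILQ_INIT(&head);
        for (i = 0; i < 3; ++i) {
            if ((e = calloc(1, sizeof(*e))) == NULL)
                return (1);
            e->value = i;
            TAILQ_INSERT_TAIL(&head, e, q);
        }
        /* Unlike STAILQ, TAILQ_REMOVE is O(1) for any element. */
        while ((e = TAILQ_FIRST(&head)) != NULL) {
            TAILQ_REMOVE(&head, e, q);
            printf("%d\n", e->value);
            free(e);
        }
        return (0);
    }
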
diff --git a/bench/wtperf/wtperf_truncate.c b/bench/wtperf/wtperf_truncate.c
index 0cdbbb914a4..581d1987947 100644
--- a/bench/wtperf/wtperf_truncate.c
+++ b/bench/wtperf/wtperf_truncate.c
@@ -28,6 +28,12 @@
#include "wtperf.h"
+static inline uint64_t
+decode_key(char *key_buf)
+{
+ return (strtoull(key_buf, NULL, 10));
+}
+
int
setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
@@ -37,8 +43,7 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
WT_CURSOR *cursor;
char *key, *truncate_key;
int ret;
- size_t i;
- uint64_t end_point, final_stone_gap, start_point;
+ uint64_t end_point, final_stone_gap, i, start_point;
end_point = final_stone_gap = start_point = 0;
trunc_cfg = &thread->trunc_cfg;
@@ -49,11 +54,9 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
session, cfg->uris[0], NULL, NULL, &cursor)) != 0)
goto err;
- /* Truncation percentage value. eg 10% is 0.1. */
- trunc_cfg->truncation_percentage = (double)workload->truncate_pct / 100;
/* How many entries between each stone. */
trunc_cfg->stone_gap =
- workload->truncate_count * trunc_cfg->truncation_percentage;
+ (workload->truncate_count * workload->truncate_pct) / 100;
/* How many stones we need. */
trunc_cfg->needed_stones =
workload->truncate_count / trunc_cfg->stone_gap;
@@ -94,8 +97,13 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
trunc_cfg->expected_total = (end_point - start_point);
for (i = 1; i <= trunc_cfg->needed_stones; i++) {
truncate_key = calloc(cfg->key_sz, 1);
+ if (truncate_key == NULL) {
+ ret = enomem(cfg);
+ goto err;
+ }
truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
if (truncate_item == NULL) {
+ free(truncate_key);
ret = enomem(cfg);
goto err;
}
@@ -104,14 +112,16 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
truncate_item->key = truncate_key;
truncate_item->diff =
(trunc_cfg->stone_gap * i) - trunc_cfg->last_key;
- STAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
+ TAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
trunc_cfg->last_key = trunc_cfg->stone_gap * i;
trunc_cfg->num_stones++;
}
}
trunc_cfg->stone_gap = final_stone_gap;
-err: cursor->close(cursor);
+err: if ((ret = cursor->close(cursor)) != 0) {
+ lprintf(cfg, ret, 0, "truncate setup: cursor close failed");
+ }
return (ret);
}
@@ -141,16 +151,22 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
trunc_cfg->last_key += trunc_cfg->stone_gap;
truncate_key = calloc(cfg->key_sz, 1);
+ if (truncate_key == NULL) {
+ lprintf(cfg, ENOMEM, 0,
+ "truncate: couldn't allocate key array");
+ return (ENOMEM);
+ }
truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
if (truncate_item == NULL) {
+ free(truncate_key);
lprintf(cfg, ENOMEM, 0,
- "worker: couldn't allocate cursor array");
+ "truncate: couldn't allocate item");
return (ENOMEM);
}
generate_key(cfg, truncate_key, trunc_cfg->last_key);
truncate_item->key = truncate_key;
truncate_item->diff = trunc_cfg->stone_gap;
- STAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
+ TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
trunc_cfg->num_stones++;
}
@@ -159,9 +175,9 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
trunc_cfg->expected_total <= thread->workload->truncate_count)
return (0);
- truncate_item = STAILQ_FIRST(&cfg->stone_head);
+ truncate_item = TAILQ_FIRST(&cfg->stone_head);
trunc_cfg->num_stones--;
- STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+ TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
cursor->set_key(cursor,truncate_item->key);
if ((ret = cursor->search(cursor)) != 0) {
lprintf(cfg, ret, 0, "Truncate search: failed");
@@ -179,7 +195,6 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
err: free(truncate_item->key);
free(truncate_item);
- truncate_item = NULL;
t_ret = cursor->reset(cursor);
if (t_ret != 0)
lprintf(cfg, t_ret, 0, "Cursor reset failed");
@@ -192,9 +207,9 @@ void
cleanup_truncate_config(CONFIG *cfg) {
TRUNCATE_QUEUE_ENTRY *truncate_item;
- while (!STAILQ_EMPTY(&cfg->stone_head)) {
- truncate_item = STAILQ_FIRST(&cfg->stone_head);
- STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+ while (!TAILQ_EMPTY(&cfg->stone_head)) {
+ truncate_item = TAILQ_FIRST(&cfg->stone_head);
+ TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
free(truncate_item->key);
free(truncate_item);
}
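
The stone_gap computation in this file now multiplies before dividing in integer arithmetic rather than rounding through a floating-point percentage. Plugging in the values from the bundled mongodb-large-oplog.wtperf runner (truncate_count=300000, truncate_pct=10) as a quick check of the math:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t truncate_count = 300000, truncate_pct = 10;
        uint64_t stone_gap, needed_stones;

        /* Multiply before dividing so small percentages don't truncate to 0. */
        stone_gap = (truncate_count * truncate_pct) / 100;    /* 30000 */
        needed_stones = truncate_count / stone_gap;           /* 10 */
        printf("stone_gap=%" PRIu64 ", needed_stones=%" PRIu64 "\n",
            stone_gap, needed_stones);
        return (0);
    }
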
diff --git a/build_win/filelist.win b/build_win/filelist.win
index 099451e418d..9d0ee10d305 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -45,6 +45,7 @@ src/btree/col_srch.c
src/btree/row_key.c
src/btree/row_modify.c
src/btree/row_srch.c
+src/cache/cache_las.c
src/config/config.c
src/config/config_api.c
src/config/config_check.c
diff --git a/dist/api_data.py b/dist/api_data.py
index 43b585a6c6d..3a700cf886b 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -449,13 +449,17 @@ connection_runtime_config = [
Config('chunk', '10MB', r'''
the granularity that a shared cache is redistributed''',
min='1MB', max='10TB'),
+ Config('name', 'none', r'''
+ the name of a cache that is shared between databases or
+ \c "none" when no shared cache is configured'''),
+ Config('quota', '0', r'''
+ maximum size of cache this database can be allocated from the
+ shared cache. Defaults to the entire shared cache size''',
+ type='int'),
Config('reserve', '0', r'''
amount of cache this database is guaranteed to have
available from the shared cache. This setting is per
database. Defaults to the chunk size''', type='int'),
- Config('name', 'none', r'''
- the name of a cache that is shared between databases or
- \c "none" when no shared cache is configured'''),
Config('size', '500MB', r'''
maximum memory to allocate for the shared cache. Setting
this will update the value if one is already set''',
@@ -981,8 +985,10 @@ methods = {
connection_runtime_config +
common_wiredtiger_open + [
Config('config_base', 'true', r'''
- write the base configuration file if creating the database,
- see @ref config_base for more information''',
+ write the base configuration file if creating the database. If
+ \c false in the config passed directly to ::wiredtiger_open, will
+ ignore any existing base configuration file in addition to not creating
+ one. See @ref config_base for more information''',
type='boolean'),
Config('create', 'false', r'''
create the database if it does not exist''',
@@ -1011,8 +1017,10 @@ methods = {
connection_runtime_config +
common_wiredtiger_open + [
Config('config_base', 'true', r'''
- write the base configuration file if creating the database,
- see @ref config_base for more information''',
+ write the base configuration file if creating the database. If
+ \c false in the config passed directly to ::wiredtiger_open, will
+ ignore any existing base configuration file in addition to not creating
+ one. See @ref config_base for more information''',
type='boolean'),
Config('create', 'false', r'''
create the database if it does not exist''',
diff --git a/dist/filelist b/dist/filelist
index c3321cf845d..f33f0e9a962 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -45,6 +45,7 @@ src/btree/col_srch.c
src/btree/row_key.c
src/btree/row_modify.c
src/btree/row_srch.c
+src/cache/cache_las.c
src/config/config.c
src/config/config_api.c
src/config/config_check.c
diff --git a/dist/flags.py b/dist/flags.py
index c8d9bcc6a5e..d98f249335e 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -12,7 +12,6 @@ flags = {
'SYNC_CHECKPOINT',
'SYNC_CLOSE',
'SYNC_DISCARD',
- 'SYNC_DISCARD_FORCE',
'SYNC_WRITE_LEAVES',
],
'file_types' : [
@@ -46,9 +45,10 @@ flags = {
'READ_WONT_NEED',
],
'rec_write' : [
+ 'EVICT_LOOKASIDE',
'EVICTING',
- 'SKIP_UPDATE_ERR',
- 'SKIP_UPDATE_RESTORE',
+ 'EVICT_UPDATE_RESTORE',
+ 'VISIBILITY_ERR',
],
'txn_log_checkpoint' : [
'TXN_LOG_CKPT_CLEANUP',
@@ -107,19 +107,20 @@ flags = {
'session' : [
'SESSION_CAN_WAIT',
'SESSION_CLEAR_EVICT_WALK',
- 'SESSION_DISCARD_FORCE',
+ 'SESSION_INTERNAL',
'SESSION_LOCKED_CHECKPOINT',
'SESSION_LOCKED_HANDLE_LIST',
'SESSION_LOCKED_SCHEMA',
+ 'SESSION_LOCKED_SLOT',
'SESSION_LOCKED_TABLE',
- 'SESSION_INTERNAL',
'SESSION_LOGGING_INMEM',
+ 'SESSION_LOOKASIDE_CURSOR',
'SESSION_NO_CACHE',
- 'SESSION_NO_CACHE_CHECK',
'SESSION_NO_DATA_HANDLES',
+ 'SESSION_NO_EVICTION',
'SESSION_NO_LOGGING',
'SESSION_NO_SCHEMA_LOCK',
- 'SESSION_SALVAGE_CORRUPT_OK',
+ 'SESSION_QUIET_CORRUPT_FILE',
'SESSION_SERVER_ASYNC',
],
}
diff --git a/dist/s_all b/dist/s_all
index c624db06a97..8e3f265e79b 100755
--- a/dist/s_all
+++ b/dist/s_all
@@ -2,7 +2,7 @@
# Run standard scripts.
t=__wt.$$
-t_pfx=__s_all_tmp
+t_pfx=__s_all_tmp_
trap 'rm -f $t *.pyc __tmp __wt.* __s_all_tmp*' 0 1 2 3 13 15
# We require python which may not be installed.
diff --git a/dist/s_define b/dist/s_define
index 7809bf14918..77673bdcdf9 100755
--- a/dist/s_define
+++ b/dist/s_define
@@ -4,18 +4,23 @@
t=__wt.$$
trap 'rm -f $t; exit 0' 0 1 2 3 13 15
-# List of files to search.
+# List of source files to search.
l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
l="$l `echo ../src/include/*.i ../src/utilities/*.c ../test/*/*.c`"
+# List of include files for source #defines.
+# Ignore the queue.h file, we don't use most of it.
+dl="../src/include/*.[hi] ../src/include/*.in"
+dl=`echo $dl | sed 's/ [^ ]*queue.h//'`
+
(
# Copy out the list of #defines we don't use, but it's OK.
sed -e '/^$/d' -e '/^#/d' < s_define.list
-# Get the list of #defines.
-# Ignore the list of configuration objects
-# Ignore the list of statistic "keys" generated for applications.
-search=`cat ../src/include/*.[hi] ../src/include/*.in |
+# Search the list of include files for #defines
+# Ignore configuration objects #defines
+# Ignore statistic "keys" generated for applications #defines
+search=`cat $dl |
sed -e '/configuration section: BEGIN/,/configuration section: END/d' \
-e '/Statistics section: BEGIN/,/Statistics section: END/d' |
egrep '^#define' |
diff --git a/dist/s_define.list b/dist/s_define.list
index 623a34447a8..aaf365a7376 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -16,115 +16,43 @@ TXN_API_CALL
TXN_API_CALL_NOCONF
TXN_API_END
WIN32_LEAN_AND_MEAN
-WT_ATOMIC_ADD1
-WT_ATOMIC_ADD2
-WT_ATOMIC_CAS1
-WT_ATOMIC_CAS2
-WT_ATOMIC_FETCH_ADD1
-WT_ATOMIC_FETCH_ADD2
-WT_ATOMIC_FETCH_ADD4
-WT_ATOMIC_STORE1
-WT_ATOMIC_STORE2
-WT_ATOMIC_SUB1
-WT_ATOMIC_SUB2
+WT_ATOMIC_CAS
+WT_ATOMIC_FUNC
WT_BARRIER
WT_BLOCK_DESC_SIZE
WT_CACHE_LINE_ALIGNMENT
WT_COMPILER_TYPE_ALIGN
WT_CONN_CHECK_PANIC
+WT_COUNTER_SLOTS
WT_DEADLOCK
WT_DEBUG_BYTE
WT_HANDLE_CLOSED
WT_HANDLE_NULLABLE
+WT_LOG_SLOT_ACTIVE
+WT_LOG_SLOT_BITS
+WT_LOG_SLOT_JOIN_MASK
+WT_LOG_SLOT_MASK_OFF
+WT_LOG_SLOT_MASK_ON
+WT_LOG_SLOT_MAXBITS
+WT_LOG_SLOT_UNBUFFERED_ISSET
WT_PACKED_STRUCT_BEGIN
WT_PACKED_STRUCT_END
WT_READ_BARRIER
WT_REF_SIZE
WT_SESSION_LOCKED_CHECKPOINT
-WT_STAT_ATOMIC_DECR
-WT_STAT_ATOMIC_DECRV
-WT_STAT_ATOMIC_INCR
-WT_STAT_ATOMIC_INCRV
+WT_STATS_FIELD_TO_SLOT
+WT_STATS_SLOT_ID
WT_STAT_DECR
WT_STAT_DECRV
-WT_STAT_FAST_ATOMIC_DECR
-WT_STAT_FAST_ATOMIC_DECRV
-WT_STAT_FAST_ATOMIC_INCR
-WT_STAT_FAST_ATOMIC_INCRV
-WT_STAT_FAST_CONN_ATOMIC_DECRV
-WT_STAT_FAST_CONN_ATOMIC_INCRV
WT_STAT_FAST_CONN_DECRV
WT_STAT_FAST_DATA_DECRV
WT_STAT_FAST_DECR
WT_STAT_FAST_DECRV
+WT_STAT_FAST_INCR
WT_STAT_FAST_INCRV
WT_STAT_FAST_SET
+WT_STAT_WRITE
WT_WITH_LOCK
__F
__WIREDTIGER_EXT_H_
__WIREDTIGER_H_
-__WT_ATOMIC_ADD
-__WT_ATOMIC_CAS
-__WT_ATOMIC_FETCH_ADD
-__WT_ATOMIC_STORE
-__WT_ATOMIC_SUB
-
-# List of queue.h #defines that are "unused", but it's OK.
-LIST_EMPTY
-LIST_ENTRY
-LIST_FIRST
-LIST_FOREACH
-LIST_HEAD
-LIST_HEAD_INITIALIZER
-LIST_INIT
-LIST_INSERT_AFTER
-LIST_INSERT_BEFORE
-LIST_INSERT_HEAD
-LIST_NEXT
-LIST_REMOVE
-QMD_TRACE_ELEM
-QMD_TRACE_HEAD
-QUEUE_MACRO_DEBUG
-SLIST_EMPTY
-SLIST_ENTRY
-SLIST_FIRST
-SLIST_FOREACH
-SLIST_FOREACH_PREVPTR
-SLIST_HEAD
-SLIST_HEAD_INITIALIZER
-SLIST_INIT
-SLIST_INSERT_AFTER
-SLIST_INSERT_HEAD
-SLIST_NEXT
-SLIST_REMOVE
-SLIST_REMOVE_HEAD
-STAILQ_CONCAT
-STAILQ_EMPTY
-STAILQ_ENTRY
-STAILQ_FIRST
-STAILQ_FOREACH
-STAILQ_HEAD
-STAILQ_HEAD_INITIALIZER
-STAILQ_INIT
-STAILQ_INSERT_AFTER
-STAILQ_INSERT_HEAD
-STAILQ_INSERT_TAIL
-STAILQ_LAST
-STAILQ_NEXT
-STAILQ_REMOVE
-STAILQ_REMOVE_HEAD
-STAILQ_REMOVE_HEAD_UNTIL
-TAILQ_CONCAT
-TAILQ_EMPTY
-TAILQ_ENTRY
-TAILQ_FOREACH_REVERSE
-TAILQ_HEAD
-TAILQ_HEAD_INITIALIZER
-TAILQ_INSERT_AFTER
-TAILQ_INSERT_BEFORE
-TAILQ_LAST
-TAILQ_NEXT
-TAILQ_PREV
-TRACEBUF
-TRASHIT
-_DB_QUEUE_H_
diff --git a/dist/s_stat b/dist/s_stat
index 152097f14be..44c22ab56bb 100755
--- a/dist/s_stat
+++ b/dist/s_stat
@@ -16,7 +16,7 @@ l="$l `echo ../src/include/*.i`"
(
# Get the list of statistics fields.
search=`sed \
- -e 's/^ WT_STATS \([a-z_*]*\);$/\1/p' \
+ -e 's/^ int64_t \([a-z_*]*\);$/\1/p' \
-e d ../src/include/stat.h |
sort`
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 48c0f7f30f4..bfc4124f74d 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -32,6 +32,7 @@ BIGENDIAN
BOOL
BSR
BTREE
+BUF
BZ
Barack
Bitfield
@@ -156,11 +157,13 @@ KVS
Kanowski's
Kounavis
LANGID
+LAS
LF
LLLLLL
LLLLLLL
LOGREC
LOGSCAN
+LOOKASIDE
LRU
LRVv
LSB
@@ -176,6 +179,7 @@ Levyx
Llqr
Llqrt
LockFile
+Lookaside
Lookup
MALLOC
MEM
@@ -210,6 +214,7 @@ NetBSD
NoAddr
Noll
Nul
+OOB
OPTYPE
OUTBUFF
OVFL
@@ -231,6 +236,7 @@ Preload
Prepend
Qsort
RCS
+RECNO
REF's
REFs
RET
@@ -291,6 +297,7 @@ ULINE
URI
URIs
UTF
+Unbuffered
UnixLib
Unmap
UnmapViewOfFile
@@ -320,6 +327,7 @@ WiredTiger's
WiredTigerCheckpoint
WiredTigerException
WiredTigerInit
+WiredTigerLAS
WiredTigerLog
WiredTigerPreplog
WiredTigerTmplog
@@ -396,6 +404,7 @@ bzalloc
bzfree
bzip
calloc
+cas
catfmt
cd
centric
@@ -494,6 +503,7 @@ desc
dest
destSize
dev
+dh
dhandle
dhandles
dir
@@ -503,6 +513,7 @@ dlh
dll
dlopen
dlsym
+dmalloc
dmsg
doxgen
doxygen
@@ -512,6 +523,7 @@ dsk
dsrc
dst
dstlen
+dstrdup
dsync
dumpcmp
dumpfile
@@ -648,6 +660,7 @@ kvraw
kvs
kvsbdb
lang
+las
latencies
lbrace
lbracket
@@ -675,6 +688,7 @@ logread
logrec
logsize
logtest
+lookaside
lookup
lookups
lossy
@@ -745,6 +759,7 @@ nop
noraw
notfound
notsup
+notused
nset
nsnap
nul
@@ -797,6 +812,7 @@ progname
ps
psp
pthread
+ptr
pushms
putK
putV
@@ -937,6 +953,7 @@ uS
uint
uintmax
unbare
+unbuffered
uncompressing
uncompresssed
undef
@@ -945,6 +962,7 @@ unesc
unescaped
uninstantiated
unistd
+unlinked
unmap
unmarshall
unmarshalled
diff --git a/dist/s_style b/dist/s_style
index e5411748a31..0e013852914 100755
--- a/dist/s_style
+++ b/dist/s_style
@@ -46,6 +46,11 @@ else
cat $t
fi
+ if ! expr "$f" : 'src/include/queue\.h' > /dev/null &&
+ egrep 'STAILQ_|SLIST_|\bLIST_' $f ; then
+ echo "$f: use TAILQ for all lists"
+ fi
+
if ! expr "$f" : 'src/os_posix/.*' > /dev/null &&
! expr "$f" : 'src/os_win/.*' > /dev/null &&
! expr "$f" : 'src/include/extern.h' > /dev/null &&
@@ -69,6 +74,13 @@ else
cat $t
}
+ # Alignment directive before "struct".
+ egrep 'WT_COMPILER_TYPE_ALIGN.*struct' $f > $t
+ test -s $t && {
+		echo "$f: compiler alignment directive must precede \"struct\""
+ cat $t
+ }
+
# Direct calls to functions we're not supposed to use in the library.
# We don't check for all of them, just a few of the common ones.
if ! expr "$f" : 'bench/.*' > /dev/null &&
diff --git a/dist/s_typedef b/dist/s_typedef
index 2e206757f48..233f432f0e5 100755
--- a/dist/s_typedef
+++ b/dist/s_typedef
@@ -25,7 +25,7 @@ build() {
$l |
sed -e 's/WT_PACKED_STRUCT_BEGIN(\(.*\))/struct \1 {/' \
-e 's/WT_COMPILER_TYPE_ALIGN(.*)[ ]*//' \
- -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort | \
+ -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort -u | \
while read t n; do
upper=`echo $n | sed -e 's/^__//' | tr [a-z] [A-Z]`
echo "$t $n;"
diff --git a/dist/s_whitespace b/dist/s_whitespace
index 3a51b251bfe..dfc031e3ea4 100755
--- a/dist/s_whitespace
+++ b/dist/s_whitespace
@@ -4,7 +4,16 @@
t=__wt.$$
trap 'rm -f $t; exit 0' 0 1 2 3 13 15
-ws()
+# Clear lines that only contain whitespace.
+whitespace()
+{
+ sed -e 's/[ ][ ]*$//' < $1 > $t
+ cmp $t $1 > /dev/null 2>&1 || (echo "$1" && cp $t $1)
+}
+
+# Clear lines that only contain whitespace, compress multiple empty lines
+# into a single line, discarding trailing empty lines.
+whitespace_and_empty_line()
{
sed -e 's/[ ][ ]*$//' \
-e '/^$/N' \
@@ -14,10 +23,12 @@ ws()
cd ..
+# Scripts.
for f in `find dist -name '*.py' -name 's_*'`; do
- ws $f
+ whitespace_and_empty_line $f
done
+# C-language sources.
for f in `find examples ext src test \
-name '*.[chi]' -o \
-name '*.dox' -o \
@@ -26,5 +37,11 @@ for f in `find examples ext src test \
if expr "$f" : ".*/Makefile.in" > /dev/null; then
continue
fi
- ws $f
+ whitespace_and_empty_line $f
+done
+
+# Python sources.
+for f in `find test \
+ -name '*.py' | sed '/3rdparty/d'`; do
+ whitespace $f
done
diff --git a/dist/stat.py b/dist/stat.py
index 2a87d4425e6..c9684665a53 100644
--- a/dist/stat.py
+++ b/dist/stat.py
@@ -12,12 +12,11 @@ def print_struct(title, name, base, stats):
f.write('/*\n')
f.write(' * Statistics entries for ' + title + '.\n')
f.write(' */\n')
- f.write(
- '#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
+ f.write('#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
f.write('struct __wt_' + name + '_stats {\n')
for l in stats:
- f.write('\tWT_STATS ' + l.name + ';\n')
+ f.write('\tint64_t ' + l.name + ';\n')
f.write('};\n\n')
# Update the #defines in the stat.h file.
@@ -90,67 +89,113 @@ for line in open('../src/include/wiredtiger.in', 'r'):
f.close()
compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
-def print_func(name, list):
- '''Print the functions for the stat.c file.'''
+def print_func(name, handle, list):
+ '''Print the structures/functions for the stat.c file.'''
+ f.write('\n')
+ f.write('static const char * const __stats_' + name + '_desc[] = {\n')
+ for l in list:
+ f.write('\t"' + l.desc + '",\n')
+ f.write('};\n')
+
+ f.write('''
+const char *
+__wt_stat_''' + name + '''_desc(int slot)
+{
+\treturn (__stats_''' + name + '''_desc[slot]);
+}
+''')
+
f.write('''
void
-__wt_stat_init_''' + name + '''_stats(WT_''' + name.upper() + '''_STATS *stats)
+__wt_stat_''' + name + '_init_single(WT_' + name.upper() + '''_STATS *stats)
{
-\t/* Clear, so can also be called for reinitialization. */
\tmemset(stats, 0, sizeof(*stats));
-
-''')
- for l in sorted(list):
- o = '\tstats->' + l.name + '.desc = "' + l.desc + '";\n'
- if len(o) + 7 > 80:
- o = o.replace('= ', '=\n\t ')
- f.write(o)
- f.write('''}
+}
''')
f.write('''
void
-__wt_stat_refresh_''' + name + '''_stats(void *stats_arg)
+__wt_stat_''' + name + '_init(' + handle + ''' *handle)
{
-\tWT_''' + name.upper() + '''_STATS *stats;
+\tint i;
+
+\tfor (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+\t\thandle->stats[i] = &handle->stat_array[i];
+\t\t__wt_stat_''' + name + '''_init_single(handle->stats[i]);
+\t}
+}
+''')
-\tstats = (WT_''' + name.upper() + '''_STATS *)stats_arg;
+ f.write('''
+void
+__wt_stat_''' + name + '_clear_single(WT_' + name.upper() + '''_STATS *stats)
+{
''')
for l in sorted(list):
# no_clear: don't clear the value.
- if not 'no_clear' in l.flags:
- f.write('\tstats->' + l.name + '.v = 0;\n');
+ if 'no_clear' in l.flags:
+ f.write('\t\t/* not clearing ' + l.name + ' */\n')
+ else:
+ f.write('\tstats->' + l.name + ' = 0;\n')
f.write('}\n')
- # Aggregation is only interesting for data-source statistics.
- # Complain if any aggregation flags are set.
- if name == 'connection':
+ f.write('''
+void
+__wt_stat_''' + name + '_clear_all(WT_' + name.upper() + '''_STATS **stats)
+{
+\tu_int i;
+
+\tfor (i = 0; i < WT_COUNTER_SLOTS; ++i)
+\t\t__wt_stat_''' + name + '''_clear_single(stats[i]);
+}
+''')
+
+ # Single structure aggregation is currently only used by data sources.
+ if name == 'dsrc':
+ f.write('''
+void
+__wt_stat_''' + name + '''_aggregate_single(
+ WT_''' + name.upper() + '_STATS *from, WT_' + name.upper() + '''_STATS *to)
+{
+''')
for l in sorted(list):
- if 'no_aggregate' in l.flags or 'max_aggregate' in l.flags:
- print >>sys.stdout,\
- "Aggregation configuration for " +\
- name + "." + l.name + " statistics not supported"
- return;
+ if 'no_aggregate' in l.flags:
+ o = '\tto->' + l.name + ' = from->' + l.name + ';\n'
+ elif 'max_aggregate' in l.flags:
+ o = '\tif (from->' + l.name + ' > to->' + l.name + ')\n' +\
+ '\t\tto->' + l.name + ' = from->' + l.name + ';\n'
+ else:
+ o = '\tto->' + l.name + ' += from->' + l.name + ';\n'
+ if len(o) > 72: # Account for the leading tab.
+ o = o.replace(' += ', ' +=\n\t ')
+ f.write(o)
+ f.write('}\n')
f.write('''
void
-__wt_stat_aggregate_''' + name +
-'''_stats(const void *child, const void *parent)
+__wt_stat_''' + name + '''_aggregate(
+ WT_''' + name.upper() + '_STATS **from, WT_' + name.upper() + '''_STATS *to)
{
-\tWT_''' + name.upper() + '''_STATS *c, *p;
-
-\tc = (WT_''' + name.upper() + '''_STATS *)child;
-\tp = (WT_''' + name.upper() + '''_STATS *)parent;
''')
+ # Connection level aggregation does not currently have any computation
+ # of a maximum value; I'm leaving in support for it, but don't declare
+ # a temporary variable until it's needed.
+ for l in sorted(list):
+ if 'max_aggregate' in l.flags:
+ f.write('\tint64_t v;\n\n')
+ break;
for l in sorted(list):
if 'no_aggregate' in l.flags:
- continue;
+ o = '\tto->' + l.name + ' = from[0]->' + l.name + ';\n'
elif 'max_aggregate' in l.flags:
- o = 'if (c->' + l.name + '.v > p->' + l.name +\
- '.v)\n\t p->' + l.name + '.v = c->' + l.name + '.v;'
+ o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) >\n' +\
+ '\t to->' + l.name + ')\n' +\
+ '\t\tto->' + l.name + ' = v;\n'
else:
- o = 'p->' + l.name + '.v += c->' + l.name + '.v;'
- f.write('\t' + o + '\n')
+ o = '\tto->' + l.name + ' += WT_STAT_READ(from, ' + l.name + ');\n'
+ if len(o) > 72: # Account for the leading tab.
+ o = o.replace(' += ', ' +=\n\t ')
+ f.write(o)
f.write('}\n')
# Write the stat initialization and refresh routines to the stat.c file.
@@ -158,12 +203,11 @@ f = open(tmp_file, 'w')
f.write('/* DO NOT EDIT: automatically built by dist/stat.py. */\n\n')
f.write('#include "wt_internal.h"\n')
-print_func('dsrc', dsrc_stats)
-print_func('connection', connection_stats)
+print_func('dsrc', 'WT_DATA_HANDLE', dsrc_stats)
+print_func('connection', 'WT_CONNECTION_IMPL', connection_stats)
f.close()
compare_srcfile(tmp_file, '../src/support/stat.c')
-
# Update the statlog file with the entries we can scale per second.
scale_info = 'no_scale_per_second_list = [\n'
clear_info = 'no_clear_list = [\n'
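
The regenerated functions above reflect the statistics redesign running through this merge: counters become plain int64_t fields replicated across WT_COUNTER_SLOTS copies per handle, and the aggregate functions read them back through WT_STAT_READ. A sketch of the striped-counter idea, assuming WT_STAT_READ sums a field across the slots as the generated aggregation code suggests (the slot count and field name here are illustrative, not WiredTiger's):

    #include <stdint.h>

    #define COUNTER_SLOTS 8    /* illustrative; WT uses WT_COUNTER_SLOTS */

    struct my_stats { int64_t cache_read; };

    /* Writers touch only their own slot, avoiding cache-line contention. */
    static void
    incr_cache_read(struct my_stats **stats, int slot)
    {
        stats[slot % COUNTER_SLOTS]->cache_read++;
    }

    /* Readers pay the cost instead: sum the field across all slots. */
    static int64_t
    read_cache_read(struct my_stats **stats)
    {
        int64_t sum;
        int i;

        for (sum = 0, i = 0; i < COUNTER_SLOTS; ++i)
            sum += stats[i]->cache_read;
        return (sum);
    }
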
diff --git a/dist/stat_data.py b/dist/stat_data.py
index caf68364696..c91fc921380 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -7,14 +7,21 @@
# currently open'.
# NOTE: All statistics descriptions must have a prefix string followed by ':'.
#
-# Optional configuration flags:
-# no_clear Value not cleared when statistics cleared
-# no_scale Don't scale value per second in the logging tool script
-#
# Data-source statistics are normally aggregated across the set of underlying
# objects. Additional optional configuration flags are available:
# no_aggregate Ignore the value when aggregating statistics
# max_aggregate Take the maximum value when aggregating statistics
+#
+# Optional configuration flags:
+# no_clear Value not cleared when statistics cleared
+# no_scale Don't scale value per second in the logging tool script
+#
+# The no_clear flag is a little complicated: it means we don't clear the values
+# when resetting statistics after each run (necessary when the WiredTiger engine
+# is updating values that persist over multiple runs, for example the count of
+# cursors), but it also causes the underlying display routines to not treat the
+# change between displays as relative to the number of seconds, that is, it's an
+# absolute value. The no_clear flag should be set in either case.
from operator import attrgetter
import sys
@@ -120,9 +127,9 @@ connection_stats = [
AsyncStat('async_alloc_race', 'number of allocation state races'),
AsyncStat('async_alloc_view',
'number of operation slots viewed for allocation'),
+ AsyncStat('async_cur_queue', 'current work queue length'),
AsyncStat('async_flush', 'number of flush calls'),
AsyncStat('async_full', 'number of times operation allocation failed'),
- AsyncStat('async_cur_queue', 'current work queue length'),
AsyncStat('async_max_queue',
'maximum work queue length', 'no_clear,no_scale'),
AsyncStat('async_nowork', 'number of times worker found no work'),
@@ -149,11 +156,11 @@ connection_stats = [
##########################################
CacheStat('cache_bytes_dirty',
'tracked dirty bytes in the cache', 'no_clear,no_scale'),
- CacheStat('cache_bytes_inuse',
- 'bytes currently in the cache', 'no_clear,no_scale'),
CacheStat('cache_bytes_internal',
'tracked bytes belonging to internal pages in the cache',
'no_clear,no_scale'),
+ CacheStat('cache_bytes_inuse',
+ 'bytes currently in the cache', 'no_clear,no_scale'),
CacheStat('cache_bytes_leaf',
'tracked bytes belonging to leaf pages in the cache',
'no_clear,no_scale'),
@@ -165,11 +172,11 @@ connection_stats = [
CacheStat('cache_bytes_read', 'bytes read into cache'),
CacheStat('cache_bytes_write', 'bytes written from cache'),
CacheStat('cache_eviction_app', 'pages evicted by application threads'),
+ CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
CacheStat('cache_eviction_clean', 'unmodified pages evicted'),
CacheStat('cache_eviction_deepen',
'page split during eviction deepened the tree'),
CacheStat('cache_eviction_dirty', 'modified pages evicted'),
- CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
CacheStat('cache_eviction_fail',
'pages selected for eviction unable to be evicted'),
CacheStat('cache_eviction_force',
@@ -197,21 +204,35 @@ connection_stats = [
CacheStat('cache_eviction_worker_evicting',
'eviction worker thread evicting pages'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
+ CacheStat('cache_inmem_splittable',
+ 'in-memory page passed criteria to be split'),
+ CacheStat('cache_lookaside_insert', 'lookaside table insert calls'),
+ CacheStat('cache_lookaside_remove', 'lookaside table remove calls'),
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
CacheStat('cache_pages_dirty',
'tracked dirty pages in the cache', 'no_clear,no_scale'),
CacheStat('cache_pages_inuse',
'pages currently held in the cache', 'no_clear,no_scale'),
CacheStat('cache_read', 'pages read into cache'),
+ CacheStat('cache_read_lookaside',
+ 'pages read into cache requiring lookaside entries'),
CacheStat('cache_write', 'pages written from cache'),
+ CacheStat('cache_write_lookaside',
+ 'page written requiring lookaside records'),
+ CacheStat('cache_write_restore',
+ 'pages written requiring in-memory restoration'),
##########################################
# Dhandle statistics
##########################################
- DhandleStat('dh_conn_handles', 'connection dhandles swept'),
- DhandleStat('dh_conn_ref', 'connection candidate referenced'),
- DhandleStat('dh_conn_sweeps', 'connection sweeps'),
- DhandleStat('dh_conn_tod', 'connection time-of-death sets'),
+ DhandleStat('dh_conn_handle_count',
+ 'connection data handles currently active', 'no_clear,no_scale'),
+ DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'),
+ DhandleStat('dh_sweep_remove',
+ 'connection sweep dhandles removed from hash list'),
+ DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'),
+ DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'),
+ DhandleStat('dh_sweeps', 'connection sweeps'),
DhandleStat('dh_session_handles', 'session dhandles swept'),
DhandleStat('dh_session_sweeps', 'session sweep attempts'),
@@ -225,8 +246,8 @@ connection_stats = [
LogStat('log_compress_len', 'total size of compressed records'),
LogStat('log_compress_mem', 'total in-memory size of compressed records'),
LogStat('log_compress_small', 'log records too small to compress'),
- LogStat('log_compress_writes', 'log records compressed'),
LogStat('log_compress_write_fails', 'log records not compressed'),
+ LogStat('log_compress_writes', 'log records compressed'),
LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'),
LogStat('log_prealloc_files', 'pre-allocated log files prepared'),
LogStat('log_prealloc_max',
@@ -236,20 +257,18 @@ connection_stats = [
LogStat('log_scan_records', 'records processed by log scan'),
LogStat('log_scan_rereads', 'log scan records requiring two reads'),
LogStat('log_scans', 'log scan operations'),
- LogStat('log_sync', 'log sync operations'),
- LogStat('log_sync_dir', 'log sync_dir operations'),
- LogStat('log_writes', 'log write operations'),
- LogStat('log_write_lsn', 'log server thread advances write LSN'),
-
+ LogStat('log_slot_closes', 'consolidated slot closures'),
LogStat('log_slot_coalesced', 'written slots coalesced'),
LogStat('log_slot_consolidated', 'logging bytes consolidated'),
- LogStat('log_slot_closes', 'consolidated slot closures'),
LogStat('log_slot_joins', 'consolidated slot joins'),
LogStat('log_slot_races', 'consolidated slot join races'),
- LogStat('log_slot_toobig', 'record size exceeded maximum'),
- LogStat('log_slot_toosmall',
- 'failed to find a slot large enough for record'),
+ LogStat('log_slot_switch_busy', 'busy returns attempting to switch slots'),
LogStat('log_slot_transitions', 'consolidated slot join transitions'),
+ LogStat('log_slot_unbuffered', 'consolidated slot unbuffered writes'),
+ LogStat('log_sync', 'log sync operations'),
+ LogStat('log_sync_dir', 'log sync_dir operations'),
+ LogStat('log_write_lsn', 'log server thread advances write LSN'),
+ LogStat('log_writes', 'log write operations'),
##########################################
# Reconciliation statistics
@@ -268,6 +287,8 @@ connection_stats = [
TxnStat('txn_checkpoint', 'transaction checkpoints'),
TxnStat('txn_checkpoint_generation',
'transaction checkpoint generation', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_running',
+ 'transaction checkpoint currently running', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_time_max',
'transaction checkpoint max time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_time_min',
@@ -276,17 +297,16 @@ connection_stats = [
'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_time_total',
'transaction checkpoint total time (msecs)', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_running',
- 'transaction checkpoint currently running', 'no_clear,no_scale'),
+ TxnStat('txn_commit', 'transactions committed'),
+ TxnStat('txn_fail_cache',
+ 'transaction failures due to cache overflow'),
TxnStat('txn_pinned_checkpoint_range',
'transaction range of IDs currently pinned by a checkpoint',
- 'no_clear,no_scale'),
+ 'no_clear,no_scale'),
TxnStat('txn_pinned_range',
'transaction range of IDs currently pinned', 'no_clear,no_scale'),
- TxnStat('txn_sync', 'transaction sync calls'),
- TxnStat('txn_commit', 'transactions committed'),
- TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'),
TxnStat('txn_rollback', 'transactions rolled back'),
+ TxnStat('txn_sync', 'transaction sync calls'),
##########################################
# LSM statistics
@@ -322,6 +342,7 @@ connection_stats = [
CursorStat('cursor_prev', 'cursor prev calls'),
CursorStat('cursor_remove', 'cursor remove calls'),
CursorStat('cursor_reset', 'cursor reset calls'),
+ CursorStat('cursor_restart', 'cursor restarted searches'),
CursorStat('cursor_search', 'cursor search calls'),
CursorStat('cursor_search_near', 'cursor search near calls'),
CursorStat('cursor_update', 'cursor update calls'),
@@ -362,6 +383,7 @@ dsrc_stats = [
CursorStat('cursor_remove', 'remove calls'),
CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed'),
CursorStat('cursor_reset', 'reset calls'),
+ CursorStat('cursor_restart', 'restarted searches'),
CursorStat('cursor_search', 'search calls'),
CursorStat('cursor_search_near', 'search near calls'),
CursorStat('cursor_update', 'update calls'),
@@ -378,6 +400,8 @@ dsrc_stats = [
'column-store fixed-size leaf pages', 'no_scale'),
BtreeStat('btree_column_internal',
'column-store internal pages', 'no_scale'),
+ BtreeStat('btree_column_rle',
+ 'column-store variable-size RLE encoded values', 'no_scale'),
BtreeStat('btree_column_variable',
'column-store variable-size leaf pages', 'no_scale'),
BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'),
@@ -421,9 +445,9 @@ dsrc_stats = [
##########################################
# Block manager statistics
##########################################
- BlockStat('block_alloc', 'blocks allocated'),
BlockStat('allocation_size',
'file allocation unit size', 'no_aggregate,no_scale'),
+ BlockStat('block_alloc', 'blocks allocated'),
BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'),
BlockStat('block_extension', 'allocations requiring file extension'),
BlockStat('block_free', 'blocks freed'),
@@ -450,20 +474,28 @@ dsrc_stats = [
CacheStat('cache_eviction_internal', 'internal pages evicted'),
CacheStat('cache_eviction_split', 'pages split during eviction'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
+ CacheStat('cache_inmem_splittable',
+ 'in-memory page passed criteria to be split'),
CacheStat('cache_overflow_value',
'overflow values cached in memory', 'no_scale'),
CacheStat('cache_read', 'pages read into cache'),
+ CacheStat('cache_read_lookaside',
+ 'pages read into cache requiring lookaside entries'),
CacheStat('cache_read_overflow', 'overflow pages read into cache'),
CacheStat('cache_write', 'pages written from cache'),
+ CacheStat('cache_write_lookaside',
+ 'page written requiring lookaside records'),
+ CacheStat('cache_write_restore',
+ 'pages written requiring in-memory restoration'),
##########################################
# Compression statistics
##########################################
- CompressStat('compress_raw_ok', 'raw compression call succeeded'),
CompressStat('compress_raw_fail',
'raw compression call failed, no additional data available'),
CompressStat('compress_raw_fail_temporary',
'raw compression call failed, additional data available'),
+ CompressStat('compress_raw_ok', 'raw compression call succeeded'),
CompressStat('compress_read', 'compressed pages read'),
CompressStat('compress_write', 'compressed pages written'),
CompressStat('compress_write_fail', 'page written failed to compress'),
@@ -474,21 +506,21 @@ dsrc_stats = [
# Reconciliation statistics
##########################################
RecStat('rec_dictionary', 'dictionary matches'),
+ RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
+ RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
+ RecStat('rec_multiblock_max',
+ 'maximum blocks required for a page', 'max_aggregate,no_scale'),
RecStat('rec_overflow_key_internal', 'internal-page overflow keys'),
RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'),
RecStat('rec_overflow_value', 'overflow values written'),
- RecStat('rec_page_match', 'page checksum matches'),
RecStat('rec_page_delete', 'pages deleted'),
+ RecStat('rec_page_match', 'page checksum matches'),
RecStat('rec_pages', 'page reconciliation calls'),
RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
RecStat('rec_prefix_compression',
'leaf page key bytes discarded using prefix compression'),
RecStat('rec_suffix_compression',
'internal page key bytes discarded using suffix compression'),
- RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
- RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
- RecStat('rec_multiblock_max',
- 'maximum blocks required for a page', 'max_aggregate,no_scale'),
##########################################
# Transaction statistics
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 6905169c4c2..213e058d4cc 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -1064,7 +1064,8 @@ main(void)
home = NULL;
/*! [Open a connection] */
- ret = wiredtiger_open(home, NULL, "create,cache_size=500M", &conn);
+ ret = wiredtiger_open(home, NULL,
+ "create,cache_size=5GB,log=(enabled,recover=on)", &conn);
/*! [Open a connection] */
if (ret == 0)
diff --git a/examples/c/ex_log.c b/examples/c/ex_log.c
index 136cca900cd..d5a8f32487d 100644
--- a/examples/c/ex_log.c
+++ b/examples/c/ex_log.c
@@ -128,20 +128,22 @@ print_record(WT_LSN *lsn, uint32_t opcount,
* A simple walk of the log.
*/
static int
-simple_walk_log(WT_SESSION *session)
+simple_walk_log(WT_SESSION *session, int count_min)
{
WT_CURSOR *cursor;
WT_LSN lsn;
WT_ITEM logrec_key, logrec_value;
uint64_t txnid;
uint32_t fileid, opcount, optype, rectype;
- int ret;
+ int count, ret;
/*! [log cursor open] */
ret = session->open_cursor(session, "log:", NULL, NULL, &cursor);
/*! [log cursor open] */
+ count = 0;
while ((ret = cursor->next(cursor)) == 0) {
+ count++;
/*! [log cursor get_key] */
ret = cursor->get_key(cursor, &lsn.file, &lsn.offset, &opcount);
/*! [log cursor get_key] */
@@ -156,6 +158,12 @@ simple_walk_log(WT_SESSION *session)
if (ret == WT_NOTFOUND)
ret = 0;
ret = cursor->close(cursor);
+ if (count < count_min) {
+ fprintf(stderr,
+ "Expected minimum %d records, found %d\n",
+ count_min, count);
+ abort();
+ }
return (ret);
}
/*! [log cursor walk] */
@@ -206,11 +214,13 @@ walk_log(WT_SESSION *session)
/*
* If the operation is a put, replay it here on the backup
- * connection. Note, we cheat by looking only for fileid 1
- * in this example. The metadata is fileid 0.
+ * connection.
+ *
+ * !!!
+ * Minor cheat: the metadata is fileid 0, skip its records.
*/
- if (fileid == 1 && rectype == WT_LOGREC_COMMIT &&
- optype == WT_LOGOP_ROW_PUT) {
+ if (fileid != 0 &&
+ rectype == WT_LOGREC_COMMIT && optype == WT_LOGOP_ROW_PUT) {
if (!in_txn) {
ret = session2->begin_transaction(session2,
NULL);
@@ -276,9 +286,10 @@ main(void)
WT_CONNECTION *wt_conn;
WT_CURSOR *cursor;
WT_SESSION *session;
- int i, record_count, ret;
+ int count_min, i, record_count, ret;
char cmd_buf[256], k[16], v[16];
+ count_min = 0;
snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s %s && mkdir %s %s",
home1, home2, home1, home2);
if ((ret = system(cmd_buf)) != 0) {
@@ -293,6 +304,7 @@ main(void)
ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
ret = session->create(session, uri, "key_format=S,value_format=S");
+ count_min++;
ret = session->open_cursor(session, uri, NULL, NULL, &cursor);
/*
@@ -304,6 +316,7 @@ main(void)
cursor->set_key(cursor, k);
cursor->set_value(cursor, v);
ret = cursor->insert(cursor);
+ count_min++;
}
ret = session->begin_transaction(session, NULL);
/*
@@ -317,10 +330,12 @@ main(void)
ret = cursor->insert(cursor);
}
ret = session->commit_transaction(session, NULL);
+ count_min++;
ret = cursor->close(cursor);
/*! [log cursor printf] */
ret = session->log_printf(session, "Wrote %d records", record_count);
+ count_min++;
/*! [log cursor printf] */
/*
@@ -336,7 +351,7 @@ main(void)
}
ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
- ret = simple_walk_log(session);
+ ret = simple_walk_log(session, count_min);
ret = walk_log(session);
ret = wt_conn->close(wt_conn, NULL);
return (ret);
diff --git a/examples/python/ex_access.py b/examples/python/ex_access.py
index 8eeefd56cf7..2940ac63625 100755
--- a/examples/python/ex_access.py
+++ b/examples/python/ex_access.py
@@ -50,6 +50,6 @@ cursor.insert()
# Iterate through the records
cursor.reset()
for key, value in cursor:
- print('Got record: ' + key + ' : ' + value)
+ print('Got record: %s : %s' % (key, value))
conn.close()
diff --git a/examples/python/ex_stat.py b/examples/python/ex_stat.py
index e27177403cc..af2c4f7a1a7 100755
--- a/examples/python/ex_stat.py
+++ b/examples/python/ex_stat.py
@@ -32,6 +32,7 @@
import os
from wiredtiger import wiredtiger_open,WIREDTIGER_VERSION_STRING,stat
+
def main():
# Create a clean test directory for this run of the test program
os.system('rm -rf WT_HOME')
@@ -39,16 +40,16 @@ def main():
# Connect to the database and open a session
conn = wiredtiger_open('WT_HOME', 'create,statistics=(all)')
session = conn.open_session()
-
+
# Create a simple table
session.create('table:access', 'key_format=S,value_format=S')
-
+
# Open a cursor and insert a record
cursor = session.open_cursor('table:access', None)
- cursor['key'] = 'value'
+ cursor['key'] = 'value'
cursor.close()
-
+
session.checkpoint()
print WIREDTIGER_VERSION_STRING
print_database_stats(session)
@@ -57,46 +58,51 @@ def main():
print_derived_stats(session)
conn.close()
+
def print_database_stats(session):
statcursor = session.open_cursor("statistics:")
print_cursor(statcursor)
statcursor.close()
+
def print_file_stats(session):
fstatcursor = session.open_cursor("statistics:table:access")
print_cursor(fstatcursor)
fstatcursor.close()
+
def print_overflow_pages(session):
ostatcursor = session.open_cursor("statistics:table:access")
val = ostatcursor[stat.dsrc.btree_overflow]
- if val != 0 :
- print str(val[0]) + '=' + str(val[1])
+ if val != 0:
+ print '%s=%s' % (str(val[0]), str(val[1]))
ostatcursor.close()
+
def print_derived_stats(session):
dstatcursor = session.open_cursor("statistics:table:access")
ckpt_size = dstatcursor[stat.dsrc.block_checkpoint_size][1]
file_size = dstatcursor[stat.dsrc.block_size][1]
percent = 0
- if file_size != 0 :
+ if file_size != 0:
percent = 100 * ((float(file_size) - float(ckpt_size)) / float(file_size))
- print "Table is %" + str(percent) + " fragmented"
+ print "Table is %%%s fragmented" % str(percent)
app_insert = int(dstatcursor[stat.dsrc.cursor_insert_bytes][1])
app_remove = int(dstatcursor[stat.dsrc.cursor_remove_bytes][1])
app_update = int(dstatcursor[stat.dsrc.cursor_update_bytes][1])
- fs_writes = int(dstatcursor[stat.dsrc.cache_bytes_write][1])
+ fs_writes = int(dstatcursor[stat.dsrc.cache_bytes_write][1])
- if(app_insert + app_remove + app_update != 0):
+ if app_insert + app_remove + app_update != 0:
print "Write amplification is " + '{:.2f}'.format(fs_writes / (app_insert + app_remove + app_update))
dstatcursor.close()
+
def print_cursor(mycursor):
while mycursor.next() == 0:
val = mycursor.get_value()
- if val[1] != '0' :
- print str(val[0]) + '=' + str(val[1])
+ if val[1] != '0':
+ print '%s=%s' % (str(val[0]), str(val[1]))
if __name__ == "__main__":
main()
diff --git a/ext/encryptors/rotn/rotn_encrypt.c b/ext/encryptors/rotn/rotn_encrypt.c
index 503dcae83a7..5b29e66c503 100644
--- a/ext/encryptors/rotn/rotn_encrypt.c
+++ b/ext/encryptors/rotn/rotn_encrypt.c
@@ -68,7 +68,7 @@
typedef struct {
WT_ENCRYPTOR encryptor; /* Must come first */
- WT_EXTENSION_API *wt_api; /* Extension API */
+ WT_EXTENSION_API *wtext; /* Extension API */
int rot_N; /* rotN value */
char *keyid; /* Saved keyid */
@@ -76,6 +76,7 @@ typedef struct {
u_char *shift_forw; /* Encrypt shift data from secretkey */
u_char *shift_back; /* Decrypt shift data from secretkey */
size_t shift_len; /* Length of shift* byte arrays */
+ int force_error; /* Force a decrypt error for testing */
} ROTN_ENCRYPTOR;
/*! [WT_ENCRYPTOR initialization structure] */
@@ -84,6 +85,22 @@ typedef struct {
#define IV_LEN 16
/*
+ * rotn_error --
+ * Display an error from this module in a standard way.
+ */
+static int
+rotn_error(ROTN_ENCRYPTOR *encryptor, WT_SESSION *session, int err,
+ const char *msg)
+{
+ WT_EXTENSION_API *wtext;
+
+ wtext = encryptor->wtext;
+ (void)wtext->err_printf(wtext, session,
+ "rotn encryption: %s: %s", msg, wtext->strerror(wtext, NULL, err));
+ return (err);
+}
+
+/*
* make_cksum --
* This is where one would call a checksum function on the encrypted
* buffer. Here we just put a constant value in it.
@@ -221,13 +238,18 @@ rotn_decrypt(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
(void)session; /* Unused */
/*
+ * For certain tests, force an error we can recognize.
+ */
+ if (rotn_encryptor->force_error)
+ return (-1000);
+
+ /*
* Make sure it is big enough.
*/
mylen = src_len - (CHKSUM_LEN + IV_LEN);
- if (dst_len < mylen) {
- fprintf(stderr, "Rotate: ENOMEM ERROR\n");
- return (ENOMEM);
- }
+ if (dst_len < mylen)
+ return (rotn_error(rotn_encryptor, session,
+ ENOMEM, "decrypt buffer not big enough"));
/*
* !!! Most implementations would verify the checksum here.
@@ -286,7 +308,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
const ROTN_ENCRYPTOR *orig;
ROTN_ENCRYPTOR *rotn_encryptor;
WT_CONFIG_ITEM keyid, secret;
- WT_EXTENSION_API *wt_api;
+ WT_EXTENSION_API *wtext;
size_t i, len;
int ret, keyid_val;
u_char base;
@@ -295,7 +317,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
keyid_val = 0;
orig = (const ROTN_ENCRYPTOR *)encryptor;
- wt_api = orig->wt_api;
+ wtext = orig->wtext;
if ((rotn_encryptor = calloc(1, sizeof(ROTN_ENCRYPTOR))) == NULL)
return (errno);
@@ -305,7 +327,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
/*
* Stash the keyid from the configuration string.
*/
- if ((ret = wt_api->config_get(wt_api, session, encrypt_config,
+ if ((ret = wtext->config_get(wtext, session, encrypt_config,
"keyid", &keyid)) == 0 && keyid.len != 0) {
/*
* In this demonstration, we expect keyid to be a number.
@@ -327,7 +349,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
* We stash the secret key from the configuration string
* and build some shift bytes to make encryption/decryption easy.
*/
- if ((ret = wt_api->config_get(wt_api, session, encrypt_config,
+ if ((ret = wtext->config_get(wtext, session, encrypt_config,
"secretkey", &secret)) == 0 && secret.len != 0) {
len = secret.len;
if ((rotn_encryptor->secretkey = malloc(len + 1)) == NULL ||
@@ -396,6 +418,53 @@ rotn_terminate(WT_ENCRYPTOR *encryptor, WT_SESSION *session)
}
/*! [WT_ENCRYPTOR terminate] */
+/*
+ * rotn_configure --
+ * WiredTiger no-op encryption configuration.
+ */
+static int
+rotn_configure(ROTN_ENCRYPTOR *rotn_encryptor, WT_CONFIG_ARG *config)
+{
+ WT_CONFIG_ITEM k, v;
+ WT_CONFIG_PARSER *config_parser;
+ WT_EXTENSION_API *wtext; /* Extension API */
+ int ret, t_ret;
+
+ wtext = rotn_encryptor->wtext;
+
+ /* Get the configuration string. */
+ if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0)
+ return (rotn_error(rotn_encryptor, NULL, ret,
+ "WT_EXTENSION_API.config_get"));
+
+ /* Step through the list of configuration options. */
+ if ((ret = wtext->config_parser_open(
+ wtext, NULL, v.str, v.len, &config_parser)) != 0)
+ return (rotn_error(rotn_encryptor, NULL, ret,
+ "WT_EXTENSION_API.config_parser_open"));
+
+ while ((ret = config_parser->next(config_parser, &k, &v)) == 0) {
+ if (strncmp("rotn_force_error", k.str, k.len) == 0 &&
+ strlen("rotn_force_error") == k.len) {
+ rotn_encryptor->force_error = v.val == 0 ? 0 : 1;
+ continue;
+		} else {
+ (void)config_parser->close(config_parser);
+ return (rotn_error(rotn_encryptor, NULL, EINVAL,
+ "unknown config key"));
+ }
+ }
+ if ((t_ret = config_parser->close(config_parser)) != 0)
+ return (rotn_error(rotn_encryptor, NULL, t_ret,
+ "WT_CONFIG_PARSER.close"));
+ if (ret != WT_NOTFOUND)
+ return (rotn_error(rotn_encryptor, NULL, ret,
+ "WT_CONFIG_PARSER.next"));
+
+ return (0);
+}
+
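rotn_configure() reads a "config" list nested inside the extension's own configuration. A hedged example of passing it at wiredtiger_open() time; the exact extensions= syntax is assumed from WiredTiger's extension loading and is not shown in this patch:

    /* Assumed usage: load the rotn extension with the test hook set. */
    ret = wiredtiger_open(home, NULL,
        "create,"
        "extensions=[\"./rotn.so=(config=[rotn_force_error=true])\"]",
        &conn);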
/*! [WT_ENCRYPTOR initialization function] */
/*
* wiredtiger_extension_init --
@@ -405,8 +474,7 @@ int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
ROTN_ENCRYPTOR *rotn_encryptor;
-
- (void)config; /* Unused parameters */
+ int ret;
if ((rotn_encryptor = calloc(1, sizeof(ROTN_ENCRYPTOR))) == NULL)
return (errno);
@@ -423,9 +491,12 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
rotn_encryptor->encryptor.sizing = rotn_sizing;
rotn_encryptor->encryptor.customize = rotn_customize;
rotn_encryptor->encryptor.terminate = rotn_terminate;
+ rotn_encryptor->wtext = connection->get_extension_api(connection);
- rotn_encryptor->wt_api = connection->get_extension_api(connection);
-
+ if ((ret = rotn_configure(rotn_encryptor, config)) != 0) {
+ free(rotn_encryptor);
+ return (ret);
+ }
/* Load the encryptor */
return (connection->add_encryptor(
connection, "rotn", (WT_ENCRYPTOR *)rotn_encryptor, NULL));
diff --git a/ext/extractors/csv/csv_extractor.c b/ext/extractors/csv/csv_extractor.c
index 0dd110955ad..34b8d7c7c64 100644
--- a/ext/extractors/csv/csv_extractor.c
+++ b/ext/extractors/csv/csv_extractor.c
@@ -128,7 +128,7 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
return (errno);
*csv_extractor = *orig;
- csv_extractor->field_num = field_num;
+ csv_extractor->field_num = (int)field_num;
*customp = (WT_EXTRACTOR *)csv_extractor;
return (0);
}
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 44e492cb0e5..416c3c84f7b 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -43,7 +43,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
* is a possibility a duplicate entry might be inserted, but
* that is not harmful.
*/
- STAILQ_FOREACH(af, &async->formatqh, q) {
+ TAILQ_FOREACH(af, &async->formatqh, q) {
if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash)
goto setup;
}
@@ -71,7 +71,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
WT_ERR(c->close(c));
c = NULL;
- STAILQ_INSERT_HEAD(&async->formatqh, af, q);
+ TAILQ_INSERT_HEAD(&async->formatqh, af, q);
__wt_spin_unlock(session, &async->ops_lock);
WT_ERR(wt_session->close(wt_session, NULL));
@@ -151,15 +151,16 @@ retry:
* If we can set the state then the op entry is ours.
* Start the next search at the next entry after this one.
*/
- if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
+ if (!__wt_atomic_cas32(&op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
WT_STAT_FAST_CONN_INCR(session, async_alloc_race);
goto retry;
}
WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view);
WT_RET(__async_get_format(conn, uri, config, op));
- op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1);
+ op->unique_id = __wt_atomic_add64(&async->op_id, 1);
op->optype = WT_AOP_NONE;
- (void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size);
+ (void)__wt_atomic_store32(
+ &async->ops_index, (i + 1) % conn->async_size);
*opp = op;
return (0);
}
@@ -206,15 +207,15 @@ __wt_async_stats_update(WT_SESSION_IMPL *session)
{
WT_ASYNC *async;
WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS *stats;
+ WT_CONNECTION_STATS **stats;
conn = S2C(session);
async = conn->async;
if (async == NULL)
return;
- stats = &conn->stats;
- WT_STAT_SET(stats, async_cur_queue, async->cur_queue);
- WT_STAT_SET(stats, async_max_queue, async->max_queue);
+ stats = conn->stats;
+ WT_STAT_SET(session, stats, async_cur_queue, async->cur_queue);
+ WT_STAT_SET(session, stats, async_max_queue, async->max_queue);
F_SET(conn, WT_CONN_SERVER_ASYNC);
}
@@ -237,7 +238,7 @@ __async_start(WT_SESSION_IMPL *session)
*/
WT_RET(__wt_calloc_one(session, &conn->async));
async = conn->async;
- STAILQ_INIT(&async->formatqh);
+ TAILQ_INIT(&async->formatqh);
WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond));
WT_RET(__wt_async_op_init(session));
@@ -461,9 +462,9 @@ __wt_async_destroy(WT_SESSION_IMPL *session)
}
/* Free format resources */
- af = STAILQ_FIRST(&async->formatqh);
+ af = TAILQ_FIRST(&async->formatqh);
while (af != NULL) {
- afnext = STAILQ_NEXT(af, q);
+ afnext = TAILQ_NEXT(af, q);
__wt_free(session, af->uri);
__wt_free(session, af->config);
__wt_free(session, af->key_format);
@@ -514,7 +515,7 @@ retry:
*/
__wt_sleep(0, 100000);
- if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE,
+ if (!__wt_atomic_cas32(&async->flush_state, WT_ASYNC_FLUSH_NONE,
WT_ASYNC_FLUSH_IN_PROGRESS))
goto retry;
/*
@@ -524,7 +525,7 @@ retry:
* things off the work queue with the lock.
*/
async->flush_count = 0;
- (void)WT_ATOMIC_ADD8(async->flush_gen, 1);
+ (void)__wt_atomic_add64(&async->flush_gen, 1);
WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE);
async->flush_op.state = WT_ASYNCOP_READY;
WT_ERR(__wt_async_op_enqueue(session, &async->flush_op));
diff --git a/src/async/async_op.c b/src/async/async_op.c
index d0c58f584cc..469dbc8e615 100644
--- a/src/async/async_op.c
+++ b/src/async/async_op.c
@@ -237,7 +237,7 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id)
asyncop->c.set_key = __wt_cursor_set_key;
asyncop->c.get_value = __wt_cursor_get_value;
asyncop->c.set_value = __wt_cursor_set_value;
- asyncop->c.recno = 0;
+ asyncop->c.recno = WT_RECNO_OOB;
memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf));
memset(&asyncop->c.key, 0, sizeof(asyncop->c.key));
memset(&asyncop->c.value, 0, sizeof(asyncop->c.value));
@@ -280,7 +280,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
* Enqueue op at the tail of the work queue.
* We get our slot in the ring buffer to use.
*/
- my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
+ my_alloc = __wt_atomic_add64(&async->alloc_head, 1);
my_slot = my_alloc % async->async_qsize;
/*
@@ -300,7 +300,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
#endif
WT_PUBLISH(async->async_queue[my_slot], op);
op->state = WT_ASYNCOP_ENQUEUED;
- if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
+ if (__wt_atomic_add32(&async->cur_queue, 1) > async->max_queue)
WT_PUBLISH(async->max_queue, async->cur_queue);
/*
* Multiple threads may be adding ops to the queue. We need to wait
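The enqueue path above claims a slot with a single atomic add: the returned ticket is globally unique, and ticket % qsize names the producer's slot. The shape of the pattern, reduced to a hedged sketch with hypothetical names (__sync_add_and_fetch stands in for __wt_atomic_add64):

    /* Hypothetical ring: each producer takes the next ticket, then
     * owns slot (ticket % qsize). */
    uint64_t ticket, slot;

    ticket = __sync_add_and_fetch(&ring->head, 1);
    slot = ticket % ring->qsize;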
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index 4f372d05d19..6a5ec5feeb0 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -67,7 +67,7 @@ retry:
* a race, try again.
*/
my_consume = last_consume + 1;
- if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume))
+ if (!__wt_atomic_cas64(&async->alloc_tail, last_consume, my_consume))
goto retry;
/*
* This item of work is ours to process. Clear it out of the
@@ -81,7 +81,7 @@ retry:
WT_ASSERT(session, async->cur_queue > 0);
WT_ASSERT(session, *op != NULL);
WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED);
- (void)WT_ATOMIC_SUB4(async->cur_queue, 1);
+ (void)__wt_atomic_sub32(&async->cur_queue, 1);
(*op)->state = WT_ASYNCOP_WORKING;
if (*op == &async->flush_op)
@@ -135,7 +135,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
if (op->optype == WT_AOP_COMPACT)
return (0);
WT_ASSERT(session, op->format != NULL);
- STAILQ_FOREACH(ac, &worker->cursorqh, q) {
+ TAILQ_FOREACH(ac, &worker->cursorqh, q) {
if (op->format->cfg_hash == ac->cfg_hash &&
op->format->uri_hash == ac->uri_hash) {
/*
@@ -156,7 +156,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
ac->cfg_hash = op->format->cfg_hash;
ac->uri_hash = op->format->uri_hash;
ac->c = c;
- STAILQ_INSERT_HEAD(&worker->cursorqh, ac, q);
+ TAILQ_INSERT_HEAD(&worker->cursorqh, ac, q);
worker->num_cursors++;
*cursorp = c;
return (0);
@@ -297,7 +297,7 @@ __wt_async_worker(void *arg)
async = conn->async;
worker.num_cursors = 0;
- STAILQ_INIT(&worker.cursorqh);
+ TAILQ_INIT(&worker.cursorqh);
while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) &&
F_ISSET(session, WT_SESSION_SERVER_ASYNC)) {
WT_ERR(__async_op_dequeue(conn, session, &op));
@@ -316,7 +316,7 @@ __wt_async_worker(void *arg)
* the queue.
*/
WT_ORDERED_READ(flush_gen, async->flush_gen);
- if (WT_ATOMIC_ADD4(async->flush_count, 1) ==
+ if (__wt_atomic_add32(&async->flush_count, 1) ==
conn->async_workers) {
/*
* We're last. All workers accounted for so
@@ -346,9 +346,9 @@ err: WT_PANIC_MSG(session, ret, "async worker error");
* Worker thread cleanup, close our cached cursors and free all the
* WT_ASYNC_CURSOR structures.
*/
- ac = STAILQ_FIRST(&worker.cursorqh);
+ ac = TAILQ_FIRST(&worker.cursorqh);
while (ac != NULL) {
- acnext = STAILQ_NEXT(ac, q);
+ acnext = TAILQ_NEXT(ac, q);
WT_TRET(ac->c->close(ac->c));
__wt_free(session, ac);
ac = acnext;
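On the dequeue side (the retry loop at the top of this file's changes), consumers race to advance alloc_tail with compare-and-swap and only the winner owns the item. A hedged sketch of that claim step, with hypothetical names:

    /* Whoever CASes tail from t to t+1 exclusively owns item t+1;
     * losers re-read and try again. */
    for (;;) {
        t = ring->tail;
        if (__sync_bool_compare_and_swap(&ring->tail, t, t + 1))
            break;
    }
    /* This thread now processes slot (t + 1) % ring->qsize. */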
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index d593537446b..018f6a20164 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -86,7 +86,7 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
* __block_first_srch --
* Search the skiplist for the first available slot.
*/
-static inline int
+static inline bool
__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
{
WT_EXT *ext;
@@ -99,11 +99,11 @@ __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
if (ext->size >= size)
break;
if (ext == NULL)
- return (0);
+ return (false);
/* Build a stack for the offset we want. */
__block_off_srch(head, ext->off, stack, 0);
- return (1);
+ return (true);
}
/*
@@ -251,7 +251,7 @@ __block_off_insert(
* Return if any part of a specified range appears on a specified extent
* list.
*/
-static int
+static bool
__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
WT_EXT *before, *after;
@@ -261,10 +261,10 @@ __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
/* If "before" or "after" overlaps, we have a winner. */
if (before != NULL && before->off + before->size > off)
- return (1);
+ return (true);
if (after != NULL && off + size > after->off)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
@@ -283,7 +283,7 @@ __wt_block_misplaced(WT_SESSION_IMPL *session,
* Don't check during the salvage read phase, we might be reading an
* already freed overflow page.
*/
- if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ if (F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
return (0);
/*
diff --git a/src/block/block_open.c b/src/block/block_open.c
index fd00e0c7deb..cfb5b000092 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -158,9 +158,9 @@ __wt_block_configure_first_fit(WT_BLOCK *block, int on)
* as long as any operation wants it.
*/
if (on)
- (void)WT_ATOMIC_ADD4(block->allocfirst, 1);
+ (void)__wt_atomic_add32(&block->allocfirst, 1);
else
- (void)WT_ATOMIC_SUB4(block->allocfirst, 1);
+ (void)__wt_atomic_sub32(&block->allocfirst, 1);
}
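Note that allocfirst is a counter, not a boolean, so nested callers compose: first-fit stays enabled until every enabler has turned it off. The expected bracketing, using the function shown above:

    __wt_block_configure_first_fit(block, 1);	/* enable */
    /* ... work that wants first-fit allocation ... */
    __wt_block_configure_first_fit(block, 0);	/* release */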
/*
@@ -185,7 +185,7 @@ __wt_block_open(WT_SESSION_IMPL *session,
hash = __wt_hash_city64(filename, strlen(filename));
bucket = hash % WT_HASH_ARRAY_SIZE;
__wt_spin_lock(session, &conn->block_lock);
- SLIST_FOREACH(block, &conn->blockhash[bucket], hashl) {
+ TAILQ_FOREACH(block, &conn->blockhash[bucket], hashq) {
if (strcmp(filename, block->name) == 0) {
++block->ref;
*blockp = block;
@@ -398,21 +398,19 @@ err: __wt_scr_free(session, &buf);
void
__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
{
+ WT_UNUSED(session);
+
/*
- * We're looking inside the live system's structure, which normally
- * requires locking: the chances of a corrupted read are probably
- * non-existent, and it's statistics information regardless, but it
- * isn't like this is a common function for an application to call.
+ * Reading from the live system's structure normally requires locking,
+	 * but these are 8B statistics reads, so there's no need.
*/
- __wt_spin_lock(session, &block->live_lock);
- WT_STAT_SET(stats, allocation_size, block->allocsize);
- WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
- WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
- WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
- WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
- WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes);
- WT_STAT_SET(stats, block_size, block->fh->size);
- __wt_spin_unlock(session, &block->live_lock);
+ stats->allocation_size = block->allocsize;
+ stats->block_checkpoint_size = (int64_t)block->live.ckpt_size;
+ stats->block_magic = WT_BLOCK_MAGIC;
+ stats->block_major = WT_BLOCK_MAJOR_VERSION;
+ stats->block_minor = WT_BLOCK_MINOR_VERSION;
+ stats->block_reuse_bytes = (int64_t)block->live.avail.bytes;
+ stats->block_size = block->fh->size;
}
/*
@@ -426,7 +424,7 @@ __wt_block_manager_size(
wt_off_t filesize;
WT_RET(__wt_filesize_name(session, filename, &filesize));
- WT_STAT_SET(stats, block_size, filesize);
+ stats->block_size = filesize;
return (0);
}
diff --git a/src/block/block_read.c b/src/block/block_read.c
index 0d631396b41..9f7c869dd38 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -200,7 +200,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
if (page_cksum == cksum)
return (0);
- if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
__wt_errx(session,
"read checksum error for %" PRIu32 "B block at "
"offset %" PRIuMAX ": calculated block checksum "
@@ -208,7 +208,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
"of %" PRIu32,
size, (uintmax_t)offset, page_cksum, cksum);
} else
- if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
__wt_errx(session,
"read checksum error for %" PRIu32 "B block at "
"offset %" PRIuMAX ": block header checksum "
@@ -218,6 +218,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
/* Panic if a checksum fails during an ordinary read. */
return (block->verify ||
- F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
WT_ERROR : __wt_illegal_value(session, block->name));
}
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index c78a6c39942..641bb8a42f7 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -73,19 +73,19 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
* __wt_block_offset_invalid --
* Return if the block offset is insane.
*/
-int
+bool
__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
{
if (size == 0) /* < minimum page size */
- return (1);
+ return (true);
if (size % block->allocsize != 0) /* not allocation-size units */
- return (1);
+ return (true);
if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */
- return (1);
+ return (true);
/* past end-of-file */
if (offset + (wt_off_t)size > block->fh->size)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 18f8ca54601..79a52dbcaa3 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
} else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
/*
* The page's modification information can change underfoot if
- * the page is being reconciled, lock the page down.
+ * the page is being reconciled, serialize with reconciliation.
*/
- WT_PAGE_LOCK(session, page);
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
- WT_PAGE_UNLOCK(session, page);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
WT_RET(ret);
}
return (0);
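F_CAS_ATOMIC_WAIT/F_CLR_ATOMIC replace the page lock with an atomic flag bit: set the WT_PAGE_RECONCILIATION bit to enter, clear it to leave. A hedged sketch of the set-with-wait half (hypothetical names, not the real macros' implementation):

    /* Spin until we atomically set a flag bit no one else holds. */
    static void
    flag_set_wait(volatile uint32_t *flagsp, uint32_t flag)
    {
        uint32_t old;

        for (;;) {
            old = *flagsp;
            if ((old & flag) == 0 &&
                __sync_bool_compare_and_swap(flagsp, old, old | flag))
                return;
        }
    }
    /* Clearing is a single atomic AND of the complement. */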
@@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_BM *bm;
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_REF *ref;
- int block_manager_begin, evict_reset, skip;
+ int block_manager_begin, skip;
WT_UNUSED(cfg);
- conn = S2C(session);
btree = S2BT(session);
bm = btree->bm;
ref = NULL;
@@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
*/
__wt_spin_lock(session, &btree->flush_lock);
- /*
- * That leaves eviction, we don't want to block eviction. Set a flag
- * so reconciliation knows compaction is running. If reconciliation
- * sees the flag it locks the page it's writing, we acquire the same
- * lock when reading the page's modify information, serializing access.
- * The same page lock blocks work on the page, but compaction is an
- * uncommon, heavy-weight operation. If it's ever a problem, there's
- * no reason we couldn't use an entirely separate lock than the page
- * lock.
- *
- * We also need to ensure we don't race with an on-going reconciliation.
- * After we set the flag, wait for eviction of this file to drain, and
- * then let eviction continue;
- */
- conn->compact_in_memory_pass = 1;
- WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
-
/* Start compaction. */
WT_ERR(bm->compact_start(bm, session));
block_manager_begin = 1;
@@ -172,11 +151,7 @@ err: if (ref != NULL)
if (block_manager_begin)
WT_TRET(bm->compact_end(bm, session));
- /*
- * Unlock will be a release barrier, use it to update the compaction
- * status for reconciliation.
- */
- conn->compact_in_memory_pass = 0;
+ /* Unblock threads writing leaf pages. */
__wt_spin_unlock(session, &btree->flush_lock);
return (ret);
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 0aed5940533..458a1985e28 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -70,7 +70,7 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
* __cursor_valid --
 * Return if the cursor references a valid key/value pair.
*/
-static inline int
+static inline bool
__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
WT_BTREE *btree;
@@ -133,10 +133,10 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
if (cbt->ins != NULL &&
(upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
- return (0);
+ return (false);
if (updp != NULL)
*updp = upd;
- return (1);
+ return (true);
}
/*
@@ -155,7 +155,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* keys, check for retrieval past the end of the page.
*/
if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
- return (0);
+ return (false);
/*
* Updates aren't stored on the page, an update would have
@@ -170,7 +170,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* "slots", check if search returned a valid slot.
*/
if (cbt->slot >= page->pg_var_entries)
- return (0);
+ return (false);
/*
* Updates aren't stored on the page, an update would have
@@ -181,7 +181,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
cip = &page->pg_var_d[cbt->slot];
if ((cell = WT_COL_PTR(page, cip)) == NULL ||
__wt_cell_type(cell) == WT_CELL_DEL)
- return (0);
+ return (false);
break;
case BTREE_ROW:
/*
@@ -189,7 +189,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* key as an on-page object, we're done.
*/
if (cbt->ins != NULL)
- return (0);
+ return (false);
/*
	 * Check if search returned a valid slot (the failure mode is
@@ -198,19 +198,19 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* mirrors the column-store test).
*/
if (cbt->slot >= page->pg_row_entries)
- return (0);
+ return (false);
/* Updates are stored on the page, check for a delete. */
if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
session, page->pg_row_upd[cbt->slot])) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
- return (0);
+ return (false);
if (updp != NULL)
*updp = upd;
}
break;
}
- return (1);
+ return (true);
}
/*
@@ -517,7 +517,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ERR(__cursor_col_search(session, cbt, NULL));
if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = 0;
+ cbt->iface.recno = WT_RECNO_OOB;
/*
* If not overwriting, fail if the key exists. Creating a
@@ -549,8 +549,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
/* Insert doesn't maintain a position across calls, clear resources. */
if (ret == 0)
WT_TRET(__curfile_leave(cbt));
@@ -624,8 +627,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
WT_TRET(__curfile_leave(cbt));
if (ret != 0)
WT_TRET(__cursor_reset(cbt));
@@ -702,8 +708,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
/*
* If the cursor is configured to overwrite and the record is not
* found, that is exactly what we want.
@@ -790,8 +799,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
/*
* If successful, point the cursor at internal copies of the data. We
@@ -899,7 +911,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
* __cursor_equals --
* Return if two cursors reference the same row.
*/
-static inline int
+static inline bool
__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
{
switch (a->btree->type) {
@@ -911,21 +923,21 @@ __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
* one being returned to the application.
*/
if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
- return (1);
+ return (true);
break;
case BTREE_ROW:
if (a->ref != b->ref)
- return (0);
+ return (false);
if (a->ins != NULL || b->ins != NULL) {
if (a->ins == b->ins)
- return (1);
+ return (true);
break;
}
if (a->slot == b->slot)
- return (1);
+ return (true);
break;
}
- return (0);
+ return (false);
}
/*
@@ -993,22 +1005,27 @@ __cursor_truncate(WT_SESSION_IMPL *session,
* instantiated the end cursor, so we know that page is pinned in memory
* and we can proceed without concern.
*/
- do {
- WT_RET(__wt_btcur_remove(start));
- /*
- * Reset ret each time through so that we don't loop forever in
- * the cursor equals case.
- */
- for (ret = 0;;) {
- if (stop != NULL && __cursor_equals(start, stop))
- break;
- if ((ret = __wt_btcur_next(start, 1)) != 0)
- break;
- start->compare = 0; /* Exact match */
- if ((ret = rmfunc(session, start, 1)) != 0)
- break;
- }
- } while (ret == WT_RESTART);
+retry: WT_RET(__wt_btcur_remove(start));
+
+ /*
+ * Reset ret each time through so that we don't loop forever in
+ * the cursor equals case.
+ */
+ for (ret = 0;;) {
+ if (stop != NULL && __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+
+ if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
+ goto retry;
+ }
WT_RET_NOTFOUND_OK(ret);
return (0);
@@ -1042,24 +1059,28 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session,
* other thread of control; in that case, repeat the full search to
* refresh the page's modification information.
*/
- do {
- WT_RET(__wt_btcur_remove(start));
- /*
- * Reset ret each time through so that we don't loop forever in
- * the cursor equals case.
- */
- for (ret = 0;;) {
- if (stop != NULL && __cursor_equals(start, stop))
- break;
- if ((ret = __wt_btcur_next(start, 1)) != 0)
- break;
- start->compare = 0; /* Exact match */
- value = (uint8_t *)start->iface.value.data;
- if (*value != 0 &&
- (ret = rmfunc(session, start, 1)) != 0)
- break;
- }
- } while (ret == WT_RESTART);
+retry: WT_RET(__wt_btcur_remove(start));
+ /*
+ * Reset ret each time through so that we don't loop forever in
+ * the cursor equals case.
+ */
+ for (ret = 0;;) {
+ if (stop != NULL && __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ value = (uint8_t *)start->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+
+ if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
+ goto retry;
+ }
WT_RET_NOTFOUND_OK(ret);
return (0);
@@ -1132,6 +1153,19 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
}
/*
+ * __wt_btcur_init --
+ * Initialize a cursor used for internal purposes.
+ */
+void
+__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ memset(cbt, 0, sizeof(WT_CURSOR_BTREE));
+
+ cbt->iface.session = &session->iface;
+ cbt->btree = S2BT(session);
+}
+
+/*
* __wt_btcur_open --
* Open a btree cursor.
*/
@@ -1147,14 +1181,22 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
* Close a btree cursor.
*/
int
-__wt_btcur_close(WT_CURSOR_BTREE *cbt)
+__wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- ret = __curfile_leave(cbt);
+ /*
+ * The in-memory split and lookaside table code creates low-level btree
+ * cursors to search/modify leaf pages. Those cursors don't hold hazard
+ * pointers, nor are they counted in the session handle's cursor count.
+ * Skip the usual cursor tear-down in that case.
+ */
+ if (!lowlevel)
+ ret = __curfile_leave(cbt);
+
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
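With __wt_btcur_init() and the new lowlevel argument to __wt_btcur_close(), internal code gets a stack-allocated cursor lifecycle with no hazard pointer or session cursor accounting. A hedged sketch of the intended pattern (the lookaside code later in this patch uses the init/open half; the close call is assumed):

    WT_CURSOR_BTREE cbt;

    __wt_btcur_init(session, &cbt);
    __wt_btcur_open(&cbt);
    /* ... __wt_row_search/__wt_row_modify on leaf pages ... */
    WT_TRET(__wt_btcur_close(&cbt, 1));	/* lowlevel: skip tear-down */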
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 77d80cdb3a2..38ef407e160 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -340,6 +340,8 @@ __wt_debug_disk(
__dmsg(ds, ", empty-all");
if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
__dmsg(ds, ", empty-none");
+ if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
+ __dmsg(ds, ", LAS-update");
__dmsg(ds, ", generation %" PRIu64 "\n", dsk->write_gen);
@@ -643,12 +645,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", disk-mapped");
if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
__dmsg(ds, ", evict-lru");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
- __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION))
+ __dmsg(ds, ", reconciliation");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED))
- __dmsg(ds, ", split-locked");
if (mod != NULL)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 8cca6328f21..c3c7afa1450 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -70,15 +70,15 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
/* If we have a clean page in memory, attempt to evict it. */
if (ref->state == WT_REF_MEM &&
- WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
if (__wt_page_is_modified(ref->page)) {
WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
}
- (void)WT_ATOMIC_ADD4(S2BT(session)->evict_busy, 1);
- ret = __wt_evict_page(session, ref);
- (void)WT_ATOMIC_SUB4(S2BT(session)->evict_busy, 1);
+ (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
+ ret = __wt_evict(session, ref, 0);
+ (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
WT_RET_BUSY_OK(ret);
}
@@ -93,7 +93,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
* unclear optimizing for overlapping range deletes is worth the effort.
*/
if (ref->state != WT_REF_DISK ||
- !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED))
+ !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
return (0);
/*
@@ -176,8 +176,8 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* If the page is still "deleted", it's as we left it,
* reset the state.
*/
- if (WT_ATOMIC_CAS4(
- ref->state, WT_REF_DELETED, WT_REF_DISK))
+ if (__wt_atomic_casv32(
+ &ref->state, WT_REF_DELETED, WT_REF_DISK))
return;
break;
case WT_REF_LOCKED:
@@ -216,10 +216,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* __wt_delete_page_skip --
* If iterating a cursor, skip deleted pages that are visible to us.
*/
-int
+bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
- int skip;
+ bool skip;
/*
* Deleted pages come from two sources: either it's a fast-delete as
@@ -240,13 +240,13 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* the structure, just to be safe.
*/
if (ref->page_del == NULL)
- return (1);
+ return (true);
- if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
- return (0);
+ if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ return (false);
- skip = (ref->page_del == NULL ||
- __wt_txn_visible(session, ref->page_del->txnid));
+ skip = ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid);
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (skip);
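The skip test above briefly moves the ref to WT_REF_LOCKED so ref->page_del can't be freed under the visibility check, then publishes WT_REF_DELETED back. The lock/inspect/publish shape in a hedged sketch (txn_visible is a stand-in for __wt_txn_visible, and the real code publishes with a write barrier):

    if (!__sync_bool_compare_and_swap(
        &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        return (false);		/* raced with another thread: don't skip */
    skip = ref->page_del == NULL ||
        txn_visible(session, ref->page_del->txnid);
    ref->state = WT_REF_DELETED;	/* publish: release the ref */
    return (skip);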
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index a05c6217338..73e6affccd3 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -15,7 +15,6 @@ static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
-static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
/*
* __wt_ref_out --
@@ -56,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION));
#ifdef HAVE_DIAGNOSTIC
{
@@ -160,8 +159,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- __wt_free(session, multi->skip);
- __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->supd_dsk);
__wt_free(session, multi->addr.addr);
}
__wt_free(session, mod->mod_multi);
@@ -235,10 +234,7 @@ __wt_free_ref(
* it clean explicitly.)
*/
if (free_pages && ref->page != NULL) {
- if (ref->page->modify != NULL) {
- ref->page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, ref->page);
- }
+ __wt_page_modify_clear(session, ref->page);
__wt_page_out(session, &ref->page);
}
@@ -373,7 +369,7 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
WT_INSERT *next;
for (; ins != NULL; ins = next) {
- __free_update_list(session, ins->upd);
+ __wt_free_update_list(session, ins->upd);
next = WT_SKIP_NEXT(ins);
__wt_free(session, ins);
}
@@ -395,29 +391,23 @@ __free_update(
*/
for (updp = update_head; entries > 0; --entries, ++updp)
if (*updp != NULL)
- __free_update_list(session, *updp);
+ __wt_free_update_list(session, *updp);
/* Free the update array. */
__wt_free(session, update_head);
}
/*
- * __free_update_list --
+ * __wt_free_update_list --
* Walk a WT_UPDATE forward-linked list and free the per-thread combination
* of a WT_UPDATE structure and its associated data.
*/
-static void
-__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+void
+__wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_UPDATE *next;
for (; upd != NULL; upd = next) {
- /* Everything we free should be visible to everyone. */
- WT_ASSERT(session,
- F_ISSET(session, WT_SESSION_DISCARD_FORCE) ||
- upd->txnid == WT_TXN_ABORTED ||
- __wt_txn_visible_all(session, upd->txnid));
-
next = upd->next;
__wt_free(session, upd);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index c1a8ab61054..6a4243a0fc7 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -255,27 +255,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
/* Page sizes */
WT_RET(__btree_page_sizes(session));
- /*
- * Set special flags for the metadata file.
- * Eviction; the metadata file is never evicted.
- * Logging; the metadata file is always logged if possible.
- */
- if (WT_IS_METADATA(btree->dhandle)) {
+ WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+ if (cval.val)
F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
+ else
+ F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ if (cval.val)
F_CLR(btree, WT_BTREE_NO_LOGGING);
- } else {
- WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
- if (cval.val)
- F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
- else
- F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
-
- WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
- if (cval.val)
- F_CLR(btree, WT_BTREE_NO_LOGGING);
- else
- F_SET(btree, WT_BTREE_NO_LOGGING);
- }
+ else
+ F_SET(btree, WT_BTREE_NO_LOGGING);
/* Checksums */
WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
@@ -352,8 +342,6 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
session, &btree->ovfl_lock, "btree overflow lock"));
WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
- __wt_stat_init_dsrc_stats(&btree->dhandle->stats);
-
btree->write_gen = ckpt->write_gen; /* Write generation */
btree->modified = 0; /* Clean */
@@ -372,7 +360,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
root_ref->page = root;
root_ref->state = WT_REF_MEM;
- root_ref->key.recno = is_recno ? 1 : 0;
+ root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB;
root->pg_intl_parent_ref = root_ref;
}
@@ -385,12 +373,15 @@ int
__wt_btree_tree_open(
WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
+ WT_BM *bm;
WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
WT_DECL_RET;
WT_ITEM dsk;
WT_PAGE *page;
btree = S2BT(session);
+ bm = btree->bm;
/*
* A buffer into which we read a root page; don't use a scratch buffer,
@@ -399,12 +390,43 @@ __wt_btree_tree_open(
WT_CLEAR(dsk);
/*
- * Read the page, then build the in-memory version of the page. Clear
- * any local reference to an allocated copy of the disk image on return,
- * the page steals it.
+	 * Read and verify the page (verify to catch encrypted objects we can't
+	 * decrypt: the read itself succeeds, decryption fails, and we want to
+	 * fail gracefully).
+ *
+ * Create a printable version of the address to pass to verify.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
+
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ if ((ret = __wt_bt_read(session, &dsk, addr, addr_size)) == 0)
+ ret = __wt_verify_dsk(session, tmp->data, &dsk);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ if (ret != 0)
+ __wt_err(session, ret,
+ "unable to read root page from %s", session->dhandle->name);
+ /*
+ * Failure to open metadata means that the database is unavailable.
+ * Try to provide a helpful failure message.
+ */
+ if (ret != 0 && WT_IS_METADATA(session->dhandle)) {
+ __wt_errx(session,
+ "WiredTiger has failed to open its metadata");
+ __wt_errx(session, "This may be due to the database"
+ " files being encrypted, being from an older"
+ " version or due to corruption on disk");
+ __wt_errx(session, "You should confirm that you have"
+ " opened the database with the correct options including"
+ " all encryption and compression options");
+ }
+ WT_ERR(ret);
+
+ /*
+ * Build the in-memory version of the page. Clear our local reference to
+ * the allocated copy of the disk image on return, the in-memory object
+ * steals it.
*/
- WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size));
- WT_ERR(__wt_verify_dsk(session, (const char *)addr, &dsk));
WT_ERR(__wt_page_inmem(session, NULL, dsk.data, dsk.memsize,
WT_DATA_IN_ITEM(&dsk) ?
WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
@@ -414,6 +436,8 @@ __wt_btree_tree_open(
__wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
err: __wt_buf_free(session, &dsk);
+ __wt_scr_free(session, &tmp);
+
return (ret);
}
@@ -663,9 +687,11 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage =
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
- cache_size = S2C(session)->cache_size;
- if (cache_size > 0)
- btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4);
+ if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) {
+ if ((cache_size = S2C(session)->cache_size) > 0)
+ btree->maxmempage =
+ WT_MIN(btree->maxmempage, cache_size / 4);
+ }
/*
* Get the split percentage (reconciliation splits pages into smaller
diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c
index a8bbf8a0266..836c1540c5f 100644
--- a/src/btree/bt_io.c
+++ b/src/btree/bt_io.c
@@ -24,10 +24,12 @@ __wt_bt_read(WT_SESSION_IMPL *session,
WT_ENCRYPTOR *encryptor;
WT_ITEM *ip;
const WT_PAGE_HEADER *dsk;
+ const char *fail_msg;
size_t result_len;
btree = S2BT(session);
bm = btree->bm;
+ fail_msg = NULL; /* -Wuninitialized */
/*
* If anticipating a compressed or encrypted block, read into a scratch
@@ -52,40 +54,36 @@ __wt_bt_read(WT_SESSION_IMPL *session,
if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
if (btree->kencryptor == NULL ||
(encryptor = btree->kencryptor->encryptor) == NULL ||
- encryptor->decrypt == NULL)
- WT_ERR_MSG(session, WT_ERROR,
- "read encrypted block where no decryption engine "
- "configured");
+ encryptor->decrypt == NULL) {
+ fail_msg =
+ "encrypted block in file for which no encryption "
+ "configured";
+ goto corrupt;
+ }
WT_ERR(__wt_scr_alloc(session, 0, &etmp));
- ret = __wt_decrypt(session,
- encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp);
- /*
- * It may be file corruption, which is really, really bad, or
- * may be a mismatch of encryption configuration, for example,
- * an incorrect secretkey.
- */
- if (ret != 0)
- WT_ERR(F_ISSET(btree, WT_BTREE_VERIFY) ||
- F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
- WT_ERROR :
- __wt_illegal_value(session, btree->dhandle->name));
+ if ((ret = __wt_decrypt(session,
+ encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
+ fail_msg = "block decryption failed";
+ goto corrupt;
+ }
ip = etmp;
dsk = ip->data;
- } else if (btree->kencryptor != NULL &&
- !F_ISSET(btree, WT_BTREE_VERIFY) &&
- !F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
- WT_ERR_MSG(session, WT_ERROR,
- "encryption configured, and existing file is not "
- "encrypted");
+ } else if (btree->kencryptor != NULL) {
+ fail_msg =
+ "unencrypted block in file for which encryption configured";
+ goto corrupt;
+ }
if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
if (btree->compressor == NULL ||
- btree->compressor->decompress == NULL)
- WT_ERR_MSG(session, WT_ERROR,
- "read compressed block where no compression engine "
- "configured");
+ btree->compressor->decompress == NULL) {
+ fail_msg =
+ "compressed block in file for which no compression "
+ "configured";
+ goto corrupt;
+ }
/*
* Size the buffer based on the in-memory bytes we're expecting
@@ -118,11 +116,10 @@ __wt_bt_read(WT_SESSION_IMPL *session,
* it's OK, otherwise it's really, really bad.
*/
if (ret != 0 ||
- result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
- WT_ERR(F_ISSET(btree, WT_BTREE_VERIFY) ||
- F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
- WT_ERROR :
- __wt_illegal_value(session, btree->dhandle->name));
+ result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
+ fail_msg = "block decryption failed";
+ goto corrupt;
+ }
} else
/*
* If we uncompressed above, the page is in the correct buffer.
@@ -139,7 +136,7 @@ __wt_bt_read(WT_SESSION_IMPL *session,
if (tmp == NULL)
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
- WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
+ WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
}
WT_STAT_FAST_CONN_INCR(session, cache_read);
@@ -149,6 +146,16 @@ __wt_bt_read(WT_SESSION_IMPL *session,
WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);
+ if (0) {
+corrupt: if (ret == 0)
+ ret = WT_ERROR;
+ if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
+ !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
+ __wt_err(session, ret, "%s", fail_msg);
+ ret = __wt_illegal_value(session, btree->dhandle->name);
+ }
+ }
+
err: __wt_scr_free(session, &tmp);
__wt_scr_free(session, &etmp);
return (ret);
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
index d8456c5b61f..7104e702418 100644
--- a/src/btree/bt_ovfl.c
+++ b/src/btree/bt_ovfl.c
@@ -79,7 +79,7 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
* __ovfl_cache_col_visible --
* column-store: check for a globally visible update.
*/
-static int
+static bool
__ovfl_cache_col_visible(
WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
{
@@ -99,15 +99,15 @@ __ovfl_cache_col_visible(
if (__wt_cell_rle(unpack) == 1 &&
upd != NULL && /* Sanity: upd should always be set. */
__wt_txn_visible_all(session, upd->txnid))
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
* __ovfl_cache_row_visible --
* row-store: check for a globally visible update.
*/
-static int
+static bool
__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
{
WT_UPDATE *upd;
@@ -115,9 +115,9 @@ __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
/* Check to see if there's a globally visible update. */
for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next)
if (__wt_txn_visible_all(session, upd->txnid))
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 86edd992b28..ba218fc332c 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -17,214 +17,6 @@ static int __inmem_row_leaf_entries(
WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
/*
- * __evict_force_check --
- * Check if a page matches the criteria for forced eviction.
- */
-static int
-__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_BTREE *btree;
-
- btree = S2BT(session);
-
- /* Pages are usually small enough, check that first. */
- if (page->memory_footprint < btree->maxmempage)
- return (0);
-
- /* Leaf pages only. */
- if (WT_PAGE_IS_INTERNAL(page))
- return (0);
-
- /*
- * It's hard to imagine a page with a huge memory footprint that has
- * never been modified, but check to be sure.
- */
- if (page->modify == NULL)
- return (0);
-
- /* Trigger eviction on the next page release. */
- __wt_page_evict_soon(page);
-
- /* Bump the oldest ID, we're about to do some visibility checks. */
- __wt_txn_update_oldest(session, 0);
-
- /* If eviction cannot succeed, don't try. */
- return (__wt_page_can_evict(session, page, 1, NULL));
-}
-
-/*
- * __wt_page_in_func --
- * Acquire a hazard pointer to a page; if the page is not in-memory,
- * read it from the disk and build an in-memory version.
- */
-int
-__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
-#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
-#endif
- )
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_PAGE *page;
- u_int sleep_cnt, wait_cnt;
- int busy, cache_work, force_attempts, oldgen;
-
- btree = S2BT(session);
-
- for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
- switch (ref->state) {
- case WT_REF_DISK:
- case WT_REF_DELETED:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
-
- /*
- * The page isn't in memory, attempt to read it.
- * Make sure there is space in the cache.
- */
- WT_RET(__wt_cache_eviction_check(session, 1, NULL));
- WT_RET(__wt_cache_read(session, ref));
- oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
- F_ISSET(session, WT_SESSION_NO_CACHE);
- continue;
- case WT_REF_READING:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
- if (LF_ISSET(WT_READ_NO_WAIT))
- return (WT_NOTFOUND);
-
- /* Waiting on another thread's read, stall. */
- WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
- goto stall;
- case WT_REF_LOCKED:
- if (LF_ISSET(WT_READ_NO_WAIT))
- return (WT_NOTFOUND);
-
- /* Waiting on eviction, stall. */
- WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
- goto stall;
- case WT_REF_SPLIT:
- return (WT_RESTART);
- case WT_REF_MEM:
- /*
- * The page is in memory.
- *
- * Get a hazard pointer if one is required. We cannot
- * be evicting if no hazard pointer is required, we're
- * done.
- */
- if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
- goto skip_evict;
-
- /*
- * The expected reason we can't get a hazard pointer is
- * because the page is being evicted, yield, try again.
- */
-#ifdef HAVE_DIAGNOSTIC
- WT_RET(
- __wt_hazard_set(session, ref, &busy, file, line));
-#else
- WT_RET(__wt_hazard_set(session, ref, &busy));
-#endif
- if (busy) {
- WT_STAT_FAST_CONN_INCR(
- session, page_busy_blocked);
- break;
- }
-
- /*
- * If eviction is configured for this file, check to see
- * if the page qualifies for forced eviction and update
- * the page's generation number. If eviction isn't being
- * done on this file, we're done.
- */
- if (LF_ISSET(WT_READ_NO_EVICT) ||
- F_ISSET(btree, WT_BTREE_NO_EVICTION))
- goto skip_evict;
-
- /*
- * Forcibly evict pages that are too big.
- */
- page = ref->page;
- if (force_attempts < 10 &&
- __evict_force_check(session, page)) {
- ++force_attempts;
- ret = __wt_page_release_evict(session, ref);
- /* If forced eviction fails, stall. */
- if (ret == EBUSY) {
- ret = 0;
- WT_STAT_FAST_CONN_INCR(session,
- page_forcible_evict_blocked);
- goto stall;
- }
- WT_RET(ret);
-
- /*
- * The result of a successful forced eviction
- * is a page-state transition (potentially to
- * an in-memory page we can use, or a restart
- * return for our caller), continue the outer
- * page-acquisition loop.
- */
- continue;
- }
-
- /*
- * If we read the page and we are configured to not
- * trash the cache, set the oldest read generation so
- * the page is forcibly evicted as soon as possible.
- *
- * Otherwise, update the page's read generation.
- */
- if (oldgen && page->read_gen == WT_READGEN_NOTSET)
- __wt_page_evict_soon(page);
- else if (!LF_ISSET(WT_READ_NO_GEN) &&
- page->read_gen != WT_READGEN_OLDEST &&
- page->read_gen < __wt_cache_read_gen(session))
- page->read_gen =
- __wt_cache_read_gen_bump(session);
-skip_evict:
- /*
- * Check if we need an autocommit transaction.
- * Starting a transaction can trigger eviction, so skip
- * it if eviction isn't permitted.
- */
- return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
- __wt_txn_autocommit_check(session));
- WT_ILLEGAL_VALUE(session);
- }
-
- /*
- * We failed to get the page -- yield before retrying, and if
- * we've yielded enough times, start sleeping so we don't burn
- * CPU to no purpose.
- */
- if (++wait_cnt < 1000)
- __wt_yield();
- else {
- if (0) {
-stall: wait_cnt += 1000;
- }
-
- /*
- * If stalling, check if the cache needs help. If we do
- * work for the cache, substitute that for a sleep.
- */
- WT_RET(
- __wt_cache_eviction_check(session, 1, &cache_work));
- if (!cache_work) {
- sleep_cnt = WT_MIN(wait_cnt, 10000);
- wait_cnt *= 2;
- WT_STAT_FAST_CONN_INCRV(
- session, page_sleep, sleep_cnt);
- __wt_sleep(0, sleep_cnt);
- }
- }
- }
-}
-
-/*
* __wt_page_alloc --
* Create or read a page into the cache.
*/
@@ -326,8 +118,8 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) {
/* Increment the cache statistics. */
__wt_cache_page_inmem_incr(session, page, size);
- (void)WT_ATOMIC_ADD8(cache->bytes_read, size);
- (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);
+ (void)__wt_atomic_add64(&cache->bytes_read, size);
+ (void)__wt_atomic_add64(&cache->pages_inmem, 1);
*pagep = page;
return (0);
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index e27f7c3398c..d26b44e04c0 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -9,19 +9,328 @@
#include "wt_internal.h"
/*
- * __wt_cache_read --
- * Read a page from the file.
+ * __wt_las_remove_block --
+ * Remove all records matching a key prefix from the lookaside store.
*/
int
-__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_las_remove_block(WT_SESSION_IMPL *session,
+ WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size)
{
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_RET;
+ uint64_t las_counter, las_txnid;
+ uint32_t las_id;
+ int exact;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+ /*
+ * Search for the block's unique prefix and step through all matching
+ * records, removing them.
+ */
+ las_addr->data = addr;
+ las_addr->size = addr_size;
+ las_key->size = 0;
+ cursor->set_key(
+ cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page.
+ */
+ if (las_id != btree_id ||
+ las_addr->size != addr_size ||
+ memcmp(las_addr->data, addr, addr_size) != 0)
+ break;
+
+ /*
+ * The cursor was opened with overwrite=true: remove won't return
+ * WT_NOTFOUND if another thread removes the record first, and the
+ * cursor remains positioned in that case.
+ */
+ WT_ERR(cursor->remove(cursor));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+ return (ret);
+}
+
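The search pattern above (position with search_near, step forward, stop when the key prefix no longer matches) is the general way to scan a key range with a WT_CURSOR. Stripped to its skeleton, with matches_prefix as a hypothetical helper:

    cursor->set_key(cursor, prefix);
    if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        if (!matches_prefix(cursor))
            break;
        /* ... operate on the positioned record ... */
    }
    if (ret == WT_NOTFOUND)	/* walked off the end: not an error */
        ret = 0;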
+/*
+ * __col_instantiate --
+ * Update a column-store page entry based on a lookaside table update list.
+ */
+static int
+__col_instantiate(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ /* Search the page and add updates. */
+ WT_RET(__wt_col_search(session, recno, ref, cbt));
+ WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0));
+ return (0);
+}
+
+/*
+ * __row_instantiate --
+ * Update a row-store page entry based on a lookaside table update list.
+ */
+static int
+__row_instantiate(WT_SESSION_IMPL *session,
+ WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ /* Search the page and add updates. */
+ WT_RET(__wt_row_search(session, key, ref, cbt, 1));
+ WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0));
+ return (0);
+}
+
+/*
+ * __las_page_instantiate --
+ * Instantiate lookaside update records in a recently read page.
+ */
+static int
+__las_page_instantiate(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
+{
+ WT_CURSOR *cursor;
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_ITEM(current_key);
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_ITEM(las_value);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *first_upd, *last_upd, *upd;
+ size_t incr, total_incr;
+ uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
+ uint32_t las_id, upd_size, session_flags;
+ int exact;
+ const uint8_t *p;
+
+ cursor = NULL;
+ page = ref->page;
+ first_upd = last_upd = upd = NULL;
+ total_incr = 0;
+ current_recno = recno = WT_RECNO_OOB;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ __wt_btcur_init(session, &cbt);
+ __wt_btcur_open(&cbt);
+
+ WT_ERR(__wt_scr_alloc(session, 0, &current_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_value));
+
+ /* Open a lookaside table cursor. */
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /*
+ * The lookaside records are in key and update order, that is, there
+ * will be a set of in-order updates for a key, then another set of
+ * in-order updates for a subsequent key. We process all of the updates
+ * for a key and then insert those updates into the page, then all the
+ * updates for the next key, and so on.
+ *
+ * Search for the block's unique prefix, stepping through any matching
+ * records.
+ */
+ las_addr->data = addr;
+ las_addr->size = addr_size;
+ las_key->size = 0;
+ cursor->set_key(
+ cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page.
+ */
+ if (las_id != read_id ||
+ las_addr->size != addr_size ||
+ memcmp(las_addr->data, addr, addr_size) != 0)
+ break;
+
+ /*
+ * If the on-page value has become globally visible, this record
+ * is no longer needed.
+ */
+ if (__wt_txn_visible_all(session, las_txnid))
+ continue;
+
+ /* Allocate the WT_UPDATE structure. */
+ WT_ERR(cursor->get_value(
+ cursor, &upd_txnid, &upd_size, las_value));
+ WT_ERR(__wt_update_alloc(session,
+ (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
+ &upd, &incr));
+ total_incr += incr;
+ upd->txnid = upd_txnid;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = las_key->data;
+ WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
+ if (current_recno == recno)
+ break;
+ WT_ASSERT(session, current_recno < recno);
+
+ if (first_upd != NULL) {
+ WT_ERR(__col_instantiate(session,
+ current_recno, ref, &cbt, first_upd));
+ first_upd = NULL;
+ }
+ current_recno = recno;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (current_key->size == las_key->size &&
+ memcmp(current_key->data,
+ las_key->data, las_key->size) == 0)
+ break;
+
+ if (first_upd != NULL) {
+ WT_ERR(__row_instantiate(session,
+ current_key, ref, &cbt, first_upd));
+ first_upd = NULL;
+ }
+ WT_ERR(__wt_buf_set(session,
+ current_key, las_key->data, las_key->size));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Append the latest update to the list. */
+ if (first_upd == NULL)
+ first_upd = last_upd = upd;
+ else {
+ last_upd->next = upd;
+ last_upd = upd;
+ }
+ upd = NULL;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Insert the last set of updates, if any. */
+ if (first_upd != NULL)
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__col_instantiate(session,
+ current_recno, ref, &cbt, first_upd));
+ first_upd = NULL;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__row_instantiate(session,
+ current_key, ref, &cbt, first_upd));
+ first_upd = NULL;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Discard the cursor. */
+ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ if (total_incr != 0) {
+ __wt_cache_page_inmem_incr(session, page, total_incr);
+
+ /*
+	 * We've modified/dirtied the page, but that isn't necessary: a
+	 * clean page is easier to evict. We leave the
+ * lookaside table updates in place, so if we evict this page
+ * without dirtying it, any future instantiation of it will find
+ * the records it needs. If the page is dirtied before eviction,
+ * then we'll write any needed lookaside table records for the
+ * new location of the page.
+ */
+ __wt_page_modify_clear(session, page);
+ }
+
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+ WT_TRET(__wt_btcur_close(&cbt, 1));
+
+ /*
+	 * On error, upd points to a single unlinked WT_UPDATE structure;
+	 * first_upd points to a list.
+ */
+ if (upd != NULL)
+ __wt_free(session, upd);
+ if (first_upd != NULL)
+ __wt_free_update_list(session, first_upd);
+
+ __wt_scr_free(session, &current_key);
+ __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+ __wt_scr_free(session, &las_value);
+
+ return (ret);
+}
+
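The instantiation loop above follows a standard group-by-key pattern over
sorted records: accumulate updates while the key repeats, flush the
accumulated list when the key changes, and flush once more after the loop
for the final key. A standalone sketch of just that control flow, with
hypothetical types rather than WiredTiger structures:

#include <stdio.h>
#include <string.h>

struct rec {
	const char *key;
	int value;
};

/* Stand-in for instantiating one key's accumulated update list. */
static void
apply(const char *key, const int *vals, int n)
{
	printf("%s: %d update(s), newest %d\n", key, n, vals[n - 1]);
}

int
main(void)
{
	static const struct rec recs[] = {	/* Sorted by key. */
		{ "a", 1 }, { "a", 2 }, { "b", 7 }, { "c", 3 }, { "c", 4 }
	};
	const char *current;
	int vals[8], i, n;

	current = NULL;
	n = 0;
	for (i = 0; i < (int)(sizeof(recs) / sizeof(recs[0])); ++i) {
		/* Key changed: flush the accumulated list. */
		if (current != NULL && strcmp(current, recs[i].key) != 0) {
			apply(current, vals, n);
			n = 0;
		}
		current = recs[i].key;
		vals[n++] = recs[i].value;
	}
	if (n != 0)			/* Flush the final key. */
		apply(current, vals, n);
	return (0);
}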
+/*
+ * __evict_force_check --
+ * Check if a page matches the criteria for forced eviction.
+ */
+static int
+__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->maxmempage)
+ return (0);
+
+ /* Leaf pages only. */
+ if (WT_PAGE_IS_INTERNAL(page))
+ return (0);
+
+ /*
+ * It's hard to imagine a page with a huge memory footprint that has
+ * never been modified, but check to be sure.
+ */
+ if (page->modify == NULL)
+ return (0);
+
+ /* Trigger eviction on the next page release. */
+ __wt_page_evict_soon(page);
+
+ /* Bump the oldest ID, we're about to do some visibility checks. */
+ __wt_txn_update_oldest(session, 0);
+
+ /* If eviction cannot succeed, don't try. */
+ return (__wt_page_can_evict(session, page, 1, NULL));
+}
+
+/*
+ * __page_read --
+ * Read a page from the file.
+ */
+static int
+__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ const WT_PAGE_HEADER *dsk;
+ WT_BTREE *btree;
WT_DECL_RET;
WT_ITEM tmp;
WT_PAGE *page;
- WT_PAGE_STATE previous_state;
size_t addr_size;
+ uint32_t previous_state;
const uint8_t *addr;
+ btree = S2BT(session);
page = NULL;
/*
@@ -35,9 +344,9 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
* WT_REF_LOCKED, for deleted pages. If successful, we've won the
* race, read the page.
*/
- if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
+ if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
previous_state = WT_REF_DISK;
- else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
previous_state = WT_REF_DELETED;
else
return (0);
@@ -45,8 +354,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* Get the address: if there is no address, the page was deleted, but a
* subsequent search or insert is forcing re-creation of the name space.
- * Otherwise, there's an address, read the backing disk page and build
- * an in-memory version of the page.
*/
WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
if (addr == NULL) {
@@ -54,27 +361,51 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(__wt_btree_new_leaf_page(session, &page));
ref->page = page;
- } else {
- /*
- * Read the page, then build the in-memory version of the page.
- * Clear any local reference to an allocated copy of the disk
- * image on return, the page steals it.
- */
- WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
- WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
- WT_DATA_IN_ITEM(&tmp) ?
- WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
- tmp.mem = NULL;
-
- /* If the page was deleted, instantiate that information. */
- if (previous_state == WT_REF_DELETED)
- WT_ERR(__wt_delete_page_instantiate(session, ref));
+ goto done;
}
- WT_ERR(__wt_verbose(session, WT_VERB_READ,
- "page %p: %s", page, __wt_page_type_string(page->type)));
+ /*
+ * There's an address, read or map the backing disk page and build an
+ * in-memory version of the page.
+ */
+ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /*
+ * Clear the local reference to an allocated copy of the disk image on
+ * return; the page steals it, errors in this code should not free it.
+ */
+ tmp.mem = NULL;
- WT_PUBLISH(ref->state, WT_REF_MEM);
+ /*
+ * If reading for a checkpoint, there's no additional work to do, the
+ * page on disk is correct as written.
+ */
+ if (session->dhandle->checkpoint != NULL)
+ goto done;
+
+ /* If the page was deleted, instantiate that information. */
+ if (previous_state == WT_REF_DELETED)
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+
+ /*
+ * Instantiate updates from the database's lookaside table. The page
+ * flag was set when the page was written, potentially a long time ago.
+ * We only care if the lookaside table is currently active, check that
+ * before doing any work.
+ */
+ dsk = tmp.data;
+ if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
+ WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
+ WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);
+
+ WT_ERR(__las_page_instantiate(
+ session, ref, btree->id, addr, addr_size));
+ }
+
+done: WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
err: /*
@@ -90,3 +421,183 @@ err: /*
return (ret);
}
+
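The two compare-and-swap attempts at the top of __page_read implement a
claim-by-CAS state machine: exactly one thread wins the transition out of
WT_REF_DISK (or WT_REF_DELETED) and performs the read, and everyone else
backs off. A minimal sketch of the idiom in C11 atomics; the state values
are illustrative, not WiredTiger's:

#include <stdatomic.h>

enum { REF_DISK, REF_READING, REF_MEM };

static int
claim_read(_Atomic int *state)
{
	int expected = REF_DISK;

	/* True only for the single winning thread. */
	return (atomic_compare_exchange_strong(state, &expected, REF_READING));
}

static void
publish(_Atomic int *state)
{
	/* Release store: the page contents must be visible first. */
	atomic_store_explicit(state, REF_MEM, memory_order_release);
}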
+/*
+ * __wt_page_in_func --
+ * Acquire a hazard pointer to a page; if the page is not in-memory,
+ * read it from the disk and build an in-memory version.
+ */
+int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ u_int sleep_cnt, wait_cnt;
+ int busy, cache_work, force_attempts, oldgen, stalled;
+
+ btree = S2BT(session);
+ stalled = 0;
+
+ for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+
+ /*
+ * The page isn't in memory, read it. If this thread is
+ * allowed to do eviction work, check for space in the
+ * cache.
+ */
+ if (!LF_ISSET(WT_READ_NO_EVICT))
+ WT_RET(__wt_cache_eviction_check(
+ session, 1, NULL));
+ WT_RET(__page_read(session, ref));
+ oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+ F_ISSET(session, WT_SESSION_NO_CACHE);
+ continue;
+ case WT_REF_READING:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+
+ /* Waiting on another thread's read, stall. */
+ WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
+ stalled = 1;
+ break;
+ case WT_REF_LOCKED:
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+
+ /* Waiting on eviction, stall. */
+ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
+ stalled = 1;
+ break;
+ case WT_REF_SPLIT:
+ return (WT_RESTART);
+ case WT_REF_MEM:
+ /*
+ * The page is in memory.
+ *
+			 * Get a hazard pointer if one is required. If no
+			 * hazard pointer is required the page can't be
+			 * evicted from under us, so we're done.
+ */
+ if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
+ goto skip_evict;
+
+ /*
+ * The expected reason we can't get a hazard pointer is
+ * because the page is being evicted, yield, try again.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, ref, &busy, file, line));
+#else
+ WT_RET(__wt_hazard_set(session, ref, &busy));
+#endif
+ if (busy) {
+ WT_STAT_FAST_CONN_INCR(
+ session, page_busy_blocked);
+ break;
+ }
+
+ /*
+ * If eviction is configured for this file, check to see
+ * if the page qualifies for forced eviction and update
+ * the page's generation number. If eviction isn't being
+ * done on this file, we're done.
+ */
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION) ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ goto skip_evict;
+
+ /*
+ * Forcibly evict pages that are too big.
+ */
+ page = ref->page;
+ if (force_attempts < 10 &&
+ __evict_force_check(session, page)) {
+ ++force_attempts;
+ ret = __wt_page_release_evict(session, ref);
+ /* If forced eviction fails, stall. */
+ if (ret == EBUSY) {
+ ret = 0;
+ WT_STAT_FAST_CONN_INCR(session,
+ page_forcible_evict_blocked);
+ stalled = 1;
+ break;
+ }
+ WT_RET(ret);
+
+ /*
+ * The result of a successful forced eviction
+ * is a page-state transition (potentially to
+ * an in-memory page we can use, or a restart
+ * return for our caller), continue the outer
+ * page-acquisition loop.
+ */
+ continue;
+ }
+
+ /*
+ * If we read the page and we are configured to not
+ * trash the cache, set the oldest read generation so
+ * the page is forcibly evicted as soon as possible.
+ *
+ * Otherwise, update the page's read generation.
+ */
+ if (oldgen && page->read_gen == WT_READGEN_NOTSET)
+ __wt_page_evict_soon(page);
+ else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen != WT_READGEN_OLDEST &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_bump(session);
+skip_evict:
+ /*
+ * Check if we need an autocommit transaction.
+ * Starting a transaction can trigger eviction, so skip
+ * it if eviction isn't permitted.
+ */
+ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
+ __wt_txn_autocommit_check(session));
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We failed to get the page -- yield before retrying, and if
+ * we've yielded enough times, start sleeping so we don't burn
+ * CPU to no purpose.
+ */
+ if (stalled)
+ wait_cnt += 1000;
+ else if (++wait_cnt < 1000) {
+ __wt_yield();
+ continue;
+ }
+
+ /*
+ * If stalling and this thread is allowed to do eviction work,
+ * check if the cache needs help. If we do work for the cache,
+ * substitute that for a sleep.
+ */
+ if (!LF_ISSET(WT_READ_NO_EVICT)) {
+ WT_RET(
+ __wt_cache_eviction_check(session, 1, &cache_work));
+ if (cache_work)
+ continue;
+ }
+ sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
+ WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
+ __wt_sleep(0, sleep_cnt);
+ }
+}
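The tail of the loop above is a classic two-phase backoff: yield the
processor while waits are short, then sleep with a growing, capped
interval once the thread is known to be stalled. A simplified sketch,
with thresholds mirroring the constants in the loop but otherwise
arbitrary:

#include <sched.h>
#include <unistd.h>

static void
backoff(unsigned *wait_cnt, unsigned *sleep_usecs)
{
	/* Phase one: spin with yields while waits are short. */
	if (++*wait_cnt < 1000) {
		(void)sched_yield();
		return;
	}

	/* Phase two: sleep, growing toward a 10ms cap. */
	*sleep_usecs =
	    *sleep_usecs + 1000 > 10000 ? 10000 : *sleep_usecs + 1000;
	(void)usleep(*sleep_usecs);
}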
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index f41a5d86e9f..c2a211bdd2d 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -197,9 +197,9 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
* Turn off read checksum and verification error messages while we're
* reading the file, we expect to see corrupted blocks.
*/
- F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
ret = __slvg_read(session, ss);
- F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
WT_ERR(ret);
/*
@@ -349,9 +349,6 @@ err: WT_TRET(bm->salvage_end(bm, session));
__wt_scr_free(session, &ss->tmp1);
__wt_scr_free(session, &ss->tmp2);
- /* Wrap up reporting. */
- WT_TRET(__wt_progress(session, NULL, ss->fcnt));
-
return (ret);
}
@@ -381,8 +378,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
if (eof)
break;
- /* Report progress every 10 chunks. */
- if (++ss->fcnt % 10 == 0)
+ /* Report progress occasionally. */
+#define WT_SALVAGE_PROGRESS_INTERVAL 100
+ if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0)
WT_ERR(__wt_progress(session, NULL, ss->fcnt));
/*
@@ -1305,7 +1303,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
/* Reset the page. */
page->pg_var_d = save_col_var;
@@ -2011,7 +2009,7 @@ __slvg_row_build_leaf(
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
/* Reset the page. */
page->pg_row_entries += skip_stop;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index dbd4042129d..4b9ab45c678 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -45,10 +45,13 @@ static int
__split_stash_add(
WT_SESSION_IMPL *session, uint64_t split_gen, void *p, size_t len)
{
+ WT_CONNECTION_IMPL *conn;
WT_SPLIT_STASH *stash;
WT_ASSERT(session, p != NULL);
+ conn = S2C(session);
+
/* Grow the list as necessary. */
WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
session->split_stash_cnt + 1, &session->split_stash));
@@ -58,8 +61,8 @@ __split_stash_add(
stash->p = p;
stash->len = len;
- WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
- WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);
+ (void)__wt_atomic_add64(&conn->split_stashed_bytes, len);
+ (void)__wt_atomic_add64(&conn->split_stashed_objects, 1);
/* See if we can free any previous entries. */
if (session->split_stash_cnt > 1)
@@ -75,10 +78,13 @@ __split_stash_add(
void
__wt_split_stash_discard(WT_SESSION_IMPL *session)
{
+ WT_CONNECTION_IMPL *conn;
WT_SPLIT_STASH *stash;
uint64_t oldest;
size_t i;
+ conn = S2C(session);
+
/* Get the oldest split generation. */
oldest = __split_oldest_gen(session);
@@ -93,10 +99,8 @@ __wt_split_stash_discard(WT_SESSION_IMPL *session)
* It's a bad thing if another thread is in this memory after
* we free it, make sure nothing good happens to that thread.
*/
- WT_STAT_FAST_CONN_ATOMIC_DECRV(
- session, rec_split_stashed_bytes, stash->len);
- WT_STAT_FAST_CONN_ATOMIC_DECR(
- session, rec_split_stashed_objects);
+ (void)__wt_atomic_sub64(&conn->split_stashed_bytes, stash->len);
+ (void)__wt_atomic_sub64(&conn->split_stashed_objects, 1);
__wt_overwrite_and_free_len(session, stash->p, stash->len);
}
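The change above moves the stash bookkeeping from statistics macros to
plain atomic counters on the connection. The same pattern in C11 atomics;
the counter names are illustrative:

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

static _Atomic uint64_t stashed_bytes, stashed_objects;

static void
stash_account(size_t len)
{
	/* Adjust the counters without any lock. */
	(void)atomic_fetch_add(&stashed_bytes, (uint64_t)len);
	(void)atomic_fetch_add(&stashed_objects, 1);
}

static void
discard_account(size_t len)
{
	(void)atomic_fetch_sub(&stashed_bytes, (uint64_t)len);
	(void)atomic_fetch_sub(&stashed_objects, 1);
}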
@@ -169,7 +173,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
* __split_should_deepen --
* Return if we should deepen the tree.
*/
-static int
+static bool
__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_BTREE *btree;
@@ -192,7 +196,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* pressure on the cache).
*/
if (page->memory_footprint < btree->maxmempage)
- return (0);
+ return (false);
/*
* Ensure the page has enough entries to make it worth splitting and
@@ -200,7 +204,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* splitting won't help).
*/
if (pindex->entries > btree->split_deepen_min_child)
- return (1);
+ return (true);
/*
* Don't allow a single page to put pressure on cache usage. The root
@@ -212,9 +216,9 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
if (pindex->entries >= 100 &&
(__wt_ref_is_root(ref) ||
page->memory_footprint >= S2C(session)->cache_size / 4))
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
@@ -339,7 +343,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
switch (page->type) {
case WT_PAGE_COL_INT:
- recno = 0;
+ recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
@@ -557,7 +561,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
panic = 1;
#ifdef HAVE_DIAGNOSTIC
@@ -680,13 +684,11 @@ __split_multi_inmem(
WT_DECL_RET;
WT_PAGE *page;
WT_UPDATE *upd;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
uint64_t recno;
uint32_t i, slot;
- WT_CLEAR(cbt);
- cbt.iface.session = &session->iface;
- cbt.btree = S2BT(session);
+ __wt_btcur_init(session, &cbt);
__wt_btcur_open(&cbt);
/*
@@ -700,22 +702,22 @@ __split_multi_inmem(
* allocated page on error, when discarding the allocated WT_REF.
*/
WT_RET(__wt_page_inmem(session, ref,
- multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size,
+ multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size,
WT_PAGE_DISK_ALLOC, &page));
- multi->skip_dsk = NULL;
+ multi->supd_dsk = NULL;
if (orig->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &key));
/* Re-create each modification we couldn't write. */
- for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip)
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
switch (orig->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
/* Build a key. */
- upd = skip->ins->upd;
- skip->ins->upd = NULL;
- recno = WT_INSERT_RECNO(skip->ins);
+ upd = supd->ins->upd;
+ supd->ins->upd = NULL;
+ recno = WT_INSERT_RECNO(supd->ins);
/* Search the page. */
WT_ERR(__wt_col_search(session, recno, ref, &cbt));
@@ -726,19 +728,19 @@ __split_multi_inmem(
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
- if (skip->ins == NULL) {
- slot = WT_ROW_SLOT(orig, skip->rip);
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->rip);
upd = orig->pg_row_upd[slot];
orig->pg_row_upd[slot] = NULL;
WT_ERR(__wt_row_leaf_key(
- session, orig, skip->rip, key, 0));
+ session, orig, supd->rip, key, 0));
} else {
- upd = skip->ins->upd;
- skip->ins->upd = NULL;
+ upd = supd->ins->upd;
+ supd->ins->upd = NULL;
- key->data = WT_INSERT_KEY(skip->ins);
- key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ key->data = WT_INSERT_KEY(supd->ins);
+ key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
/* Search the page. */
@@ -761,7 +763,7 @@ __split_multi_inmem(
page->modify->first_dirty_txn = WT_TXN_FIRST;
err: /* Free any resources that may have been cached in the cursor. */
- WT_TRET(__wt_btcur_close(&cbt));
+ WT_TRET(__wt_btcur_close(&cbt, 1));
__wt_scr_free(session, &key);
return (ret);
@@ -797,7 +799,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
*/
ref->home = NULL;
- if (multi->skip == NULL) {
+ if (multi->supd == NULL) {
/*
* Copy the address: we could simply take the buffer, but that
* would complicate error handling, freeing the reference array
@@ -826,7 +828,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
break;
}
- ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM;
+ ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM;
/*
* If our caller wants to track the memory allocations, we have a return
@@ -837,16 +839,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
return (0);
}
-#define WT_SPLIT_EXCLUSIVE 0x01 /* Page held exclusively */
-#define WT_SPLIT_INMEM 0x02 /* In-memory split */
-
/*
* __split_parent --
* Resolve a multi-page split, inserting new information into the parent.
*/
static int
__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags)
+ WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
{
WT_DECL_RET;
WT_IKEY *ikey;
@@ -874,26 +873,39 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* memory inside of the lock and may want to invest effort in making the
* locked period shorter.
*
- * We could race with another thread deepening our parent. To deal
- * with that, read the parent pointer each time we try to lock it, and
- * check that it's still correct after it is locked.
+	 * We use the reconciliation lock here because we have to both
+	 * single-thread the split and lock out reconciliation of the parent:
+	 * reconciliation can't deal with finding a split child during
+	 * internal page traversal. Basically, there's no reason to use a
+	 * different lock if we have to block reconciliation anyway.
*/
for (;;) {
parent = ref->home;
- F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret);
+ F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret);
if (ret == 0) {
+ /*
+ * We can race with another thread deepening our parent.
+ * To deal with that, read the parent pointer each time
+ * we try to lock it, and check it's still correct after
+ * it's locked.
+ */
if (parent == ref->home)
break;
- F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
+ F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
continue;
}
+
/*
- * If we're attempting an in-memory split and we can't lock the
- * parent, give up. This avoids an infinite loop where we are
- * trying to split a page while its parent is being
- * checkpointed.
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we can't lock
+ * the parent, give up to avoid that deadlock.
*/
- if (LF_ISSET(WT_SPLIT_INMEM))
+ if (S2BT(session)->checkpointing)
return (EBUSY);
__wt_yield();
}
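The loop above is the lock-then-revalidate idiom: the parent pointer can
change until the lock is held, so it is re-read before every attempt and
re-checked after every successful acquisition. A self-contained sketch
with a spinlock standing in for F_CAS_ATOMIC (memory-ordering details of
the pointer read are elided):

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *home;	/* Parent pointer: may change under us. */
	atomic_flag lock;
};

static int
try_lock(struct node *n)
{
	/* test_and_set returns the previous value: false means we won. */
	return (!atomic_flag_test_and_set(&n->lock));
}

static void
unlock(struct node *n)
{
	atomic_flag_clear(&n->lock);
}

static struct node *
lock_parent(struct node *child)
{
	struct node *parent;

	for (;;) {
		parent = child->home;		/* Read before locking. */
		if (!try_lock(parent))
			continue;		/* Or yield/back off. */
		if (parent == child->home)	/* Still our parent? */
			return (parent);
		unlock(parent);			/* Raced: retry. */
	}
}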
@@ -905,9 +917,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* could conceivably be evicted. Get a hazard pointer on the parent
* now, so that we can safely access it after updating the index.
*
- * Take care that getting the page doesn't trigger eviction, or we
- * could block trying to split a different child of our parent and
- * deadlock.
+	 * Take care that getting the page doesn't trigger eviction work: we
+	 * could block trying to split a different child of our parent and
+	 * deadlock, or we could be the eviction server other threads rely on
+	 * to populate the eviction queue.
*/
if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
@@ -933,8 +946,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if (next_ref->state == WT_REF_DELETED &&
__wt_delete_page_skip(session, next_ref) &&
- WT_ATOMIC_CAS4(next_ref->state,
- WT_REF_DELETED, WT_REF_SPLIT))
+ __wt_atomic_casv32(
+ &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
deleted_entries++;
}
@@ -994,7 +1007,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
@@ -1089,8 +1102,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Add it to the session discard list, to be freed when it's safe.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session,
- split_gen, LF_ISSET(WT_SPLIT_EXCLUSIVE) ? 1 : 0, pindex, size));
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
parent_decr += size;
/*
@@ -1115,7 +1127,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Do the check here because we've just grown the parent page and
* are holding it locked.
*/
- if (ret == 0 && !LF_ISSET(WT_SPLIT_EXCLUSIVE) &&
+ if (ret == 0 && !exclusive &&
__split_should_deepen(session, parent_ref))
ret = __split_deepen(session, parent);
@@ -1125,7 +1137,7 @@ err: if (!complete)
if (next_ref->state == WT_REF_SPLIT)
next_ref->state = WT_REF_DELETED;
}
- F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
+ F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
if (hazard)
WT_TRET(__wt_hazard_clear(session, parent));
@@ -1164,7 +1176,13 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
right = NULL;
page_decr = parent_incr = right_incr = 0;
+ /*
+	 * Assert splitting makes sense; specifically, assert the page is
+	 * dirty. We depend on that: otherwise, the page might be evicted
+	 * based on its last reconciliation, which no longer matches reality
+	 * after the split.
+ */
WT_ASSERT(session, __wt_page_can_split(session, page));
+ WT_ASSERT(session, __wt_page_is_modified(page));
/* Find the last item on the page. */
ins_head = page->pg_row_entries == 0 ?
@@ -1192,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* The key-instantiation code checks for races, clear the key fields so
* we don't trigger them.
*/
- child->key.recno = 0;
+ child->key.recno = WT_RECNO_OOB;
child->key.ikey = NULL;
child->state = WT_REF_MEM;
@@ -1367,7 +1385,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, WT_SPLIT_INMEM)) != 0) {
+ session, ref, split_ref, 2, parent_incr, 0)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1384,8 +1402,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* We marked the new page dirty; we're going to discard it, but
* first mark it clean and fix up the cache statistics.
*/
- right->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, right);
+ __wt_page_modify_clear(session, right);
WT_ERR(ret);
}
@@ -1442,8 +1459,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
* Pages with unresolved changes are not marked clean during
* reconciliation, do it now.
*/
- mod->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
+ __wt_page_modify_clear(session, page);
__wt_ref_out(session, ref);
/* Swap the new page into place. */
@@ -1486,8 +1502,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* Split into the parent; if we're closing the file, we hold it
* exclusively.
*/
- WT_ERR(__split_parent( session, ref, ref_new,
- new_entries, parent_incr, closing ? WT_SPLIT_EXCLUSIVE : 0));
+ WT_ERR(__split_parent(
+ session, ref, ref_new, new_entries, parent_incr, closing));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
@@ -1500,10 +1516,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* Pages with unresolved changes are not marked clean during
* reconciliation, do it now.
*/
- if (__wt_page_is_modified(page)) {
- mod->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ __wt_page_modify_clear(session, page);
__wt_page_out(session, &page);
return (0);
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 6285edde217..b379712f6e7 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -8,10 +8,11 @@
#include "wt_internal.h"
-static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
-static void __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
-static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
-static void __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
+static void __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
+static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
+static void
+ __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
/*
* __wt_btree_stat_init --
@@ -23,22 +24,22 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_RET;
- WT_DSRC_STATS *stats;
+ WT_DSRC_STATS **stats;
WT_REF *next_walk;
btree = S2BT(session);
bm = btree->bm;
- stats = &btree->dhandle->stats;
+ stats = btree->dhandle->stats;
- WT_RET(bm->stat(bm, session, stats));
+ WT_RET(bm->stat(bm, session, stats[0]));
- WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
- WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
- WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
- WT_STAT_SET(stats, btree_maxintlkey, btree->maxintlkey);
- WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
- WT_STAT_SET(stats, btree_maxleafkey, btree->maxleafkey);
- WT_STAT_SET(stats, btree_maxleafvalue, btree->maxleafvalue);
+ WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
+ WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
+ WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
+ WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
+ WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
+ WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);
/* Everything else is really, really expensive. */
if (!F_ISSET(cst, WT_CONN_STAT_ALL))
@@ -47,14 +48,15 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
/*
* Clear the statistics we're about to count.
*/
- WT_STAT_SET(stats, btree_column_deleted, 0);
- WT_STAT_SET(stats, btree_column_fix, 0);
- WT_STAT_SET(stats, btree_column_internal, 0);
- WT_STAT_SET(stats, btree_column_variable, 0);
- WT_STAT_SET(stats, btree_entries, 0);
- WT_STAT_SET(stats, btree_overflow, 0);
- WT_STAT_SET(stats, btree_row_internal, 0);
- WT_STAT_SET(stats, btree_row_leaf, 0);
+ WT_STAT_SET(session, stats, btree_column_deleted, 0);
+ WT_STAT_SET(session, stats, btree_column_fix, 0);
+ WT_STAT_SET(session, stats, btree_column_internal, 0);
+ WT_STAT_SET(session, stats, btree_column_rle, 0);
+ WT_STAT_SET(session, stats, btree_column_variable, 0);
+ WT_STAT_SET(session, stats, btree_entries, 0);
+ WT_STAT_SET(session, stats, btree_overflow, 0);
+ WT_STAT_SET(session, stats, btree_row_internal, 0);
+ WT_STAT_SET(session, stats, btree_row_leaf, 0);
next_walk = NULL;
while ((ret = __wt_tree_walk(session, &next_walk, NULL, 0)) == 0 &&
@@ -71,7 +73,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
* Stat any Btree page.
*/
static int
-__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
/*
* All internal pages and overflow pages are trivial, all we track is
@@ -79,14 +81,15 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
*/
switch (page->type) {
case WT_PAGE_COL_FIX:
- WT_STAT_INCR(stats, btree_column_fix);
- WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries);
+ WT_STAT_INCR(session, stats, btree_column_fix);
+ WT_STAT_INCRV(
+ session, stats, btree_entries, page->pg_fix_entries);
break;
case WT_PAGE_COL_INT:
- WT_STAT_INCR(stats, btree_column_internal);
+ WT_STAT_INCR(session, stats, btree_column_internal);
break;
case WT_PAGE_COL_VAR:
- __stat_page_col_var(page, stats);
+ __stat_page_col_var(session, page, stats);
break;
case WT_PAGE_ROW_INT:
__stat_page_row_int(session, page, stats);
@@ -104,21 +107,22 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
* Stat a WT_PAGE_COL_VAR page.
*/
static void
-__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
+__stat_page_col_var(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
WT_COL *cip;
WT_INSERT *ins;
WT_UPDATE *upd;
- uint64_t deleted_cnt, entry_cnt, ovfl_cnt;
+ uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
uint32_t i;
int orig_deleted;
unpack = &_unpack;
- deleted_cnt = entry_cnt = ovfl_cnt = 0;
+ deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0;
- WT_STAT_INCR(stats, btree_column_variable);
+ WT_STAT_INCR(session, stats, btree_column_variable);
/*
* Walk the page counting regular items, adjusting if the item has been
@@ -137,8 +141,10 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
__wt_cell_unpack(cell, unpack);
if (unpack->type == WT_CELL_ADDR_DEL)
orig_deleted = 1;
- else
+ else {
entry_cnt += __wt_cell_rle(unpack);
+ rle_cnt += __wt_cell_rle(unpack) - 1;
+ }
if (unpack->ovfl)
++ovfl_cnt;
}
@@ -169,9 +175,10 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
else
++entry_cnt;
- WT_STAT_INCRV(stats, btree_column_deleted, deleted_cnt);
- WT_STAT_INCRV(stats, btree_entries, entry_cnt);
- WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+ WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
+ WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
+ WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
+ WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
/*
@@ -180,7 +187,7 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
*/
static void
__stat_page_row_int(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -190,7 +197,7 @@ __stat_page_row_int(
btree = S2BT(session);
ovfl_cnt = 0;
- WT_STAT_INCR(stats, btree_row_internal);
+ WT_STAT_INCR(session, stats, btree_row_internal);
/*
* Overflow keys are hard: we have to walk the disk image to count them,
@@ -204,7 +211,7 @@ __stat_page_row_int(
++ovfl_cnt;
}
- WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+ WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
/*
@@ -213,7 +220,7 @@ __stat_page_row_int(
*/
static void
__stat_page_row_leaf(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -226,7 +233,7 @@ __stat_page_row_leaf(
btree = S2BT(session);
entry_cnt = ovfl_cnt = 0;
- WT_STAT_INCR(stats, btree_row_leaf);
+ WT_STAT_INCR(session, stats, btree_row_leaf);
/*
* Walk any K/V pairs inserted into the page before the first from-disk
@@ -267,6 +274,6 @@ __stat_page_row_leaf(
++ovfl_cnt;
}
- WT_STAT_INCRV(stats, btree_entries, entry_cnt);
- WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+ WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
+ WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
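The WT_DSRC_STATS * to WT_DSRC_STATS ** change appears to reflect
statistics being striped across an array of structures, which is why the
macros now also take the session: writers update one slot, and a read
sums the slots, trading memory for less cache-line contention. A sketch
of the general technique; the slot count, field, and helpers here are
illustrative, not WiredTiger's:

#include <stdint.h>

#define STAT_SLOTS	8	/* Illustrative slot count. */

struct stats {
	int64_t btree_entries;
};

/* Writers touch one slot, picked here by a caller-supplied id. */
static void
stat_incrv(struct stats **stats, unsigned id, int64_t v)
{
	stats[id % STAT_SLOTS]->btree_entries += v;
}

/* Readers sum across all slots. */
static int64_t
stat_read(struct stats **stats)
{
	int64_t sum;
	unsigned i;

	for (sum = 0, i = 0; i < STAT_SLOTS; ++i)
		sum += stats[i]->btree_entries;
	return (sum);
}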
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 838d778dadf..29ae5b185cd 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -259,7 +259,6 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
break;
case WT_SYNC_CLOSE:
case WT_SYNC_DISCARD:
- case WT_SYNC_DISCARD_FORCE:
WT_ERR(__wt_evict_file(session, op));
break;
WT_ILLEGAL_VALUE_ERR(session);
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 3f615babb07..1fd660d4cd4 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -245,9 +245,6 @@ err: /* Inform the underlying block manager we're done. */
if (ckptbase != NULL)
__wt_meta_ckptlist_free(session, ckptbase);
- /* Wrap up reporting. */
- WT_TRET(__wt_progress(session, NULL, vs->fcnt));
-
/* Free allocated memory. */
__wt_scr_free(session, &vs->max_key);
__wt_scr_free(session, &vs->max_addr);
@@ -343,9 +340,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
* of the page to be built, and then a subsequent logical verification
* which happens here.
*
- * Report progress every 10 pages.
+ * Report progress occasionally.
*/
- if (++vs->fcnt % 10 == 0)
+#define WT_VERIFY_PROGRESS_INTERVAL 100
+ if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
WT_RET(__wt_progress(session, NULL, vs->fcnt));
#ifdef HAVE_DIAGNOSTIC
diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c
index 904a16a7548..e80bde3c91e 100644
--- a/src/btree/bt_vrfy_dsk.c
+++ b/src/btree/bt_vrfy_dsk.c
@@ -26,13 +26,13 @@ static int __verify_dsk_row(
WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
#define WT_ERR_VRFY(session, ...) do { \
- if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \
__wt_errx(session, __VA_ARGS__); \
goto err; \
} while (0)
#define WT_RET_VRFY(session, ...) do { \
- if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \
__wt_errx(session, __VA_ARGS__); \
return (WT_ERROR); \
} while (0)
@@ -43,7 +43,7 @@ static int __verify_dsk_row(
*/
int
__wt_verify_dsk_image(WT_SESSION_IMPL *session,
- const char *addr, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
+ const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
{
const uint8_t *p, *end;
u_int i;
@@ -63,7 +63,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
default:
WT_RET_VRFY(session,
"page at %s has an invalid type of %" PRIu32,
- addr, dsk->type);
+ tag, dsk->type);
}
/* Check the page record number. */
@@ -71,51 +71,54 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
- if (dsk->recno != 0)
+ if (dsk->recno != WT_RECNO_OOB)
break;
WT_RET_VRFY(session,
- "%s page at %s has a record number of zero",
- __wt_page_type_string(dsk->type), addr);
+ "%s page at %s has an invalid record number of %d",
+ __wt_page_type_string(dsk->type), tag, WT_RECNO_OOB);
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- if (dsk->recno == 0)
+ if (dsk->recno == WT_RECNO_OOB)
break;
WT_RET_VRFY(session,
- "%s page at %s has a non-zero record number",
- __wt_page_type_string(dsk->type), addr);
+ "%s page at %s has a record number, which is illegal for "
+ "this page type",
+ __wt_page_type_string(dsk->type), tag);
}
/* Check the page flags. */
flags = dsk->flags;
if (LF_ISSET(WT_PAGE_COMPRESSED))
LF_CLR(WT_PAGE_COMPRESSED);
- if (LF_ISSET(WT_PAGE_ENCRYPTED))
- LF_CLR(WT_PAGE_ENCRYPTED);
if (dsk->type == WT_PAGE_ROW_LEAF) {
if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
LF_ISSET(WT_PAGE_EMPTY_V_NONE))
WT_RET_VRFY(session,
"page at %s has invalid flags combination: 0x%"
PRIx8,
- addr, dsk->flags);
+ tag, dsk->flags);
if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
LF_CLR(WT_PAGE_EMPTY_V_ALL);
if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
LF_CLR(WT_PAGE_EMPTY_V_NONE);
}
+ if (LF_ISSET(WT_PAGE_ENCRYPTED))
+ LF_CLR(WT_PAGE_ENCRYPTED);
+ if (LF_ISSET(WT_PAGE_LAS_UPDATE))
+ LF_CLR(WT_PAGE_LAS_UPDATE);
if (flags != 0)
WT_RET_VRFY(session,
"page at %s has invalid flags set: 0x%" PRIx8,
- addr, flags);
+ tag, flags);
/* Unused bytes */
for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
if (*p != '\0')
WT_RET_VRFY(session,
"page at %s has non-zero unused page header bytes",
- addr);
+ tag);
/*
* Any bytes after the data chunk should be nul bytes; ignore if the
@@ -129,7 +132,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
if (*p != '\0')
WT_RET_VRFY(session,
"%s page at %s has non-zero trailing bytes",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
}
/* Check for empty pages, then verify the items on the page. */
@@ -141,28 +144,28 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_LEAF:
if (!empty_page_ok && dsk->u.entries == 0)
WT_RET_VRFY(session, "%s page at %s has no entries",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
break;
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
if (dsk->u.datalen == 0)
WT_RET_VRFY(session, "%s page at %s has no data",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
break;
}
switch (dsk->type) {
case WT_PAGE_COL_INT:
- return (__verify_dsk_col_int(session, addr, dsk));
+ return (__verify_dsk_col_int(session, tag, dsk));
case WT_PAGE_COL_FIX:
- return (__verify_dsk_col_fix(session, addr, dsk));
+ return (__verify_dsk_col_fix(session, tag, dsk));
case WT_PAGE_COL_VAR:
- return (__verify_dsk_col_var(session, addr, dsk));
+ return (__verify_dsk_col_var(session, tag, dsk));
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- return (__verify_dsk_row(session, addr, dsk));
+ return (__verify_dsk_row(session, tag, dsk));
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
- return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
+ return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen));
WT_ILLEGAL_VALUE(session);
}
/* NOTREACHED */
@@ -173,9 +176,9 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
* Verify a single Btree page as read from disk.
*/
int
-__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
+__wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf)
{
- return (__wt_verify_dsk_image(session, addr, buf->data, buf->size, 0));
+ return (__wt_verify_dsk_image(session, tag, buf->data, buf->size, 0));
}
/*
@@ -184,7 +187,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
*/
static int
__verify_dsk_row(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -220,16 +223,16 @@ __verify_dsk_row(
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
- ret = __err_cell_corrupted(session, cell_num, addr);
+ if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0) {
+ ret = __err_cell_corrupted(session, cell_num, tag);
goto err;
}
/* Check the raw and collapsed cell types. */
WT_ERR(__err_cell_type(
- session, cell_num, addr, unpack->raw, dsk->type));
+ session, cell_num, tag, unpack->raw, dsk->type));
WT_ERR(__err_cell_type(
- session, cell_num, addr, unpack->type, dsk->type));
+ session, cell_num, tag, unpack->type, dsk->type));
cell_type = unpack->type;
/*
@@ -256,7 +259,7 @@ __verify_dsk_row(
WT_ERR_VRFY(session,
"cell %" PRIu32 " on page at %s is the "
"first of two adjacent keys",
- cell_num - 1, addr);
+ cell_num - 1, tag);
}
last_cell_type = WAS_KEY;
break;
@@ -269,14 +272,14 @@ __verify_dsk_row(
switch (last_cell_type) {
case FIRST:
WT_ERR_VRFY(session,
- "page at %s begins with a value", addr);
+ "page at %s begins with a value", tag);
case WAS_KEY:
break;
case WAS_VALUE:
WT_ERR_VRFY(session,
"cell %" PRIu32 " on page at %s is the "
"first of two adjacent values",
- cell_num - 1, addr);
+ cell_num - 1, tag);
}
last_cell_type = WAS_VALUE;
break;
@@ -327,7 +330,7 @@ __verify_dsk_row(
"the %" PRIu32 " key on page at %s is the first "
"non-overflow key on the page and has a non-zero "
"prefix compression value",
- cell_num, addr);
+ cell_num, tag);
/* Confirm the prefix compression count is possible. */
if (cell_num > 1 && prefix > last->size)
@@ -335,7 +338,7 @@ __verify_dsk_row(
"key %" PRIu32 " on page at %s has a prefix "
"compression count of %" PRIu32 ", larger than "
"the length of the previous key, %" WT_SIZET_FMT,
- cell_num, addr, prefix, last->size);
+ cell_num, tag, prefix, last->size);
/*
* If Huffman decoding required, unpack the cell to build the
@@ -394,7 +397,7 @@ key_compare: /*
WT_ERR_VRFY(session,
"the %" PRIu32 " and %" PRIu32 " keys on "
"page at %s are incorrectly sorted",
- cell_num - 2, cell_num, addr);
+ cell_num - 2, cell_num, tag);
}
/*
@@ -414,7 +417,7 @@ key_compare: /*
}
WT_ASSERT(session, last != current);
}
- WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell));
+ WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell));
/*
* On row-store internal pages, and on row-store leaf pages, where the
@@ -428,7 +431,7 @@ key_compare: /*
"%s page at %s has a key count of %" PRIu32 " and a "
"physical entry count of %" PRIu32,
__wt_page_type_string(dsk->type),
- addr, key_cnt, dsk->u.entries);
+ tag, key_cnt, dsk->u.entries);
if (dsk->type == WT_PAGE_ROW_LEAF &&
F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) &&
key_cnt != dsk->u.entries)
@@ -437,7 +440,7 @@ key_compare: /*
"key count of %" PRIu32 " and a physical entry count of %"
PRIu32,
__wt_page_type_string(dsk->type),
- addr, key_cnt, dsk->u.entries);
+ tag, key_cnt, dsk->u.entries);
if (dsk->type == WT_PAGE_ROW_LEAF &&
F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) &&
key_cnt * 2 != dsk->u.entries)
@@ -446,10 +449,10 @@ key_compare: /*
"key count of %" PRIu32 " and a physical entry count of %"
PRIu32,
__wt_page_type_string(dsk->type),
- addr, key_cnt, dsk->u.entries);
+ tag, key_cnt, dsk->u.entries);
if (0) {
-eof: ret = __err_eof(session, cell_num, addr);
+eof: ret = __err_eof(session, cell_num, tag);
}
if (0) {
@@ -468,7 +471,7 @@ err: if (ret == 0)
*/
static int
__verify_dsk_col_int(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -487,20 +490,20 @@ __verify_dsk_col_int(
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
- return (__err_cell_corrupted(session, cell_num, addr));
+ if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, tag));
/* Check the raw and collapsed cell types. */
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->raw, dsk->type));
+ session, cell_num, tag, unpack->raw, dsk->type));
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->type, dsk->type));
+ session, cell_num, tag, unpack->type, dsk->type));
/* Check if any referenced item is entirely in the file. */
if (!bm->addr_valid(bm, session, unpack->data, unpack->size))
- return (__err_eof(session, cell_num, addr));
+ return (__err_eof(session, cell_num, tag));
}
- WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+ WT_RET(__verify_dsk_memsize(session, tag, dsk, cell));
return (0);
}
@@ -511,7 +514,7 @@ __verify_dsk_col_int(
*/
static int
__verify_dsk_col_fix(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BTREE *btree;
uint32_t datalen;
@@ -519,7 +522,7 @@ __verify_dsk_col_fix(
btree = S2BT(session);
datalen = __bitstr_size(btree->bitcnt * dsk->u.entries);
- return (__verify_dsk_chunk(session, addr, dsk, datalen));
+ return (__verify_dsk_chunk(session, tag, dsk, datalen));
}
/*
@@ -528,7 +531,7 @@ __verify_dsk_col_fix(
*/
static int
__verify_dsk_col_var(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -554,20 +557,20 @@ __verify_dsk_col_var(
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
- return (__err_cell_corrupted(session, cell_num, addr));
+ if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, tag));
/* Check the raw and collapsed cell types. */
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->raw, dsk->type));
+ session, cell_num, tag, unpack->raw, dsk->type));
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->type, dsk->type));
+ session, cell_num, tag, unpack->type, dsk->type));
cell_type = unpack->type;
/* Check if any referenced item is entirely in the file. */
if (cell_type == WT_CELL_VALUE_OVFL &&
!bm->addr_valid(bm, session, unpack->data, unpack->size))
- return (__err_eof(session, cell_num, addr));
+ return (__err_eof(session, cell_num, tag));
/*
* Compare the last two items and see if reconciliation missed
@@ -586,7 +589,7 @@ match_err: WT_RET_VRFY(session,
"data entries %" PRIu32 " and %" PRIu32
" on page at %s are identical and should "
"have been run-length encoded",
- cell_num - 1, cell_num, addr);
+ cell_num - 1, cell_num, tag);
switch (cell_type) {
case WT_CELL_DEL:
@@ -604,7 +607,7 @@ match_err: WT_RET_VRFY(session,
break;
}
}
- WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+ WT_RET(__verify_dsk_memsize(session, tag, dsk, cell));
return (0);
}
@@ -615,7 +618,7 @@ match_err: WT_RET_VRFY(session,
*/
static int
__verify_dsk_memsize(WT_SESSION_IMPL *session,
- const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
+ const char *tag, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
{
size_t len;
@@ -630,7 +633,7 @@ __verify_dsk_memsize(WT_SESSION_IMPL *session,
WT_RET_VRFY(session,
"%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data "
"after the last cell",
- __wt_page_type_string(dsk->type), addr, len);
+ __wt_page_type_string(dsk->type), tag, len);
}
/*
@@ -639,7 +642,7 @@ __verify_dsk_memsize(WT_SESSION_IMPL *session,
*/
static int
__verify_dsk_chunk(WT_SESSION_IMPL *session,
- const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen)
+ const char *tag, const WT_PAGE_HEADER *dsk, uint32_t datalen)
{
WT_BTREE *btree;
uint8_t *p, *end;
@@ -655,14 +658,14 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session,
if (p + datalen > end)
WT_RET_VRFY(session,
"data on page at %s extends past the end of the page",
- addr);
+ tag);
/* Any bytes after the data chunk should be nul bytes. */
for (p += datalen; p < end; ++p)
if (*p != '\0')
WT_RET_VRFY(session,
"%s page at %s has non-zero trailing bytes",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
return (0);
}
@@ -673,11 +676,11 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session,
*/
static int
__err_cell_corrupted(
- WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+ WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
{
WT_RET_VRFY(session,
"item %" PRIu32 " on page at %s is a corrupted cell",
- entry_num, addr);
+ entry_num, tag);
}
/*
@@ -686,7 +689,7 @@ __err_cell_corrupted(
*/
static int
__err_cell_type(WT_SESSION_IMPL *session,
- uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type)
+ uint32_t entry_num, const char *tag, uint8_t cell_type, uint8_t dsk_type)
{
switch (cell_type) {
case WT_CELL_ADDR_DEL:
@@ -735,7 +738,7 @@ __err_cell_type(WT_SESSION_IMPL *session,
WT_RET_VRFY(session,
"illegal cell and page type combination: cell %" PRIu32
" on page at %s is a %s cell on a %s page",
- entry_num, addr,
+ entry_num, tag,
__wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type));
}
@@ -744,10 +747,10 @@ __err_cell_type(WT_SESSION_IMPL *session,
* Generic item references non-existent file pages error.
*/
static int
-__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
{
WT_RET_VRFY(session,
"off-page item %" PRIu32
" on page at %s references non-existent file pages",
- entry_num, addr);
+ entry_num, tag);
}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index 2fe09681090..cbc5143698b 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -17,7 +17,7 @@ static int __col_insert_alloc(
*/
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -25,7 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_INSERT_HEAD *ins_head, **ins_headp;
WT_ITEM _value;
WT_PAGE *page;
- WT_UPDATE *old_upd;
+ WT_UPDATE *old_upd, *upd;
size_t ins_size, upd_size;
u_int i, skipdepth;
int append, logged;
@@ -33,6 +33,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
btree = cbt->btree;
ins = NULL;
page = cbt->ref->page;
+ upd = upd_arg;
append = logged = 0;
/* This code expects a remove to have a NULL value. */
@@ -48,10 +49,10 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* There's some chance the application specified a record past
* the last record on the page. If that's the case, and we're
* inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
- * append list, not the update list. In addition, a recno of 0
+ * append list, not the update list. Also, an out-of-band recno
* implies an append operation, we're allocating a new row.
*/
- if (recno == 0 ||
+ if (recno == WT_RECNO_OOB ||
recno > (btree->type == BTREE_COL_VAR ?
__col_var_last_recno(page) : __col_fix_last_recno(page)))
append = 1;
@@ -76,7 +77,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* If we are restoring updates that couldn't be evicted, the
* key must not exist on the new page.
*/
- WT_ASSERT(session, upd == NULL);
+ WT_ASSERT(session, upd_arg == NULL);
/* Make sure the update can proceed. */
WT_ERR(__wt_txn_update_check(
@@ -134,7 +135,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
cbt->ins_head = ins_head;
cbt->ins = ins;
- if (upd == NULL) {
+ if (upd_arg == NULL) {
WT_ERR(
__wt_update_alloc(session, value, &upd, &upd_size));
WT_ERR(__wt_txn_modify(session, upd));
@@ -160,7 +161,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* The serial mutex acts as our memory barrier to flush these
* writes before inserting them into the list.
*/
- if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
+ if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
for (i = 0; i < skipdepth; i++) {
cbt->ins_stack[i] = &ins_head->head[i];
ins->next[i] = cbt->next_stack[i] = NULL;
@@ -192,7 +193,8 @@ err: /*
if (logged)
__wt_txn_unmodify(session);
__wt_free(session, ins);
- __wt_free(session, upd);
+ if (upd_arg == NULL)
+ __wt_free(session, upd);
}
return (ret);
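The error path above frees upd only when upd_arg was NULL, i.e. only when
this function allocated it: resources passed in by the caller remain the
caller's to free on failure. A small standalone sketch of that ownership
rule, with a hypothetical worker rather than WiredTiger code:

#include <errno.h>
#include <stdlib.h>

/* Hypothetical worker; fails so the error path runs. */
static int
do_work(void *buf)
{
	(void)buf;
	return (EBUSY);
}

static int
modify(void *caller_upd)
{
	void *upd;
	int ret;

	/* Use the caller's update if given, else allocate our own. */
	upd = caller_upd != NULL ? caller_upd : malloc(64);
	if (upd == NULL)
		return (ENOMEM);

	if ((ret = do_work(upd)) != 0 && caller_upd == NULL)
		free(upd);	/* Only free what we allocated. */
	return (ret);
}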
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index f2868afe13a..4affa7fa62a 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -448,7 +448,8 @@ next: switch (direction) {
* update the page's memory footprint, on failure, free
* the allocated memory.
*/
- if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey))
+ if (__wt_atomic_cas_ptr(
+ (void *)&WT_ROW_KEY_COPY(rip), copy, ikey))
__wt_cache_page_inmem_incr(session,
page, sizeof(WT_IKEY) + ikey->size);
else
@@ -525,7 +526,7 @@ __wt_row_ikey(WT_SESSION_IMPL *session,
WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0);
WT_ASSERT(session, ref->state != WT_REF_SPLIT);
WT_ASSERT(session,
- WT_ATOMIC_CAS8(ref->key.ikey, (WT_IKEY *)oldv, ikey));
+ __wt_atomic_cas_ptr(&ref->key.ikey, (WT_IKEY *)oldv, ikey));
}
#else
ref->key.ikey = ikey;
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 62177b7e4c7..888c54d1ec9 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -26,7 +26,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
* Select a spinlock for the page; let the barrier immediately below
* keep things from racing too badly.
*/
- modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);
+ modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;
/*
* Multiple threads of control may be searching and deciding to modify
@@ -34,7 +34,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
* footprint, else discard the modify structure, another thread did the
* work.
*/
- if (WT_ATOMIC_CAS8(page->modify, NULL, modify))
+ if (__wt_atomic_cas_ptr(&page->modify, NULL, modify))
__wt_cache_page_inmem_incr(session, page, sizeof(*modify));
else
__wt_free(session, modify);
@@ -112,6 +112,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* there should only be one update list per key.
*/
WT_ASSERT(session, *upd_entry == NULL);
+
/*
* Set the "old" entry to the second update in the list
* so that the serialization function succeeds in
@@ -192,7 +193,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* The serial mutex acts as our memory barrier to flush these
* writes before inserting them into the list.
*/
- if (WT_SKIP_FIRST(ins_head) == NULL)
+ if (cbt->ins_stack[0] == NULL)
for (i = 0; i < skipdepth; i++) {
cbt->ins_stack[i] = &ins_head->head[i];
ins->next[i] = cbt->next_stack[i] = NULL;
@@ -316,7 +317,7 @@ __wt_update_obsolete_check(
*/
if (first != NULL &&
(next = first->next) != NULL &&
- WT_ATOMIC_CAS8(first->next, next, NULL))
+ __wt_atomic_cas_ptr(&first->next, next, NULL))
return (next);
/*
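
Two details in this file deserve a note. WT_PAGE_LOCKS becomes a compile-time constant rather than a per-connection value, and the page-lock assignment remains classic lock striping: a shared counter spreads pages across a fixed array of locks so concurrent modifications rarely contend. A minimal sketch (stripe count and names illustrative):

#include <stdatomic.h>

#define PAGE_LOCKS	17	/* Fixed number of lock stripes. */

static atomic_uint page_lock_cnt;

/*
 * Sketch: hand out stripe indexes round-robin; a caller would lock
 * page_lock[assign_page_lock()] for the life of the page.
 */
static unsigned int
assign_page_lock(void)
{
	return (atomic_fetch_add(&page_lock_cnt, 1) % PAGE_LOCKS);
}
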
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 9803b924355..d83d3253c44 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -471,6 +471,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *current, *descent;
+ uint32_t cnt;
btree = S2BT(session);
@@ -528,18 +529,22 @@ restart:
/*
* If the tree is new (and not empty), it might have a large insert
- * list, pick the key in the middle of that insert list.
+ * list. Count how many records are in the list.
*/
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
WT_ERR(WT_NOTFOUND);
- for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) {
+ for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
if ((p = WT_SKIP_NEXT(p)) == NULL)
break;
- if ((p = WT_SKIP_NEXT(p)) == NULL)
+
+ /*
+	 * Select a random number from 0 to (N - 1) and return that record.
+ */
+ cnt = __wt_random(&session->rnd) % cnt;
+ for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
+ if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
break;
- t = WT_SKIP_NEXT(t);
- }
cbt->ref = current;
cbt->compare = 0;
cbt->ins = t;
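
The rewritten insert-list case replaces the old two-pointer "find the middle" walk with a uniform choice: one pass counts the N records on the skiplist's lowest level, a random index in [0, N) is drawn, and a second pass stops at that element. A sketch of the same two-pass selection over a plain linked list (names illustrative):

#include <stdint.h>
#include <stdlib.h>

struct node {
	struct node *next;
};

/*
 * Sketch: choose a uniformly random element of a non-empty singly
 * linked list by counting, drawing an index, then walking to it.
 */
static struct node *
random_element(struct node *head, unsigned int *seedp)
{
	struct node *p;
	uint32_t cnt, r;

	for (cnt = 1, p = head; p->next != NULL; p = p->next)
		++cnt;				/* First pass: count N. */

	r = (uint32_t)rand_r(seedp) % cnt;	/* Index in [0, N). */
	for (p = head; r > 0; --r)
		p = p->next;			/* Second pass: walk r steps. */
	return (p);
}
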
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
new file mode 100644
index 00000000000..e269e8702e1
--- /dev/null
+++ b/src/cache/cache_las.c
@@ -0,0 +1,391 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_las_stats_update --
+ * Update the lookaside table statistics for return to the application.
+ */
+void
+__wt_las_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS **cstats;
+ WT_DSRC_STATS **dstats;
+
+ conn = S2C(session);
+
+ /*
+ * Lookaside table statistics are copied from the underlying lookaside
+ * table data-source statistics. If there's no lookaside table, values
+ * remain 0. In the current system, there's always a lookaside table,
+ * but there's no reason not to be cautious.
+ */
+ if (conn->las_cursor == NULL)
+ return;
+
+ /*
+ * We have a cursor, and we need the underlying data handle; we can get
+ * to it by way of the underlying btree handle, but it's a little ugly.
+ */
+ cstats = conn->stats;
+ dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats;
+
+ WT_STAT_SET(session, cstats,
+ cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
+ WT_STAT_SET(session, cstats,
+ cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove));
+}
+
+/*
+ * __las_cursor_create --
+ * Open a new lookaside table cursor.
+ */
+static int
+__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
+{
+ WT_BTREE *btree;
+ const char *open_cursor_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
+
+ WT_RET(__wt_open_cursor(
+ session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp));
+
+ /*
+ * Set special flags for the lookaside table: the lookaside flag (used,
+	 * for example, to avoid writing records during reconciliation); also
+ * turn off checkpoints and logging.
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ F_SET(btree, WT_BTREE_LOOKASIDE);
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(btree, WT_BTREE_NO_CHECKPOINT);
+ if (!F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_SET(btree, WT_BTREE_NO_LOGGING);
+
+ return (0);
+}
+
+/*
+ * __wt_las_create --
+ * Initialize the database's lookaside store.
+ */
+int
+__wt_las_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ const char *drop_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };
+
+ conn = S2C(session);
+
+ /*
+ * Done at startup: we cannot do it on demand because we require the
+ * schema lock to create and drop the file, and it may not always be
+ * available.
+ *
+ * Open an internal session, used for the shared lookaside cursor.
+ *
+ * Sessions associated with a lookaside cursor should never be tapped
+ * for eviction.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "lookaside table", 1, 1, &conn->las_session));
+ session = conn->las_session;
+ F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
+
+ /* Discard any previous incarnation of the file. */
+ WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg));
+
+ /* Re-create the file. */
+ WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
+
+ /* Open the shared cursor. */
+ WT_WITHOUT_DHANDLE(session,
+ ret = __las_cursor_create(session, &conn->las_cursor));
+
+ return (ret);
+}
+
+/*
+ * __wt_las_destroy --
+ * Destroy the database's lookaside store.
+ */
+int
+__wt_las_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ if (conn->las_session == NULL)
+ return (0);
+
+ wt_session = &conn->las_session->iface;
+ ret = wt_session->close(wt_session, NULL);
+
+ conn->las_cursor = NULL;
+ conn->las_session = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_las_set_written --
+ * Flag that the lookaside table has been written.
+ */
+void
+__wt_las_set_written(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ if (!conn->las_written) {
+ conn->las_written = true;
+
+ /*
+		 * Push the flag: unnecessary, but from now on page reads must
+		 * deal with lookaside table records, and we only do the write
+		 * once.
+ * with lookaside table records, and we only do the write once.
+ */
+ WT_FULL_BARRIER();
+ }
+}
+
+/*
+ * __wt_las_is_written --
+ * Return if the lookaside table has been written.
+ */
+bool
+__wt_las_is_written(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->las_written);
+}
+
+/*
+ * __wt_las_cursor --
+ * Return a lookaside cursor.
+ */
+int
+__wt_las_cursor(
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ *cursorp = NULL;
+
+ /*
+ * We don't want to get tapped for eviction after we start using the
+ * lookaside cursor; save a copy of the current eviction state, we'll
+ * turn eviction off before we return.
+ *
+ * Don't cache lookaside table pages, we're here because of eviction
+ * problems and there's no reason to believe lookaside pages will be
+ * useful more than once.
+ */
+ *session_flags =
+ F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+ conn = S2C(session);
+
+ /* Eviction and sweep threads have their own lookaside table cursors. */
+ if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
+ if (session->las_cursor == NULL) {
+ WT_WITHOUT_DHANDLE(session, ret =
+ __las_cursor_create(session, &session->las_cursor));
+ WT_RET(ret);
+ }
+
+ *cursorp = session->las_cursor;
+ } else {
+ /* Lock the shared lookaside cursor. */
+ __wt_spin_lock(session, &conn->las_lock);
+
+ *cursorp = conn->las_cursor;
+ }
+
+ /* Turn caching and eviction off. */
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+ return (0);
+}
+
+/*
+ * __wt_las_cursor_close --
+ * Discard a lookaside cursor.
+ */
+int
+__wt_las_cursor_close(
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ if ((cursor = *cursorp) == NULL)
+ return (0);
+ *cursorp = NULL;
+
+ /* Reset the cursor. */
+ ret = cursor->reset(cursor);
+
+ /*
+ * We turned off caching and eviction while the lookaside cursor was in
+ * use, restore the session's flags.
+ */
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_SET(session, session_flags);
+
+ /*
+ * Eviction and sweep threads have their own lookaside table cursors;
+ * else, unlock the shared lookaside cursor.
+ */
+ if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
+ __wt_spin_unlock(session, &conn->las_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_las_sweep --
+ * Sweep the lookaside table.
+ */
+int
+__wt_las_sweep(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_RET;
+ WT_ITEM *key;
+ uint64_t cnt, las_counter, las_txnid;
+ uint32_t las_id, session_flags;
+ int notused;
+
+ conn = S2C(session);
+ cursor = NULL;
+ key = &conn->las_sweep_key;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /*
+ * If we're not starting a new sweep, position the cursor using the key
+	 * from the last call (we don't care whether we land before or after
+	 * the key; roughly the same spot is fine).
+ */
+ if (conn->las_sweep_call != 0 && key->data != NULL) {
+ __wt_cursor_set_raw_key(cursor, key);
+ if ((ret = cursor->search_near(cursor, &notused)) != 0)
+ goto srch_notfound;
+ }
+
+ /*
+ * The sweep server wakes up every 10 seconds (by default), it's a slow
+ * moving thread. Try to review the entire lookaside table once every 5
+ * minutes, or every 30 calls.
+ *
+	 * The reasoning: the lookaside table exists because we're seeing
+ * cache/eviction pressure (it allows us to trade performance and disk
+ * space for cache space), and it's likely lookaside blocks are being
+ * evicted, and reading them back in doesn't help things. A trickier,
+ * but possibly better, alternative might be to review all lookaside
+ * blocks in the cache in order to get rid of them, and slowly review
+ * lookaside blocks that have already been evicted.
+ *
+ * We can't know for sure how many records are in the lookaside table,
+ * the cursor insert and remove statistics aren't updated atomically.
+ * Start with reviewing 100 rows, and if it takes more than the target
+ * number of calls to finish, increase the number of rows checked on
+ * each call; if it takes less than the target calls to finish, then
+ * decrease the number of rows reviewed on each call (but never less
+ * than 100).
+ */
+#define WT_SWEEP_LOOKASIDE_MIN_CNT 100
+#define WT_SWEEP_LOOKASIDE_PASS_TARGET 30
+ ++conn->las_sweep_call;
+ if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
+ cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;
+
+ /* Walk the file. */
+ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
+ /*
+ * If the loop terminates after completing a work unit, we will
+		 * continue the table sweep next time. Get a local copy of the
+		 * sweep key since we're going to reset the cursor; do so before
+		 * calling cursor.remove, because cursor.remove can discard our
+		 * hazard pointer and the page could be evicted from underneath
+		 * us.
+ */
+ if (cnt == 1) {
+ WT_ERR(__wt_cursor_get_raw_key(cursor, key));
+ if (!WT_DATA_IN_ITEM(key))
+ WT_ERR(__wt_buf_set(
+ session, key, key->data, key->size));
+ }
+
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * If the on-page record transaction ID associated with the
+ * record is globally visible, the record can be discarded.
+ *
+ * Cursor opened overwrite=true: won't return WT_NOTFOUND should
+ * another thread remove the record before we do, and the cursor
+ * remains positioned in that case.
+ */
+ if (__wt_txn_visible_all(session, las_txnid))
+ WT_ERR(cursor->remove(cursor));
+ }
+
+ /*
+ * When reaching the lookaside table end or the target number of calls,
+	 * adjust the row count. Decrease/increase the row count depending on
+	 * whether the number of calls is less/more than the target.
+ */
+ if (ret == WT_NOTFOUND ||
+ conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
+ if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
+ conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
+ conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
+ if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
+ conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
+ }
+
+srch_notfound:
+ if (ret == WT_NOTFOUND)
+ conn->las_sweep_call = 0;
+
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (0) {
+err: __wt_buf_free(session, key);
+ }
+
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+
+ return (ret);
+}
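
The sweep pacing above is a simple feedback loop: each call reviews a budget of rows, and the budget grows or shrinks so that a full pass over the lookaside table takes roughly WT_SWEEP_LOOKASIDE_PASS_TARGET calls. A standalone sketch of the adjustment rule (constants as in the diff, function name illustrative):

#include <stdint.h>

#define SWEEP_MIN_CNT		100	/* Rows per call, lower bound. */
#define SWEEP_PASS_TARGET	30	/* Target calls per full pass. */

/*
 * Sketch: run when a pass completes (or overruns the target); adjust
 * how many rows each subsequent call reviews.
 */
static void
sweep_adjust(uint64_t calls, uint64_t *cntp)
{
	if (calls < SWEEP_PASS_TARGET && *cntp > SWEEP_MIN_CNT)
		*cntp -= SWEEP_MIN_CNT;	/* Finished early: do less per call. */
	if (calls > SWEEP_PASS_TARGET)
		*cntp += SWEEP_MIN_CNT;	/* Finished late: do more per call. */
}
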
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 73837c46ee8..91cfcedfcaf 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -76,6 +76,7 @@ static const WT_CONFIG_CHECK
confchk_wiredtiger_open_shared_cache_subconfigs[] = {
{ "chunk", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "name", "string", NULL, NULL, NULL, 0 },
+ { "quota", "int", NULL, NULL, NULL, 0 },
{ "reserve", "int", NULL, NULL, NULL, 0 },
{ "size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
@@ -121,7 +122,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "lsm_merge", "boolean", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -520,7 +521,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -595,7 +596,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -668,7 +669,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -740,7 +741,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -807,8 +808,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95"
",file_manager=(close_handle_minimum=250,close_idle_time=30,"
"close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)"
- ",lsm_merge=,shared_cache=(chunk=10MB,name=,reserve=0,size=500MB)"
- ",statistics=none,statistics_log=(on_close=0,"
+ ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
"path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
confchk_WT_CONNECTION_reconfigure, 17
@@ -959,9 +960,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),use_environment_priv=0,verbose=",
confchk_wiredtiger_open, 34
@@ -979,9 +980,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
"minor=0)",
@@ -999,9 +1000,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),verbose=,version=(major=0,minor=0)",
confchk_wiredtiger_open_basecfg, 31
@@ -1018,9 +1019,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),verbose=",
confchk_wiredtiger_open_usercfg, 30
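
The generated tables above add the new shared_cache.quota key and bump the subconfiguration counts from 4 to 5. Assuming the documented wiredtiger_open configuration syntax, a caller would set a per-connection quota like this (a sketch; the home directory and sizes are placeholders):

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;

	/* Cap this connection at 100MB of a 500MB shared pool. */
	if (wiredtiger_open("WT_HOME", NULL,
	    "create,shared_cache=(name=pool,size=500MB,chunk=10MB,"
	    "quota=100MB)", &conn) != 0)
		return (EXIT_FAILURE);

	return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
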
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 067ad00560e..b1155d06826 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -432,7 +432,7 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
"requires connection encryption to be set");
hash = __wt_hash_city64(keyid->str, keyid->len);
bucket = hash % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(kenc, &nenc->keyedhashlh[bucket], l)
+ TAILQ_FOREACH(kenc, &nenc->keyedhashqh[bucket], q)
if (WT_STRING_MATCH(kenc->keyid, keyid->str, keyid->len))
goto out;
@@ -450,8 +450,8 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
WT_ERR(encryptor->sizing(encryptor, &session->iface,
&kenc->size_const));
kenc->encryptor = encryptor;
- SLIST_INSERT_HEAD(&nenc->keyedlh, kenc, l);
- SLIST_INSERT_HEAD(&nenc->keyedhashlh[bucket], kenc, hashl);
+ TAILQ_INSERT_HEAD(&nenc->keyedqh, kenc, q);
+ TAILQ_INSERT_HEAD(&nenc->keyedhashqh[bucket], kenc, hashq);
out: __wt_spin_unlock(session, &conn->encryptor_lock);
*kencryptorp = kenc;
@@ -506,9 +506,9 @@ __conn_add_encryptor(WT_CONNECTION *wt_conn,
WT_ERR(__wt_calloc_one(session, &nenc));
WT_ERR(__wt_strdup(session, name, &nenc->name));
nenc->encryptor = encryptor;
- SLIST_INIT(&nenc->keyedlh);
+ TAILQ_INIT(&nenc->keyedqh);
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++)
- SLIST_INIT(&nenc->keyedhashlh[i]);
+ TAILQ_INIT(&nenc->keyedhashqh[i]);
TAILQ_INSERT_TAIL(&conn->encryptqh, nenc, q);
nenc = NULL;
@@ -537,15 +537,14 @@ __wt_conn_remove_encryptor(WT_SESSION_IMPL *session)
conn = S2C(session);
while ((nenc = TAILQ_FIRST(&conn->encryptqh)) != NULL) {
- while ((kenc = SLIST_FIRST(&nenc->keyedlh)) != NULL) {
+ while ((kenc = TAILQ_FIRST(&nenc->keyedqh)) != NULL) {
/* Call any termination method. */
if (kenc->owned && kenc->encryptor->terminate != NULL)
WT_TRET(kenc->encryptor->terminate(
kenc->encryptor, (WT_SESSION *)session));
/* Remove from the connection's list, free memory. */
- SLIST_REMOVE(
- &nenc->keyedlh, kenc, __wt_keyed_encryptor, l);
+ TAILQ_REMOVE(&nenc->keyedqh, kenc, q);
__wt_free(session, kenc->keyid);
__wt_free(session, kenc);
}
@@ -1725,7 +1724,8 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
"encryption=(secretkey=),"
"exclusive=,"
"log=(recover=),"
- "use_environment_priv=,", &base_config));
+ "use_environment_priv=,"
+ "verbose=,", &base_config));
WT_ERR(__wt_config_init(session, &parser, base_config));
while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
/* Fix quoting for non-trivial settings. */
@@ -1795,6 +1795,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_DECL_RET;
const WT_NAME_FLAG *ft;
WT_SESSION_IMPL *session;
+ int64_t config_base_set;
const char *enc_cfg[] = { NULL, NULL };
char version[64];
@@ -1836,6 +1837,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
cfg[1] = config;
+	/* Capture the config_base setting for later use. */
+ WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
+ config_base_set = cval.val;
+
/* Configure error messages so we get them right early. */
WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
if (cval.len != 0)
@@ -1873,7 +1878,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR) >=
(int)sizeof(version), ENOMEM);
__conn_config_append(cfg, version);
- WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, i1));
+
+	/* Ignore the base configuration file if config_base is set to false. */
+ if (config_base_set != 0)
+ WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, i1));
__conn_config_append(cfg, config);
WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, i2));
WT_ERR(__conn_config_env(session, cfg, i3));
@@ -1904,7 +1912,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
conn->hazard_max = (uint32_t)cval.val;
WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
- conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+ conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS;
WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval));
conn->session_scratch_max = (size_t)cval.val;
@@ -2023,11 +2031,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_turtle_init(session));
WT_ERR(__wt_metadata_open(session));
- /*
- * Start the worker threads last.
- */
+ /* Start the worker threads and run recovery. */
WT_ERR(__wt_connection_workers(session, cfg));
+ /* Create the lookaside table. */
+ WT_ERR(__wt_las_create(session));
+
WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
*wt_connp = &conn->iface;
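
With the wiredtiger_open changes above, the config_base value is read before any configuration files are parsed, so the WiredTiger.basecfg file can be skipped entirely. Assuming the public API, usage is simply (a sketch; the home directory is a placeholder):

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;

	/* Open without reading the base configuration file. */
	if (wiredtiger_open("WT_HOME", NULL,
	    "create,config_base=false", &conn) != 0)
		return (EXIT_FAILURE);

	return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
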
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index d62425fe536..8f62c7140c7 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -156,7 +156,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
/* Allocate the LRU eviction queue. */
cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
- WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));
+ WT_ERR(__wt_calloc_def(session,
+ cache->evict_slots, &cache->evict_queue));
/*
* We get/set some values in the cache statistics (rather than have
@@ -178,12 +179,12 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS *stats;
+ WT_CONNECTION_STATS **stats;
uint64_t inuse, leaf, used;
conn = S2C(session);
cache = conn->cache;
- stats = &conn->stats;
+ stats = conn->stats;
inuse = __wt_cache_bytes_inuse(cache);
/*
@@ -193,19 +194,23 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
used = cache->bytes_overflow + cache->bytes_internal;
leaf = inuse > used ? inuse - used : 0;
- WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
- WT_STAT_SET(stats, cache_bytes_inuse, inuse);
+ WT_STAT_SET(session, stats, cache_bytes_max, conn->cache_size);
+ WT_STAT_SET(session, stats, cache_bytes_inuse, inuse);
- WT_STAT_SET(stats, cache_overhead, cache->overhead_pct);
- WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
- WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache));
- WT_STAT_SET(stats,
+ WT_STAT_SET(session, stats, cache_overhead, cache->overhead_pct);
+ WT_STAT_SET(
+ session, stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(
+ session, stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache));
+ WT_STAT_SET(session, stats,
cache_eviction_maximum_page_size, cache->evict_max_page_size);
- WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);
+ WT_STAT_SET(session, stats, cache_pages_dirty, cache->pages_dirty);
- WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal);
- WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow);
- WT_STAT_SET(stats, cache_bytes_leaf, leaf);
+ WT_STAT_SET(
+ session, stats, cache_bytes_internal, cache->bytes_internal);
+ WT_STAT_SET(
+ session, stats, cache_bytes_overflow, cache->bytes_overflow);
+ WT_STAT_SET(session, stats, cache_bytes_leaf, leaf);
}
/*
@@ -246,7 +251,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
__wt_spin_destroy(session, &cache->evict_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
- __wt_free(session, cache->evict);
+ __wt_free(session, cache->evict_queue);
__wt_free(session, conn->cache);
return (ret);
}
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index fdc95a32387..aaae58ef168 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -22,21 +22,22 @@
*/
#define WT_CACHE_POOL_REDUCE_THRESHOLD 20
/* Balancing passes after a bump before a connection is a candidate. */
-#define WT_CACHE_POOL_BUMP_SKIPS 10
+#define WT_CACHE_POOL_BUMP_SKIPS 5
/* Balancing passes after a reduction before a connection is a candidate. */
-#define WT_CACHE_POOL_REDUCE_SKIPS 5
+#define WT_CACHE_POOL_REDUCE_SKIPS 10
/*
* Constants that control how much influence different metrics have on
* the pressure calculation.
*/
-#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 10
-#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 50
+#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3
+#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6
#define WT_CACHE_POOL_READ_MULTIPLIER 1
-static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *);
+static int __cache_pool_adjust(
+ WT_SESSION_IMPL *, uint64_t, uint64_t, int, int *);
static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *);
-static int __cache_pool_balance(WT_SESSION_IMPL *);
+static int __cache_pool_balance(WT_SESSION_IMPL *, int);
/*
* __wt_cache_pool_config --
@@ -51,7 +52,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
WT_DECL_RET;
char *pool_name;
int created, updating;
- uint64_t chunk, reserve, size, used_cache;
+ uint64_t chunk, quota, reserve, size, used_cache;
conn = S2C(session);
created = updating = 0;
@@ -142,6 +143,11 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
chunk = (uint64_t)cval.val;
else
chunk = cp->chunk;
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.quota", &cval) == 0 && cval.val != 0)
+ quota = (uint64_t)cval.val;
+ else
+ quota = cp->quota;
} else {
/*
* The only time shared cache configuration uses default
@@ -155,6 +161,9 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
session, cfg, "shared_cache.chunk", &cval));
WT_ASSERT(session, cval.val != 0);
chunk = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.quota", &cval));
+ quota = (uint64_t)cval.val;
}
/*
@@ -197,8 +206,10 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
/* The configuration is verified - it's safe to update the pool. */
cp->size = size;
cp->chunk = chunk;
+ cp->quota = quota;
conn->cache->cp_reserved = reserve;
+ conn->cache->cp_quota = quota;
/* Wake up the cache pool server so any changes are noticed. */
if (updating)
@@ -402,7 +413,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
* effectively used.
*/
static int
-__cache_pool_balance(WT_SESSION_IMPL *session)
+__cache_pool_balance(WT_SESSION_IMPL *session, int forward)
{
WT_CACHE_POOL *cp;
WT_DECL_RET;
@@ -421,16 +432,16 @@ __cache_pool_balance(WT_SESSION_IMPL *session)
WT_ERR(__cache_pool_assess(session, &highest));
bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+
/*
* Actively attempt to:
* - Reduce the amount allocated, if we are over the budget
* - Increase the amount used if there is capacity and any pressure.
*/
- for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
- F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
- F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) {
+ while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
WT_ERR(__cache_pool_adjust(
- session, highest, bump_threshold, &adjusted));
+ session, highest, bump_threshold, forward, &adjusted));
/*
* Stop if the amount of cache being used is stable, and we
* aren't over capacity.
@@ -456,30 +467,39 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
WT_CACHE *cache;
WT_CONNECTION_IMPL *entry;
uint64_t app_evicts, app_waits, reads;
- uint64_t entries, highest, tmp;
+ uint64_t balanced_size, entries, highest, tmp;
cp = __wt_process.cache_pool;
- entries = 0;
+ balanced_size = entries = 0;
highest = 1; /* Avoid divide by zero */
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ if (entry->cache_size == 0 || entry->cache == NULL)
+ continue;
+ ++entries;
+ }
+
+ if (entries > 0)
+ balanced_size = cp->currently_used / entries;
+
/* Generate read pressure information. */
TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
- if (entry->cache_size == 0 ||
- entry->cache == NULL)
+ if (entry->cache_size == 0 || entry->cache == NULL)
continue;
cache = entry->cache;
- ++entries;
/*
* Figure out a delta since the last time we did an assessment
* for each metric we are tracking. Watch out for wrapping
* of values.
+ *
+ * Count pages read, assuming pages are 4KB.
*/
- tmp = cache->bytes_read;
+ tmp = cache->bytes_read >> 12;
if (tmp >= cache->cp_saved_read)
reads = tmp - cache->cp_saved_read;
else
- reads = (UINT64_MAX - cache->cp_saved_read) + tmp;
+ reads = tmp;
cache->cp_saved_read = tmp;
/* Update the application eviction count information */
@@ -500,12 +520,19 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
(UINT64_MAX - cache->cp_saved_app_waits) + tmp;
cache->cp_saved_app_waits = tmp;
- /* Calculate the weighted pressure for this member */
- cache->cp_pass_pressure =
- (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
+ /* Calculate the weighted pressure for this member. */
+ tmp = (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
(app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) +
(reads * WT_CACHE_POOL_READ_MULTIPLIER);
+ /* Weight smaller caches higher. */
+ tmp = (uint64_t)(tmp *
+ ((double)balanced_size / entry->cache_size));
+
+ /* Smooth over history. */
+ cache->cp_pass_pressure =
+ (9 * cache->cp_pass_pressure + tmp) / 10;
+
if (cache->cp_pass_pressure > highest)
highest = cache->cp_pass_pressure;
@@ -524,24 +551,25 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
/*
* __cache_pool_adjust --
- * Adjust the allocation of cache to each connection. If force is set
+ *	Adjust the allocation of cache to each connection. If full is set,
 *	ignore cache load information and reduce the allocation for every
 *	connection allocated more than its reserved size.
*/
static int
__cache_pool_adjust(WT_SESSION_IMPL *session,
- uint64_t highest, uint64_t bump_threshold, int *adjustedp)
+ uint64_t highest, uint64_t bump_threshold, int forward, int *adjustedp)
{
WT_CACHE_POOL *cp;
WT_CACHE *cache;
WT_CONNECTION_IMPL *entry;
- uint64_t adjusted, highest_percentile, pressure, reserved;
- int force, grew;
+ uint64_t adjustment, highest_percentile, pressure, reserved, smallest;
+ int busy, pool_full, grow;
+ u_int pct_full;
*adjustedp = 0;
cp = __wt_process.cache_pool;
- force = (cp->currently_used > cp->size);
- grew = 0;
+ grow = 0;
+ pool_full = (cp->currently_used >= cp->size);
/* Highest as a percentage, avoid 0 */
highest_percentile = (highest / 100) + 1;
@@ -549,13 +577,17 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
WT_RET(__wt_verbose(session,
WT_VERB_SHARED_CACHE, "Cache pool distribution: "));
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
- "\t" "cache_size, pressure, skips: "));
+ "\t" "cache (MB), pressure, skips, busy, %% full:"));
}
- TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ for (entry = forward ? TAILQ_FIRST(&cp->cache_pool_qh) :
+ TAILQ_LAST(&cp->cache_pool_qh, __wt_cache_pool_qh);
+ entry != NULL;
+ entry = forward ? TAILQ_NEXT(entry, cpq) :
+ TAILQ_PREV(entry, __wt_cache_pool_qh, cpq)) {
cache = entry->cache;
reserved = cache->cp_reserved;
- adjusted = 0;
+ adjustment = 0;
/*
* The read pressure is calculated as a percentage of how
@@ -565,84 +597,109 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
* assigned.
*/
pressure = cache->cp_pass_pressure / highest_percentile;
+ busy = __wt_eviction_needed(entry->default_session, &pct_full);
+
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
- "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32,
- entry->cache_size, pressure, cache->cp_skip_count));
+ "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2u",
+ entry->cache_size >> 20, pressure, cache->cp_skip_count,
+ busy, pct_full));
/* Allow to stabilize after changes. */
if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0)
continue;
+
/*
* If the entry is currently allocated less than the reserved
- * size, increase it's allocation. This should only happen if:
- * - It's the first time we've seen this member
- * - The reserved size has been adjusted
+ * size, increase its allocation. This should only happen if:
+ * - it's the first time we've seen this member, or
+ * - the reserved size has been adjusted
*/
if (entry->cache_size < reserved) {
- grew = 1;
- adjusted = reserved - entry->cache_size;
-
+ grow = 1;
+ adjustment = reserved - entry->cache_size;
/*
* Conditions for reducing the amount of resources for an
* entry:
- * - If we are forcing and this entry has more than the
- * minimum amount of space in use.
- * - If the read pressure in this entry is below the
- * threshold, other entries need more cache, the entry has
- * more than the minimum space and there is no available
- * space in the pool.
+ * - the pool is full,
+ * - application threads are not busy doing eviction already,
+ * - this entry has more than the minimum amount of space in
+ * use,
+ * - the read pressure in this entry is below the threshold,
+ * other entries need more cache, the entry has more than
+ * the minimum space and there is no available space in the
+ * pool.
*/
- } else if ((force && entry->cache_size > reserved) ||
- (pressure < WT_CACHE_POOL_REDUCE_THRESHOLD &&
- highest > 1 && entry->cache_size > reserved &&
- cp->currently_used >= cp->size)) {
- grew = 0;
+ } else if (pool_full && !busy &&
+ entry->cache_size > reserved &&
+ pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) {
+ grow = 0;
/*
- * Shrink by a chunk size if that doesn't drop us
- * below the reserved size.
+			 * Don't drop the size down too much, or it can
+ * trigger aggressive eviction in the connection,
+ * which is likely to lead to lower throughput and
+ * potentially a negative feedback loop in the
+ * balance algorithm.
*/
- if (entry->cache_size > cp->chunk + reserved)
- adjusted = cp->chunk;
- else
- adjusted = entry->cache_size - reserved;
+ smallest = (100 * __wt_cache_bytes_inuse(cache)) /
+ cache->eviction_trigger;
+ if (entry->cache_size > smallest)
+ adjustment = WT_MIN(cp->chunk,
+ (entry->cache_size - smallest) / 2);
+ adjustment =
+ WT_MIN(adjustment, entry->cache_size - reserved);
/*
* Conditions for increasing the amount of resources for an
* entry:
- * - There was some activity across the pool
- * - This entry is using less than the entire cache pool
- * - The connection is using enough cache to require eviction
- * - There is space available in the pool
- * - Additional cache would benefit the connection OR
- * - The pool is less than half distributed
+ * - there is space available in the pool
+ * - the connection isn't over quota
+ * - the connection is using enough cache to require eviction
+ * - there was some activity across the pool
+ * - this entry is using less than the entire cache pool
+ * - additional cache would benefit the connection OR
+ * - the pool is less than half distributed
*/
- } else if (entry->cache_size < cp->size &&
+ } else if (!pool_full &&
+ (cache->cp_quota == 0 ||
+ entry->cache_size < cache->cp_quota) &&
__wt_cache_bytes_inuse(cache) >=
(entry->cache_size * cache->eviction_target) / 100 &&
- ((cp->currently_used < cp->size &&
- pressure > bump_threshold) ||
+ (pressure > bump_threshold ||
cp->currently_used < cp->size * 0.5)) {
- grew = 1;
- adjusted = WT_MIN(cp->chunk,
- cp->size - cp->currently_used);
+ grow = 1;
+ adjustment = WT_MIN(WT_MIN(cp->chunk,
+ cp->size - cp->currently_used),
+ cache->cp_quota - entry->cache_size);
}
- if (adjusted > 0) {
+ /*
+ * Bounds checking: don't go over the pool size or under the
+ * reserved size for this cache.
+ *
+ * Shrink by a chunk size if that doesn't drop us
+ * below the reserved size.
+ *
+ * Limit the reduction to half of the free space in the
+ * connection's cache. This should reduce cache sizes
+ * gradually without stalling application threads.
+ */
+ if (adjustment > 0) {
*adjustedp = 1;
- if (grew > 0) {
+ if (grow) {
cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS;
- entry->cache_size += adjusted;
- cp->currently_used += adjusted;
+ entry->cache_size += adjustment;
+ cp->currently_used += adjustment;
} else {
cache->cp_skip_count =
WT_CACHE_POOL_REDUCE_SKIPS;
WT_ASSERT(session,
- entry->cache_size >= adjusted &&
- cp->currently_used >= adjusted);
- entry->cache_size -= adjusted;
- cp->currently_used -= adjusted;
+ entry->cache_size >= adjustment &&
+ cp->currently_used >= adjustment);
+ entry->cache_size -= adjustment;
+ cp->currently_used -= adjustment;
}
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
"Allocated %s%" PRId64 " to %s",
- grew ? "" : "-", adjusted, entry->home));
+ grow ? "" : "-", adjustment, entry->home));
+
/*
* TODO: Add a loop waiting for connection to give up
* cache.
@@ -663,11 +720,13 @@ __wt_cache_pool_server(void *arg)
WT_CACHE_POOL *cp;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ int forward;
session = (WT_SESSION_IMPL *)arg;
cp = __wt_process.cache_pool;
cache = S2C(session)->cache;
+ forward = 1;
while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
@@ -695,8 +754,10 @@ __wt_cache_pool_server(void *arg)
* Continue even if there was an error. Details of errors are
* reported in the balance function.
*/
- if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
- (void)__cache_pool_balance(session);
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
+ (void)__cache_pool_balance(session, forward);
+ forward = !forward;
+ }
}
if (0) {
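
The reworked assessment combines three weighted deltas (application evictions, application waits, pages read), scales by relative cache size so smaller caches score higher, then smooths with a 9:1 exponential moving average. A condensed sketch of one member's computation (multipliers as in the diff; function and parameter names illustrative):

#include <stdint.h>

#define APP_EVICT_MULT	3
#define APP_WAIT_MULT	6
#define READ_MULT	1	/* Pages read, assuming 4KB pages. */

/* Sketch: one cache-pool member's smoothed pressure for this pass. */
static uint64_t
pass_pressure(uint64_t evicts, uint64_t waits, uint64_t reads,
    uint64_t balanced_size, uint64_t cache_size, uint64_t prev)
{
	uint64_t raw;

	raw = evicts * APP_EVICT_MULT +
	    waits * APP_WAIT_MULT + reads * READ_MULT;

	/* Weight smaller caches higher. */
	raw = (uint64_t)((double)raw *
	    ((double)balanced_size / (double)cache_size));

	/* Smooth over history: 90% previous value, 10% new. */
	return ((9 * prev + raw) / 10);
}
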
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 76f55fa44e5..92497484408 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -55,6 +55,8 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session,
WT_ERR(__wt_spin_init(
session, &dhandle->close_lock, "data handle close"));
+ __wt_stat_dsrc_init(dhandle);
+
*dhandlep = dhandle;
return (0);
@@ -81,7 +83,7 @@ __wt_conn_dhandle_find(
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
if (checkpoint == NULL) {
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
if (dhandle->checkpoint == NULL &&
@@ -91,7 +93,7 @@ __wt_conn_dhandle_find(
}
}
} else
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
if (dhandle->checkpoint != NULL &&
@@ -404,7 +406,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session,
if (uri != NULL) {
bucket =
__wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl)
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq)
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
!F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
strcmp(uri, dhandle->name) == 0 &&
@@ -412,7 +414,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session,
WT_RET(__conn_btree_apply_internal(
session, dhandle, func, cfg));
} else {
- SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
!F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
(apply_checkpoints ||
@@ -489,7 +491,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
hash = __wt_hash_city64(uri, strlen(uri));
bucket = hash % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl)
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq)
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
!F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
(hash == dhandle->name_hash &&
@@ -538,7 +540,7 @@ __wt_conn_dhandle_close_all(
WT_ASSERT(session, session->dhandle == NULL);
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
if (strcmp(dhandle->name, uri) != 0 ||
F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
@@ -596,6 +598,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final)
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session, dhandle != conn->cache->evict_file_next);
/* Check if the handle was reacquired by a session while we waited. */
if (!final &&
@@ -675,7 +678,7 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
* the list, so we do it the hard way.
*/
restart:
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (WT_IS_METADATA(dhandle))
continue;
@@ -694,7 +697,7 @@ restart:
F_SET(session, WT_SESSION_NO_DATA_HANDLES);
/* Close the metadata file handle. */
- while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL)
+ while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL)
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(session, 1, 0)));
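
The data-handle lists here move from SLIST to TAILQ, but the lookup shape is unchanged: hash the URI into a bucket, then walk only that bucket's queue. A minimal sketch with <sys/queue.h> (bucket count and names illustrative; buckets assumed already initialized with TAILQ_INIT):

#include <stdint.h>
#include <string.h>
#include <sys/queue.h>

#define HASH_SIZE	512

struct dhandle {
	const char *name;
	TAILQ_ENTRY(dhandle) hashq;	/* Per-bucket linkage. */
};
TAILQ_HEAD(dh_bucket, dhandle);

static struct dh_bucket dhhash[HASH_SIZE];

/* Sketch: find a handle by name, touching only one hash bucket. */
static struct dhandle *
dhandle_find(const char *uri, uint64_t hash)
{
	struct dhandle *dhandle;

	TAILQ_FOREACH(dhandle, &dhhash[hash % HASH_SIZE], hashq)
		if (strcmp(dhandle->name, uri) == 0)
			return (dhandle);
	return (NULL);
}
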
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 94e69897c1d..7a8a6cba838 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -21,14 +21,14 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
session = conn->default_session;
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) {
- SLIST_INIT(&conn->dhhash[i]); /* Data handle hash lists */
- SLIST_INIT(&conn->fhhash[i]); /* File handle hash lists */
+ TAILQ_INIT(&conn->dhhash[i]); /* Data handle hash lists */
+ TAILQ_INIT(&conn->fhhash[i]); /* File handle hash lists */
}
- SLIST_INIT(&conn->dhlh); /* Data handle list */
+ TAILQ_INIT(&conn->dhqh); /* Data handle list */
TAILQ_INIT(&conn->dlhqh); /* Library list */
TAILQ_INIT(&conn->dsrcqh); /* Data source list */
- SLIST_INIT(&conn->fhlh); /* File list */
+ TAILQ_INIT(&conn->fhqh); /* File list */
TAILQ_INIT(&conn->collqh); /* Collator list */
TAILQ_INIT(&conn->compqh); /* Compressor list */
TAILQ_INIT(&conn->encryptqh); /* Encryptor list */
@@ -45,7 +45,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_conn_config_init(session));
/* Statistics. */
- __wt_stat_init_connection_stats(&conn->stats);
+ __wt_stat_connection_init(conn);
/* Locks. */
WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
@@ -55,11 +55,14 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
WT_RET(__wt_rwlock_alloc(session,
&conn->hot_backup_lock, "hot backup"));
+ WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table"));
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation"));
- WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock));
- for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+
+ WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock);
+ for (i = 0; i < WT_PAGE_LOCKS; ++i)
WT_RET(
__wt_spin_init(session, &conn->page_lock[i], "btree page"));
@@ -91,8 +94,8 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
*/
WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager"));
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++)
- SLIST_INIT(&conn->blockhash[i]);/* Block handle hash lists */
- SLIST_INIT(&conn->blocklh); /* Block manager list */
+ TAILQ_INIT(&conn->blockhash[i]);/* Block handle hash lists */
+ TAILQ_INIT(&conn->blockqh); /* Block manager list */
return (0);
}
@@ -138,10 +141,11 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->encryptor_lock);
__wt_spin_destroy(session, &conn->fh_lock);
WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock));
+ __wt_spin_destroy(session, &conn->las_lock);
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
__wt_spin_destroy(session, &conn->table_lock);
- for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ for (i = 0; i < WT_PAGE_LOCKS; ++i)
__wt_spin_destroy(session, &conn->page_lock[i]);
__wt_free(session, conn->page_lock);
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index de4bf7268ed..2b115190b06 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -287,8 +287,9 @@ __log_file_server(void *arg)
WT_DECL_RET;
WT_FH *close_fh;
WT_LOG *log;
- WT_LSN close_end_lsn, close_lsn, min_lsn;
+ WT_LSN close_end_lsn, min_lsn;
WT_SESSION_IMPL *session;
+ uint32_t filenum;
int locked;
session = arg;
@@ -300,66 +301,97 @@ __log_file_server(void *arg)
* If there is a log file to close, make sure any outstanding
* write operations have completed, then fsync and close it.
*/
- if ((close_fh = log->log_close_fh) != NULL &&
- (ret = __wt_log_extract_lognum(session, close_fh->name,
- &close_lsn.file)) == 0 &&
- close_lsn.file < log->write_lsn.file) {
+ if ((close_fh = log->log_close_fh) != NULL) {
+ WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
+ &filenum));
/*
- * We've copied the file handle, clear out the one in
- * log structure to allow it to be set again.
+ * We update the close file handle before updating the
+ * close LSN when changing files. It is possible we
+			 * could see mismatched settings. If we do, yield until
+			 * the close LSN catches up. This should rarely happen.
*/
- log->log_close_fh = NULL;
- /*
- * Set the close_end_lsn to the LSN immediately after
- * ours. That is, the beginning of the next log file.
- * We need to know the LSN file number of our own close
- * in case earlier calls are still in progress and the
- * next one to move the sync_lsn into the next file for
- * later syncs.
- */
- close_lsn.offset = 0;
- close_end_lsn = close_lsn;
- close_end_lsn.file++;
- WT_ERR(__wt_fsync(session, close_fh));
- __wt_spin_lock(session, &log->log_sync_lock);
- locked = 1;
- WT_ERR(__wt_close(session, &close_fh));
- WT_ASSERT(session,
- WT_LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0);
- log->sync_lsn = close_end_lsn;
- WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
- locked = 0;
- __wt_spin_unlock(session, &log->log_sync_lock);
+ while (log->log_close_lsn.file < filenum)
+ __wt_yield();
+
+ if (__wt_log_cmp(
+ &log->write_lsn, &log->log_close_lsn) >= 0) {
+ /*
+ * We've copied the file handle, clear out the
+ * one in the log structure to allow it to be
+ * set again. Copy the LSN before clearing
+ * the file handle.
+ * Use a barrier to make sure the compiler does
+ * not reorder the following two statements.
+ */
+ close_end_lsn = log->log_close_lsn;
+ WT_FULL_BARRIER();
+ log->log_close_fh = NULL;
+ /*
+ * Set the close_end_lsn to the LSN immediately
+ * after ours. That is, the beginning of the
+ * next log file. We need to know the LSN
+ * file number of our own close in case earlier
+ * calls are still in progress and the next one
+ * to move the sync_lsn into the next file for
+ * later syncs.
+ */
+ close_end_lsn.file++;
+ close_end_lsn.offset = 0;
+ WT_ERR(__wt_fsync(session, close_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ WT_ERR(__wt_close(session, &close_fh));
+ WT_ASSERT(session, __wt_log_cmp(
+ &close_end_lsn, &log->sync_lsn) >= 0);
+ log->sync_lsn = close_end_lsn;
+ WT_ERR(__wt_cond_signal(
+ session, log->log_sync_cond));
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ }
}
/*
* If a later thread asked for a background sync, do it now.
*/
- if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
+ if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
/*
* Save the latest write LSN which is the minimum
* we will have written to disk.
*/
min_lsn = log->write_lsn;
/*
- * The sync LSN we asked for better be smaller than
- * the current written LSN.
+ * We have to wait until the LSN we asked for is
+			 * written. If it isn't, signal the wrlsn thread
+ * to get it written.
*/
- WT_ASSERT(session,
- WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0);
- WT_ERR(__wt_fsync(session, log->log_fh));
- __wt_spin_lock(session, &log->log_sync_lock);
- locked = 1;
- /*
- * The sync LSN could have advanced while we were
- * writing to disk.
- */
- if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) {
- log->sync_lsn = min_lsn;
+ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ /*
+ * The sync LSN could have advanced while we
+ * were writing to disk.
+ */
+ if (__wt_log_cmp(
+ &log->sync_lsn, &min_lsn) <= 0) {
+ log->sync_lsn = min_lsn;
+ WT_ERR(__wt_cond_signal(
+ session, log->log_sync_cond));
+ }
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ } else {
WT_ERR(__wt_cond_signal(
- session, log->log_sync_cond));
+ session, conn->log_wrlsn_cond));
+ /*
+ * We do not want to wait potentially a second
+ * to process this. Yield to give the wrlsn
+ * thread a chance to run and try again in
+ * this case.
+ */
+ __wt_yield();
+ continue;
}
- locked = 0;
- __wt_spin_unlock(session, &log->log_sync_lock);
}
/* Wait until the next event. */
WT_ERR(__wt_cond_wait(
@@ -394,26 +426,29 @@ typedef struct {
/*
* __wt_log_wrlsn --
* Process written log slots and attempt to coalesce them if the LSNs
- * are contiguous. Returns 1 if slots were freed, 0 if no slots were
- * freed in the progress arg. Must be called with the log slot lock held.
+ * are contiguous. The purpose of this function is to advance the
+ * write_lsn in LSN order after the buffer is written to the log file.
*/
int
-__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
+__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
WT_LOGSLOT *coalescing, *slot;
+ WT_LSN save_lsn;
size_t written_i;
uint32_t i, save_i;
conn = S2C(session);
log = conn->log;
+ __wt_spin_lock(session, &log->log_writelsn_lock);
+restart:
coalescing = NULL;
+ WT_INIT_LSN(&save_lsn);
written_i = 0;
i = 0;
- if (free_i != NULL)
- *free_i = WT_SLOT_POOL;
/*
* Walk the array once saving any slots that are in the
@@ -422,9 +457,14 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
while (i < WT_SLOT_POOL) {
save_i = i;
slot = &log->slot_pool[i++];
- if (free_i != NULL && *free_i == WT_SLOT_POOL &&
- slot->slot_state == WT_LOG_SLOT_FREE)
- *free_i = save_i;
+ /*
+ * XXX - During debugging I saw slot 0 become orphaned.
+ * I believe it is fixed, but check for now.
+ * This assertion should catch that.
+ */
+ if (slot->slot_state == 0)
+ WT_ASSERT(session,
+ slot->slot_release_lsn.file >= log->write_lsn.file);
if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
continue;
written[written_i].slot_index = save_i;
@@ -435,15 +475,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* based on the release LSN, and then look for them in order.
*/
if (written_i > 0) {
- /*
- * If wanted, reset the yield variable to indicate that we
- * have found written slots.
- */
- if (yield != NULL)
- *yield = 0;
WT_INSERTION_SORT(written, written_i,
WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
-
/*
* We know the written array is sorted by LSN. Go
* through them either advancing write_lsn or coalesce
@@ -451,8 +484,28 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
*/
for (i = 0; i < written_i; i++) {
slot = &log->slot_pool[written[i].slot_index];
+ /*
+ * The log server thread pushes out slots periodically.
+ * Sometimes they are empty slots. If we find an
+ * empty slot, where empty means the start and end LSN
+ * are the same, free it and continue.
+ */
+ if (__wt_log_cmp(&slot->slot_start_lsn,
+ &slot->slot_release_lsn) == 0 &&
+ __wt_log_cmp(&slot->slot_start_lsn,
+ &slot->slot_end_lsn) == 0) {
+ __wt_log_slot_free(session, slot);
+ continue;
+ }
if (coalescing != NULL) {
- if (WT_LOG_CMP(&coalescing->slot_end_lsn,
+ /*
+ * If the write_lsn changed, we may be able to
+ * process slots. Try again.
+ */
+ if (__wt_log_cmp(
+ &log->write_lsn, &save_lsn) != 0)
+ goto restart;
+ if (__wt_log_cmp(&coalescing->slot_end_lsn,
&written[i].lsn) != 0) {
coalescing = slot;
continue;
@@ -461,6 +514,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* If we get here we have a slot to coalesce
* and free.
*/
+ coalescing->slot_last_offset =
+ slot->slot_last_offset;
coalescing->slot_end_lsn = slot->slot_end_lsn;
WT_STAT_FAST_CONN_INCR(
session, log_slot_coalesced);
@@ -473,8 +528,12 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
/*
* If this written slot is not the next LSN,
* try to start coalescing with later slots.
+ * A synchronous write may update write_lsn
+ * so save the last one we saw to check when
+ * coalescing slots.
*/
- if (WT_LOG_CMP(
+ save_lsn = log->write_lsn;
+ if (__wt_log_cmp(
&log->write_lsn, &written[i].lsn) != 0) {
coalescing = slot;
continue;
@@ -483,27 +542,29 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* If we get here we have a slot to process.
* Advance the LSN and process the slot.
*/
- WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
+ WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
&slot->slot_release_lsn) == 0);
+ if (slot->slot_start_lsn.offset !=
+ slot->slot_last_offset)
+ slot->slot_start_lsn.offset =
+ slot->slot_last_offset;
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
- WT_RET(__wt_cond_signal(
+ WT_ERR(__wt_cond_signal(
session, log->log_write_cond));
WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
/*
* Signal the close thread if needed.
*/
if (F_ISSET(slot, WT_SLOT_CLOSEFH))
- WT_RET(__wt_cond_signal(
+ WT_ERR(__wt_cond_signal(
session, conn->log_file_cond));
}
- WT_RET(__wt_log_slot_free(session, slot));
- if (free_i != NULL && *free_i == WT_SLOT_POOL &&
- slot->slot_state == WT_LOG_SLOT_FREE)
- *free_i = save_i;
+ __wt_log_slot_free(session, slot);
}
}
- return (0);
+err: __wt_spin_unlock(session, &log->log_writelsn_lock);
+ return (ret);
}
/*
@@ -515,31 +576,26 @@ __log_wrlsn_server(void *arg)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_LOG *log;
WT_SESSION_IMPL *session;
- int locked, yield;
session = arg;
conn = S2C(session);
- log = conn->log;
- locked = yield = 0;
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
- __wt_spin_lock(session, &log->log_slot_lock);
- locked = 1;
- WT_ERR(__wt_log_wrlsn(session, NULL, &yield));
- locked = 0;
- __wt_spin_unlock(session, &log->log_slot_lock);
- if (++yield < 1000)
- __wt_yield();
- else
- WT_ERR(__wt_cond_wait(session,
- conn->log_wrlsn_cond, 100000));
+ /*
+ * Write out any log record buffers.
+ */
+ WT_ERR(__wt_log_wrlsn(session));
+ WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000));
}
+ /*
+ * On close, do this one more time: there may be straggling
+ * log records still sitting in the buffers.
+ */
+ WT_ERR(__wt_log_force_write(session, 1));
+ WT_ERR(__wt_log_wrlsn(session));
if (0) {
err: __wt_err(session, ret, "log wrlsn server error");
}
- if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
return (WT_THREAD_RET_VALUE);
}
@@ -554,44 +610,81 @@ __log_server(void *arg)
WT_DECL_RET;
WT_LOG *log;
WT_SESSION_IMPL *session;
- u_int locked;
+ int freq_per_sec, signalled;
session = arg;
conn = S2C(session);
log = conn->log;
- locked = 0;
+ signalled = 0;
+
+ /*
+ * Set this to the number of times per second we want to force out the
+ * log slot buffer.
+ */
+#define WT_FORCE_PER_SECOND 20
+ freq_per_sec = WT_FORCE_PER_SECOND;
+
+ /*
+ * The log server thread does a variety of work: it forces out any
+ * buffered log writes, pre-allocates log files and performs log
+ * archiving. The wrlsn thread does not force out buffered writes
+ * because we want it to process and advance the write_lsn as
+ * quickly as possible. Nor does the log file server thread force
+ * out the writes: it makes fsync calls that can take a long time,
+ * and we don't want log records sitting in the buffer for as long
+ * as it takes to sync out an earlier file.
+ */
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
- * Perform log pre-allocation.
+ * Slots depend on future activity. Force out buffered
+ * writes in case we are idle. This cannot be part of the
+ * wrlsn thread because it interacts with advancing the
+ * write_lsn: a synchronous buffer may need to wait for the
+ * write_lsn to advance, and we would end up with a hang.
*/
- if (conn->log_prealloc > 0)
- WT_ERR(__log_prealloc_once(session));
+ WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));
/*
- * Perform the archive.
+ * We don't want to archive or pre-allocate files as often as
+ * we want to force out log buffers. Only do it once per second
+ * or if the condition was signalled.
*/
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
- if (__wt_try_writelock(
- session, log->log_archive_lock) == 0) {
- locked = 1;
- WT_ERR(__log_archive_once(session, 0));
- WT_ERR( __wt_writeunlock(
- session, log->log_archive_lock));
- locked = 0;
- } else
- WT_ERR(__wt_verbose(session, WT_VERB_LOG,
- "log_archive: Blocked due to open log "
- "cursor holding archive lock"));
+ if (--freq_per_sec <= 0 || signalled != 0) {
+ freq_per_sec = WT_FORCE_PER_SECOND;
+
+ /*
+ * Perform log pre-allocation.
+ */
+ if (conn->log_prealloc > 0)
+ WT_ERR(__log_prealloc_once(session));
+
+ /*
+ * Perform the archive.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
+ if (__wt_try_writelock(
+ session, log->log_archive_lock) == 0) {
+ ret = __log_archive_once(session, 0);
+ WT_TRET(__wt_writeunlock(
+ session, log->log_archive_lock));
+ WT_ERR(ret);
+ } else
+ WT_ERR(
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_archive: Blocked due to open "
+ "log cursor holding archive lock"));
+ }
}
+
/* Wait until the next event. */
- WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION));
+ WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
+ WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
}
if (0) {
err: __wt_err(session, ret, "log server error");
}
- if (locked)
- (void)__wt_writeunlock(session, log->log_archive_lock);
return (WT_THREAD_RET_VALUE);
}
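[Editor's note: the reworked __log_server loop wakes roughly WT_FORCE_PER_SECOND times per second. Buffered log writes are forced out on every wakeup, while the more expensive pre-allocation and archiving run only when a countdown expires (about once per second) or the condition was signalled. A standalone sketch of that countdown pattern follows, with hypothetical stand-ins for the real work and the real timed wait.]

	#include <stdbool.h>
	#include <stdio.h>

	/* How many times per second the loop forces out buffered writes. */
	#define FORCE_PER_SECOND 20

	/* Hypothetical stand-ins for the real work and the timed wait. */
	static void force_buffered_writes(void) { printf("force\n"); }
	static void prealloc_and_archive(void) { printf("archive\n"); }
	static bool wait_ms_or_signal(unsigned ms) { (void)ms; return (false); }

	int
	main(void)
	{
		bool signalled;
		int countdown, i;

		signalled = false;
		countdown = FORCE_PER_SECOND;
		for (i = 0; i < 40; ++i) {	/* stands in for the server loop */
			/* Cheap, frequent work: push out buffered log writes. */
			force_buffered_writes();

			/*
			 * Expensive, infrequent work: run it when the countdown
			 * expires (about once per second) or when signalled.
			 */
			if (--countdown <= 0 || signalled) {
				countdown = FORCE_PER_SECOND;
				prealloc_and_archive();
			}

			/* Sleep a fraction of a second, noting any signal. */
			signalled = wait_ms_or_signal(1000 / FORCE_PER_SECOND);
		}
		return (0);
	}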
@@ -624,6 +717,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
+ WT_RET(__wt_spin_init(session, &log->log_writelsn_lock,
+ "log write LSN"));
WT_RET(__wt_rwlock_alloc(session,
&log->log_archive_lock, "log archive lock"));
if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
@@ -755,13 +850,11 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->log_tid));
conn->log_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
if (conn->log_file_tid_set) {
WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
WT_TRET(__wt_thread_join(session, conn->log_file_tid));
conn->log_file_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
if (conn->log_file_session != NULL) {
wt_session = &conn->log_file_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
@@ -772,13 +865,13 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
conn->log_wrlsn_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
if (conn->log_wrlsn_session != NULL) {
wt_session = &conn->log_wrlsn_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
conn->log_wrlsn_session = NULL;
}
+ WT_TRET(__wt_log_slot_destroy(session));
WT_TRET(__wt_log_close(session));
/* Close the server thread's session. */
@@ -788,13 +881,18 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_session = NULL;
}
- WT_TRET(__wt_log_slot_destroy(session));
+ /* Destroy the condition variables now that all threads are stopped. */
+ WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+
WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
__wt_spin_destroy(session, &conn->log->log_lock);
__wt_spin_destroy(session, &conn->log->log_slot_lock);
__wt_spin_destroy(session, &conn->log->log_sync_lock);
+ __wt_spin_destroy(session, &conn->log->log_writelsn_lock);
__wt_free(session, conn->log_path);
__wt_free(session, conn->log);
return (ret);
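[Editor's note: the __wt_logmgr_destroy reordering above moves the condition-variable destruction after every server thread has been signalled and joined; destroying a condition variable a thread may still be waiting on is undefined behavior. A minimal pthread sketch of the safe ordering follows, using hypothetical names rather than WiredTiger's wrappers.]

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static int running = 1;

	/* A server thread that sleeps on the condition until told to stop. */
	static void *
	server(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&lock);
		while (running)
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, server, NULL);

		/* Tell the thread to stop, wake it, and wait for it to exit. */
		pthread_mutex_lock(&lock);
		running = 0;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);
		pthread_join(tid, NULL);

		/*
		 * Only now is it safe to destroy the condition variable: no
		 * thread can still be blocked on it.
		 */
		pthread_cond_destroy(&cond);
		pthread_mutex_destroy(&lock);
		printf("clean shutdown\n");
		return (0);
	}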
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index c4350d90adb..8bc69bb3e80 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -30,6 +30,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
/* WT_SESSION_IMPL array. */
WT_RET(__wt_calloc(session,
conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->sessions);
/*
* Open the default session. We open this before starting service
@@ -110,14 +111,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
F_CLR(conn, WT_CONN_SERVER_RUN);
WT_TRET(__wt_async_destroy(session));
WT_TRET(__wt_lsm_manager_destroy(session));
+ WT_TRET(__wt_sweep_destroy(session));
F_SET(conn, WT_CONN_CLOSING);
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, 1));
- WT_TRET(__wt_sweep_destroy(session));
WT_TRET(__wt_evict_destroy(session));
+ /* Shut down the lookaside table after all eviction is complete. */
+ WT_TRET(__wt_las_destroy(session));
+
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
@@ -128,7 +132,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* conditional because we allocate the log path so that printlog can
* run without running logging or recovery.
*/
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
WT_TRET(__wt_txn_checkpoint_log(
session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
@@ -145,14 +150,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* Complain if files weren't closed, ignoring the lock file, we'll
* close it in a minute.
*/
- SLIST_FOREACH(fh, &conn->fhlh, l) {
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
if (fh == conn->lock_fh)
continue;
__wt_errx(session,
"Connection has open file handles: %s", fh->name);
WT_TRET(__wt_close(session, &fh));
- fh = SLIST_FIRST(&conn->fhlh);
+ fh = TAILQ_FIRST(&conn->fhqh);
}
/* Disconnect from shared cache - must be before cache destroy. */
@@ -236,9 +241,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
/* Run recovery. */
WT_RET(__wt_txn_recover(session));
- /*
- * Start the handle sweep thread.
- */
+ /* Start the handle sweep thread. */
WT_RET(__wt_sweep_create(session));
/* Start the optional async threads. */
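[Editor's note: the connection-close path above now walks the file-handle list as a TAILQ and, when it finds a leaked handle, closes it and restarts from the head rather than following a next pointer out of freed memory. A minimal sketch of that drain pattern using <sys/queue.h> follows; struct fh is hypothetical, and the real code also skips the lock file.]

	#include <stdio.h>
	#include <sys/queue.h>

	/* Hypothetical file handle kept on a connection-wide tail queue. */
	struct fh {
		const char *name;
		TAILQ_ENTRY(fh) q;
	};
	static TAILQ_HEAD(, fh) fhqh = TAILQ_HEAD_INITIALIZER(fhqh);

	int
	main(void)
	{
		struct fh a, b, *fh;

		a.name = "a.wt";
		b.name = "b.wt";
		TAILQ_INSERT_TAIL(&fhqh, &a, q);
		TAILQ_INSERT_TAIL(&fhqh, &b, q);

		/*
		 * Complain about and close any handles left open. Closing a
		 * handle removes it from the queue, so restart from the head
		 * each time instead of walking a freed element's next pointer.
		 */
		while ((fh = TAILQ_FIRST(&fhqh)) != NULL) {
			fprintf(stderr, "open file handle: %s\n", fh->name);
			TAILQ_REMOVE(&fhqh, fh, q);	/* stands in for close */
		}
		return (0);
	}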
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 9c438c01cd2..3b188bfd22a 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -42,11 +42,25 @@ __stat_sources_free(WT_SESSION_IMPL *session, char ***sources)
void
__wt_conn_stat_init(WT_SESSION_IMPL *session)
{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS **stats;
+
+ conn = S2C(session);
+ stats = conn->stats;
+
__wt_async_stats_update(session);
__wt_cache_stats_update(session);
+ __wt_las_stats_update(session);
__wt_txn_stats_update(session);
- WT_CONN_STAT(session, file_open) = S2C(session)->open_file_count;
+ WT_STAT_SET(session, stats, file_open, conn->open_file_count);
+ WT_STAT_SET(session,
+ stats, session_cursor_open, conn->open_cursor_count);
+ WT_STAT_SET(session, stats, dh_conn_handle_count, conn->dhandle_count);
+ WT_STAT_SET(session,
+ stats, rec_split_stashed_objects, conn->split_stashed_objects);
+ WT_STAT_SET(session,
+ stats, rec_split_stashed_bytes, conn->split_stashed_bytes);
}
/*
@@ -135,11 +149,11 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
{
WT_CONNECTION_IMPL *conn;
WT_CURSOR *cursor;
+ WT_CURSOR_STAT *cst;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_STATS *stats;
- u_int i;
- uint64_t max;
+ int64_t *stats;
+ int i;
const char *uri;
const char *cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
@@ -163,15 +177,14 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
*/
switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) {
case 0:
- max = conn_stats ?
- sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) :
- sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
- for (i = 0,
- stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats)
+ cst = (WT_CURSOR_STAT *)cursor;
+ for (stats = cst->stats, i = 0; i < cst->stats_count; ++i)
WT_ERR(__wt_fprintf(conn->stat_fp,
- "%s %" PRIu64 " %s %s\n",
- conn->stat_stamp,
- stats->v, name, stats->desc));
+ "%s %" PRId64 " %s %s\n",
+ conn->stat_stamp, stats[i],
+ name, conn_stats ?
+ __wt_stat_connection_desc(i) :
+ __wt_stat_dsrc_desc(i)));
WT_ERR(cursor->close(cursor));
break;
case EBUSY:
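[Editor's note: the statistics rework visible here replaces per-entry structures (value plus description) with a plain array of int64_t counters and a separate description lookup by index, which is what lets __statlog_dump print cst->stats[i] alongside __wt_stat_connection_desc(i) or __wt_stat_dsrc_desc(i). A minimal sketch of the layout follows, with hypothetical slot names and descriptions.]

	#include <inttypes.h>
	#include <stdio.h>

	/*
	 * Hypothetical statistics block: a bare array of int64_t counters,
	 * with descriptions in a parallel table looked up by slot index.
	 */
	#define STAT_FILE_OPEN		0
	#define STAT_CURSOR_OPEN	1
	#define STAT_COUNT		2

	static const char *
	stat_desc(int slot)
	{
		static const char * const descs[STAT_COUNT] = {
			"files currently open",
			"open cursor count",
		};
		return (descs[slot]);
	}

	int
	main(void)
	{
		int64_t stats[STAT_COUNT] = { 17, 42 };
		int i;

		/* Dump value and description, as a statistics log would. */
		for (i = 0; i < STAT_COUNT; ++i)
			printf("%" PRId64 " %s\n", stats[i], stat_desc(i));
		return (0);
	}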
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index ec6f628a02e..8da32416242 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -8,55 +8,58 @@
#include "wt_internal.h"
+#define WT_DHANDLE_CAN_DISCARD(dhandle) \
+ (!F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) && \
+ dhandle->session_inuse == 0 && dhandle->session_ref == 0)
+
/*
* __sweep_mark --
* Mark idle handles with a time of death, and note if we see dead
* handles.
*/
static int
-__sweep_mark(WT_SESSION_IMPL *session, int *dead_handlesp)
+__sweep_mark(WT_SESSION_IMPL *session, time_t now)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
- time_t now;
conn = S2C(session);
- *dead_handlesp = 0;
- /* Don't discard handles that have been open recently. */
- WT_RET(__wt_seconds(session, &now));
-
- WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (WT_IS_METADATA(dhandle))
continue;
- if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
- ++*dead_handlesp;
- continue;
- }
- if (dhandle->session_inuse != 0 ||
- now <= dhandle->timeofdeath + conn->sweep_idle_time ||
- conn->sweep_idle_time == 0)
- continue;
- if (dhandle->timeofdeath == 0) {
- dhandle->timeofdeath = now;
- WT_STAT_FAST_CONN_INCR(session, dh_conn_tod);
+
+ /*
+ * The in-use count is sometimes incremented internally, for
+ * example by eviction. Don't keep handles alive because of
+ * those cases, but if we see multiple cursors open, clear
+ * the time of death.
+ */
+ if (dhandle->session_inuse > 1)
+ dhandle->timeofdeath = 0;
+
+ /*
+ * If the handle is open exclusive or currently in use, or the
+ * time of death is already set, move on.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ||
+ dhandle->session_inuse > 0 ||
+ dhandle->timeofdeath != 0)
continue;
- }
- /* We now have a candidate to close. */
- ++*dead_handlesp;
+ dhandle->timeofdeath = now;
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_tod);
}
return (0);
}
/*
- * __sweep_expire_handle --
+ * __sweep_expire_one --
* Mark a single handle dead.
*/
static int
-__sweep_expire_handle(WT_SESSION_IMPL *session)
+__sweep_expire_one(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
@@ -113,42 +116,31 @@ err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
* until we have reached the configured minimum number of handles.
*/
static int
-__sweep_expire(WT_SESSION_IMPL *session)
+__sweep_expire(WT_SESSION_IMPL *session, time_t now)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- time_t now;
conn = S2C(session);
- /* If sweep_idle_time is 0, then we won't expire any cursors */
- if (conn->sweep_idle_time == 0)
- return (0);
-
- /* Don't discard handles that have been open recently. */
- WT_RET(__wt_seconds(session, &now));
-
- WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
/*
- * Ignore open files once the open file count reaches the
+ * Ignore open files once the btree file count is below the
* minimum number of handles.
*/
- if (conn->open_file_count < conn->sweep_handles_min)
+ if (conn->open_btree_count < conn->sweep_handles_min)
break;
- if (WT_IS_METADATA(dhandle))
- continue;
- if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- F_ISSET(dhandle, WT_DHANDLE_DEAD))
- continue;
- if (dhandle->session_inuse != 0 ||
+ if (WT_IS_METADATA(dhandle) ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
+ dhandle->session_inuse != 0 ||
+ dhandle->timeofdeath == 0 ||
now <= dhandle->timeofdeath + conn->sweep_idle_time)
continue;
WT_WITH_DHANDLE(session, dhandle,
- ret = __sweep_expire_handle(session));
+ ret = __sweep_expire_one(session));
WT_RET_BUSY_OK(ret);
}
@@ -156,11 +148,11 @@ __sweep_expire(WT_SESSION_IMPL *session)
}
/*
- * __sweep_flush --
- * Flush pages from dead trees.
+ * __sweep_discard_trees --
+ * Discard pages from dead trees.
*/
static int
-__sweep_flush(WT_SESSION_IMPL *session)
+__sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
@@ -168,8 +160,12 @@ __sweep_flush(WT_SESSION_IMPL *session)
conn = S2C(session);
- WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ *dead_handlesp = 0;
+
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ if (WT_DHANDLE_CAN_DISCARD(dhandle))
+ ++*dead_handlesp;
+
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
!F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
@@ -178,9 +174,12 @@ __sweep_flush(WT_SESSION_IMPL *session)
WT_WITH_DHANDLE(session, dhandle, ret =
__wt_conn_btree_sync_and_close(session, 0, 0));
- /* We closed the btree handle, bump the statistic. */
- if (ret == 0)
- WT_STAT_FAST_CONN_INCR(session, dh_conn_handles);
+ /* We closed the btree handle. */
+ if (ret == 0) {
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_close);
+ ++*dead_handlesp;
+ } else
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref);
WT_RET_BUSY_OK(ret);
}
@@ -189,8 +188,41 @@ __sweep_flush(WT_SESSION_IMPL *session)
}
/*
+ * __sweep_remove_one --
+ * Remove a closed handle from the connection list.
+ */
+static int
+__sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle)
+{
+ WT_DECL_RET;
+
+ /* Try to get exclusive access. */
+ WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+
+ /*
+ * If there are no longer any references to the handle in any
+ * sessions, attempt to discard it.
+ */
+ if (!WT_DHANDLE_CAN_DISCARD(dhandle))
+ WT_ERR(EBUSY);
+
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_conn_dhandle_discard_single(session, 0, 1));
+
+ /*
+ * If the handle was not successfully discarded, unlock it and
+ * don't retry the discard until it times out again.
+ */
+ if (ret != 0) {
+err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ }
+
+ return (ret);
+}
+
+/*
* __sweep_remove_handles --
- * Remove closed dhandles from the connection list.
+ * Remove closed handles from the connection list.
*/
static int
__sweep_remove_handles(WT_SESSION_IMPL *session)
@@ -200,41 +232,23 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
WT_DECL_RET;
conn = S2C(session);
- dhandle = SLIST_FIRST(&conn->dhlh);
- for (; dhandle != NULL; dhandle = dhandle_next) {
- dhandle_next = SLIST_NEXT(dhandle, l);
+ for (dhandle = TAILQ_FIRST(&conn->dhqh);
+ dhandle != NULL;
+ dhandle = dhandle_next) {
+ dhandle_next = TAILQ_NEXT(dhandle, q);
if (WT_IS_METADATA(dhandle))
continue;
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- dhandle->session_inuse != 0 ||
- dhandle->session_ref != 0)
- continue;
-
- /* Make sure we get exclusive access. */
- if ((ret =
- __wt_try_writelock(session, dhandle->rwlock)) == EBUSY)
- continue;
- WT_RET(ret);
-
- /*
- * If there are no longer any references to the handle in any
- * sessions, attempt to discard it.
- */
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- dhandle->session_inuse != 0 || dhandle->session_ref != 0) {
- WT_RET(__wt_writeunlock(session, dhandle->rwlock));
+ if (!WT_DHANDLE_CAN_DISCARD(dhandle))
continue;
- }
-
- WT_WITH_DHANDLE(session, dhandle,
- ret = __wt_conn_dhandle_discard_single(session, 0, 1));
- /* If the handle was not successfully discarded, unlock it. */
- if (ret != 0)
- WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ WT_WITH_HANDLE_LIST_LOCK(session,
+ ret = __sweep_remove_one(session, dhandle));
+ if (ret == 0)
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_remove);
+ else
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref);
WT_RET_BUSY_OK(ret);
- WT_STAT_FAST_CONN_INCR(session, dh_conn_ref);
}
return (ret == EBUSY ? 0 : ret);
@@ -250,7 +264,8 @@ __sweep_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- int dead_handles;
+ time_t now;
+ u_int dead_handles;
session = arg;
conn = S2C(session);
@@ -263,35 +278,37 @@ __sweep_server(void *arg)
/* Wait until the next event. */
WT_ERR(__wt_cond_wait(session, conn->sweep_cond,
(uint64_t)conn->sweep_interval * WT_MILLION));
+ WT_ERR(__wt_seconds(session, &now));
+
+ WT_STAT_FAST_CONN_INCR(session, dh_sweeps);
/*
- * Mark handles with a time of death, and report whether any
- * handles are marked dead.
+ * Sweep the lookaside table. If the lookaside table hasn't yet
+ * been written, there's no work to do.
*/
- WT_ERR(__sweep_mark(session, &dead_handles));
+ if (__wt_las_is_written(session))
+ WT_ERR(__wt_las_sweep(session));
/*
- * We only want to flush and expire if there are no dead handles
- * and if either the sweep_idle_time is not 0, or if we have
- * reached the configured limit of handles.
+ * Mark handles with a time of death, and report whether any
+ * handles are marked dead. If sweep_idle_time is 0, handles
+ * never become idle.
*/
- if (dead_handles == 0 &&
- (conn->open_file_count < conn->sweep_handles_min ||
- conn->sweep_idle_time != 0))
- continue;
+ if (conn->sweep_idle_time != 0)
+ WT_ERR(__sweep_mark(session, now));
- /* Close handles if we have reached the configured limit */
- if (conn->open_file_count >= conn->sweep_handles_min) {
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __sweep_expire(session));
- WT_ERR(ret);
- }
+ /*
+ * Close handles if we have reached the configured limit.
+ * If sweep_idle_time is 0, handles never become idle.
+ */
+ if (conn->sweep_idle_time != 0 &&
+ conn->open_btree_count >= conn->sweep_handles_min)
+ WT_ERR(__sweep_expire(session, now));
- WT_ERR(__sweep_flush(session));
+ WT_ERR(__sweep_discard_trees(session, &dead_handles));
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __sweep_remove_handles(session));
- WT_ERR(ret);
+ if (dead_handles > 0)
+ WT_ERR(__sweep_remove_handles(session));
}
if (0) {
@@ -349,8 +366,14 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
/*
* Handle sweep does enough I/O it may be called upon to perform slow
* operations for the block manager.
+ *
+ * The sweep thread sweeps the lookaside table for outdated records;
+ * it gets its own cursor for that purpose.
+ *
+ * Don't tap the sweep thread for eviction.
*/
- F_SET(session, WT_SESSION_CAN_WAIT);
+ F_SET(session, WT_SESSION_CAN_WAIT |
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
WT_RET(__wt_cond_alloc(
session, "handle sweep server", 0, &conn->sweep_cond));
@@ -389,5 +412,9 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
conn->sweep_session = NULL;
}
+
+ /* Discard any saved lookaside key. */
+ __wt_buf_free(session, &conn->las_sweep_key);
+
return (ret);
}
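[Editor's note: the sweep rework above splits idle-handle expiry into two steps. One sweep stamps an unused handle with a time of death; only a later sweep that finds the stamp older than the configured idle time actually expires the handle, and any renewed use clears the stamp. A minimal sketch of that two-phase check follows; struct handle is hypothetical, and the real code additionally tolerates internal in-use increments such as eviction.]

	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	/* Hypothetical handle: an in-use count and an idle timestamp. */
	struct handle {
		int in_use;
		time_t time_of_death;	/* 0 means "not idle yet" */
	};

	#define SWEEP_IDLE_TIME 30	/* seconds a handle may sit idle */

	/*
	 * Mark a handle idle the first time we see it unused; expire it
	 * only if a later sweep finds the mark old enough.
	 */
	static bool
	sweep_one(struct handle *h, time_t now)
	{
		if (h->in_use > 0) {
			h->time_of_death = 0;	/* busy again: clear the mark */
			return (false);
		}
		if (h->time_of_death == 0) {
			h->time_of_death = now;	/* first idle sighting */
			return (false);
		}
		return (now > h->time_of_death + SWEEP_IDLE_TIME);
	}

	int
	main(void)
	{
		struct handle h = { 0, 0 };
		time_t now = time(NULL);

		(void)sweep_one(&h, now);	/* stamps the time of death */
		printf("expire now? %d\n", sweep_one(&h, now));
		printf("expire later? %d\n", sweep_one(&h, now + 60));
		return (0);
	}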
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 60d94697189..3d9e5e405e8 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -514,17 +514,23 @@ static int
__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_CURSOR_BACKUP *cb;
+ const char *name;
WT_UNUSED(cfg);
cb = session->bkp_cursor;
+ name = session->dhandle->name;
/* Ignore files in the process of being bulk-loaded. */
if (F_ISSET(S2BT(session), WT_BTREE_BULK))
return (0);
+ /* Ignore the lookaside table. */
+ if (strcmp(name, WT_LAS_URI) == 0)
+ return (0);
+
/* Add the file to the list of files to be copied. */
- return (__backup_list_append(session, cb, session->dhandle->name));
+ return (__backup_list_append(session, cb, name));
}
/*
diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c
index c58d6899150..8ee57d24413 100644
--- a/src/cursor/cur_ds.c
+++ b/src/cursor/cur_ds.c
@@ -510,7 +510,7 @@ __wt_curds_open(
source = data_source->source;
source->session = (WT_SESSION *)session;
memset(&source->q, 0, sizeof(source->q));
- source->recno = 0;
+ source->recno = WT_RECNO_OOB;
memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
memset(&source->key, 0, sizeof(source->key));
memset(&source->value, 0, sizeof(source->value));
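[Editor's note: the cur_ds.c and cur_std.c changes in this patch replace a literal 0 with WT_RECNO_OOB: record numbers are 1-based, so zero doubles as the out-of-band "no record" sentinel, and attempts to set it as a key are rejected. A minimal sketch of the sentinel idea follows, with a hypothetical RECNO_OOB and set_recno.]

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Hypothetical out-of-band record number: record numbers are
	 * 1-based, so zero can serve as the "no such record" sentinel.
	 */
	#define RECNO_OOB	0

	static int
	set_recno(uint64_t *slot, uint64_t recno)
	{
		if (recno == RECNO_OOB) {
			fprintf(stderr,
			    "%d is an invalid record number\n", RECNO_OOB);
			return (-1);
		}
		*slot = recno;
		return (0);
	}

	int
	main(void)
	{
		uint64_t recno = RECNO_OOB;	/* initialized to "none" */

		(void)set_recno(&recno, 0);	/* rejected */
		(void)set_recno(&recno, 1);	/* first valid record */
		printf("recno %llu\n", (unsigned long long)recno);
		return (0);
	}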
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index d30a2a04c22..436227847af 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -369,15 +369,20 @@ __curfile_close(WT_CURSOR *cursor)
__wt_buf_free(session, &cbulk->last);
}
- WT_TRET(__wt_btcur_close(cbt));
- if (cbt->btree != NULL) {
+ WT_TRET(__wt_btcur_close(cbt, 0));
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+ /*
+ * Note: release the data handle last so that cursor statistics are
+ * updated correctly.
+ */
+ if (session->dhandle != NULL) {
/* Increment the data-source's in-use counter. */
__wt_cursor_dhandle_decr_use(session);
WT_TRET(__wt_session_release_btree(session));
}
- /* The URI is owned by the btree handle. */
- cursor->internal_uri = NULL;
- WT_TRET(__wt_cursor_close(cursor));
err: API_END_RET(session, ret);
}
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 7dad85e9d38..045663b3614 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -130,7 +130,8 @@ __curindex_move(WT_CURSOR_INDEX *cindex)
(*cp)->recno = first->recno;
}
F_SET(*cp, WT_CURSTD_KEY_EXT);
- WT_RET((*cp)->search(*cp));
+ if (cindex->cg_needvalue[i])
+ WT_RET((*cp)->search(*cp));
}
F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -320,6 +321,7 @@ __curindex_close(WT_CURSOR *cursor)
*cp = NULL;
}
+ __wt_free(session, cindex->cg_needvalue);
__wt_free(session, cindex->cg_cursors);
if (cindex->key_plan != idx->key_plan)
__wt_free(session, cindex->key_plan);
@@ -353,14 +355,19 @@ __curindex_open_colgroups(
/* Child cursors are opened with dump disabled. */
const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL };
char *proj;
+ size_t cgcnt;
table = cindex->table;
- WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp));
+ cgcnt = WT_COLGROUPS(table);
+ WT_RET(__wt_calloc_def(session, cgcnt, &cindex->cg_needvalue));
+ WT_RET(__wt_calloc_def(session, cgcnt, &cp));
cindex->cg_cursors = cp;
/* Work out which column groups we need. */
for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) {
arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_VALUE)
+ cindex->cg_needvalue[arg] = 1;
if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
cp[arg] != NULL)
continue;
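[Editor's note: the cg_needvalue array added above is filled while parsing the index cursor's value plan: only column groups whose values are actually projected get marked, and __curindex_move can then skip the search call for key-only groups. A minimal sketch of marking needed groups from a plan string follows, using a hypothetical plan syntax with 'k'/'v' markers rather than WiredTiger's WT_PROJ_* characters.]

	#include <stdio.h>
	#include <stdlib.h>

	/*
	 * Hypothetical projection plan: each element is a column-group
	 * number followed by 'k' (key only) or 'v' (value needed). Only
	 * groups whose values are projected need a full value search.
	 */
	int
	main(void)
	{
		const char *plan = "0k2v1v";
		int need_value[3] = { 0, 0, 0 };
		char *p;
		unsigned long cg;
		int i;

		for (p = (char *)plan; *p != '\0'; ++p) {
			cg = strtoul(p, &p, 10);
			if (*p == 'v')
				need_value[cg] = 1;
		}
		for (i = 0; i < 3; ++i)
			printf("cg %d: %s\n", i,
			    need_value[i] ? "search for value" : "key only");
		return (0);
	}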
diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c
index 3376f2a3166..ade9fd18962 100644
--- a/src/cursor/cur_log.c
+++ b/src/cursor/cur_log.c
@@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
acl = (WT_CURSOR_LOG *)a;
bcl = (WT_CURSOR_LOG *)b;
WT_ASSERT(session, cmpp != NULL);
- *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ *cmpp = __wt_log_cmp(acl->cur_lsn, bcl->cur_lsn);
/*
* If both are on the same LSN, compare step counter.
*/
@@ -392,6 +392,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+ /*
+ * The user may be trying to read a log record they just wrote.
+ * Log records may be buffered, so force any out now.
+ */
+ WT_ERR(__wt_log_force_write(session, 1));
+
/* Log cursors block archiving. */
WT_ERR(__wt_readlock(session, log->log_archive_lock));
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index 82568401319..2216a1d969d 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -113,12 +113,12 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
if (F_ISSET(cursor, WT_CURSTD_RAW)) {
WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
- cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->stats_desc(WT_STAT_KEY_OFFSET(cst)),
cst->pv.data, cst->v));
WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
cursor->value_format,
- cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->stats_desc(WT_STAT_KEY_OFFSET(cst)),
cst->pv.data, cst->v));
item = va_arg(ap, WT_ITEM *);
@@ -130,7 +130,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
* pointer support isn't documented, but it's a cheap test.
*/
if ((p = va_arg(ap, const char **)) != NULL)
- *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc;
+ *p = cst->stats_desc(WT_STAT_KEY_OFFSET(cst));
if ((p = va_arg(ap, const char **)) != NULL)
*p = cst->pv.data;
if ((v = va_arg(ap, uint64_t *)) != NULL)
@@ -215,7 +215,7 @@ __curstat_next(WT_CURSOR *cursor)
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_ERR(WT_NOTFOUND);
}
- cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -254,7 +254,7 @@ __curstat_prev(WT_CURSOR *cursor)
WT_ERR(WT_NOTFOUND);
}
- cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -308,7 +308,7 @@ __curstat_search(WT_CURSOR *cursor)
if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst))
WT_ERR(WT_NOTFOUND);
- cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -354,13 +354,14 @@ __curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
* Optionally clear the connection statistics.
*/
__wt_conn_stat_init(session);
- cst->u.conn_stats = conn->stats;
+ __wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
- __wt_stat_refresh_connection_stats(&conn->stats);
+ __wt_stat_connection_clear_all(conn->stats);
- cst->stats_first = cst->stats = (WT_STATS *)&cst->u.conn_stats;
+ cst->stats = (int64_t *)&cst->u.conn_stats;
cst->stats_base = WT_CONNECTION_STATS_BASE;
- cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS);
+ cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(int64_t);
+ cst->stats_desc = __wt_stat_connection_desc;
}
/*
@@ -383,7 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session,
filename = uri;
if (!WT_PREFIX_SKIP(filename, "file:"))
return (EINVAL);
- __wt_stat_init_dsrc_stats(&cst->u.dsrc_stats);
+ __wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
WT_RET(__wt_block_manager_size(
session, filename, &cst->u.dsrc_stats));
__wt_curstat_dsrc_final(cst);
@@ -398,9 +399,10 @@ __curstat_file_init(WT_SESSION_IMPL *session,
* Optionally clear the data source statistics.
*/
if ((ret = __wt_btree_stat_init(session, cst)) == 0) {
- cst->u.dsrc_stats = dhandle->stats;
+ __wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
+ __wt_stat_dsrc_aggregate(dhandle->stats, &cst->u.dsrc_stats);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
- __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ __wt_stat_dsrc_clear_all(dhandle->stats);
__wt_curstat_dsrc_final(cst);
}
@@ -417,10 +419,10 @@ __curstat_file_init(WT_SESSION_IMPL *session,
void
__wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
{
-
- cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats;
+ cst->stats = (int64_t *)&cst->u.dsrc_stats;
cst->stats_base = WT_DSRC_STATS_BASE;
- cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+ cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(int64_t);
+ cst->stats_desc = __wt_stat_dsrc_desc;
}
/*
@@ -495,7 +497,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
conn = S2C(session);
- WT_ERR(__wt_calloc_one(session, &cst));
+ WT_RET(__wt_calloc_one(session, &cst));
cursor = &cst->iface;
*cursor = iface;
cursor->session = &session->iface;
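[Editor's note: the one-character change from WT_ERR to WT_RET on the cst allocation above matters: WT_ERR jumps to the function's err label, which cleans up state that does not exist yet when the very first allocation fails, while WT_RET simply returns. A minimal sketch of the two macro styles and where each is safe follows, with hypothetical RET/ERR macros modeled on that convention.]

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Hypothetical macros modeled on the WT_RET/WT_ERR convention. */
	#define RET(a) do {						\
		int __r = (a);						\
		if (__r != 0)						\
			return (__r);					\
	} while (0)
	#define ERR(a) do {						\
		if ((ret = (a)) != 0)					\
			goto err;					\
	} while (0)

	static int
	check_args(const char *name)
	{
		return (name == NULL ? EINVAL : 0);
	}

	static int
	fill(char *buf, const char *name)
	{
		snprintf(buf, 16, "%s", name);
		return (0);
	}

	static int
	open_thing(const char *name, char **resultp)
	{
		int ret = 0;
		char *buf;

		/* Nothing allocated yet: a failure can simply return. */
		RET(check_args(name));

		if ((buf = calloc(1, 16)) == NULL)
			return (ENOMEM);

		/* From here on, a failure must free buf: jump to err. */
		ERR(fill(buf, name));

		*resultp = buf;
		return (0);

	err:	free(buf);
		return (ret);
	}

	int
	main(void)
	{
		char *s;

		if (open_thing("hello", &s) == 0) {
			printf("%s\n", s);
			free(s);
		}
		return (0);
	}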
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index 858c6af6853..701bd845ae9 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -258,9 +258,9 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
item->data, item->size, "q", &cursor->recno));
} else
cursor->recno = va_arg(ap, uint64_t);
- if (cursor->recno == 0)
+ if (cursor->recno == WT_RECNO_OOB)
WT_ERR_MSG(session, EINVAL,
- "Record numbers must be greater than zero");
+ "%d is an invalid record number", WT_RECNO_OOB);
buf->data = &cursor->recno;
sz = sizeof(cursor->recno);
} else {
@@ -463,16 +463,17 @@ __wt_cursor_close(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)cursor->session;
- __wt_buf_free(session, &cursor->key);
- __wt_buf_free(session, &cursor->value);
if (F_ISSET(cursor, WT_CURSTD_OPEN)) {
TAILQ_REMOVE(&session->cursors, cursor, q);
+ (void)__wt_atomic_sub32(&S2C(session)->open_cursor_count, 1);
WT_STAT_FAST_DATA_DECR(session, session_cursor_open);
- WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open);
}
+ __wt_buf_free(session, &cursor->key);
+ __wt_buf_free(session, &cursor->value);
+
__wt_free(session, cursor->internal_uri);
__wt_free(session, cursor->uri);
__wt_overwrite_and_free(session, cursor);
@@ -683,8 +684,8 @@ __wt_cursor_init(WT_CURSOR *cursor,
TAILQ_INSERT_HEAD(&session->cursors, cursor, q);
F_SET(cursor, WT_CURSTD_OPEN);
+ (void)__wt_atomic_add32(&S2C(session)->open_cursor_count, 1);
WT_STAT_FAST_DATA_INCR(session, session_cursor_open);
- WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open);
*cursorp = (cdump != NULL) ? cdump : cursor;
return (0);
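[Editor's note: the cursor open/close paths above move the connection-wide open-cursor count from a statistics macro to explicit __wt_atomic_add32/__wt_atomic_sub32 calls, so any session can maintain the shared counter without a lock. A minimal C11 sketch of the same idea follows, with a hypothetical counter and worker.]

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	/*
	 * Hypothetical connection-wide cursor count kept with atomics so
	 * any session can update it without taking a lock.
	 */
	static atomic_uint open_cursor_count;

	static void *
	worker(void *arg)
	{
		int i;

		(void)arg;
		for (i = 0; i < 100000; ++i) {
			atomic_fetch_add(&open_cursor_count, 1);	/* open */
			atomic_fetch_sub(&open_cursor_count, 1);	/* close */
		}
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t[4];
		int i;

		for (i = 0; i < 4; ++i)
			pthread_create(&t[i], NULL, worker, NULL);
		for (i = 0; i < 4; ++i)
			pthread_join(t[i], NULL);

		/* All opens were matched by closes: prints 0. */
		printf("open cursors: %u\n", atomic_load(&open_cursor_count));
		return (0);
	}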
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index 70a28407ea5..5d0b89d6547 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -2,15 +2,11 @@
The \c next_random configuration to the WT_SESSION::open_cursor method
configures the cursor to return a pseudo-random record from a row-store
-object.
-
-The ability to return a random record was added to support a particular
-application, and as a result has somewhat unusual semantics. First, the
-returned record may not be random at all in the case of objects with only a few
-rows (especially when the object has never been written to the backing store).
-In such objects, the WT_CURSOR::next method for cursors configured with \c
-next_random may return the same row on each call. Additionally, even in larger
-objects, the WT_CURSOR::next method usually returns the first record from a
-random page in the underlying file, not a random record from a random page.
+object (the configuration is not supported on other types of objects).
+The configuration has somewhat unusual semantics: first, the returned
+record may not be very random in the case of objects with only a few
+rows. Additionally, even in larger objects, the WT_CURSOR::next method
+generally returns the first record from a random page in the underlying
+file, not a random record from a random page.
*/
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index d9ac58103c5..e0640660b0a 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -3,6 +3,15 @@
@section version_262 Upgrading to Version 2.6.2
<dl>
+<dt>Change to config_base=false</dt>
+<dd>
+If \c config_base=false is set in the config passed directly to
+::wiredtiger_open, any existing base configuration file will now be ignored.
+If an application was relying on the base configuration file being read
+despite that setting, the connection will be opened with different settings
+after upgrading, which could lead to errors or unexpected behavior.
+</dd>
+
<dt>WT_SESSION.verify</dt>
<dd>
The WT_SESSION.verify method in this release has a new configuration
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 38cfc07ac5b..66fabe48fb2 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -79,26 +79,19 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
WT_ERR(__wt_evict(session, ref, 1));
break;
case WT_SYNC_DISCARD:
- WT_ASSERT(session,
- __wt_page_can_evict(session, page, 0, NULL));
- __wt_evict_page_clean_update(session, ref, 1);
- break;
- case WT_SYNC_DISCARD_FORCE:
/*
- * Forced discard of the page, whether clean or dirty.
- * If we see a dirty page in a forced discard, clean
- * the page, both to keep statistics correct, and to
- * let the page-discard function assert no dirty page
- * is ever discarded.
+ * Dead handles may reference dirty pages; clean the
+ * page, both to keep statistics correct, and to let
+ * the page-discard function assert no dirty page is
+ * ever discarded.
*/
- if (__wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
+ __wt_page_modify_clear(session, page);
- F_SET(session, WT_SESSION_DISCARD_FORCE);
+ WT_ASSERT(session,
+ F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
+ __wt_page_can_evict(session, page, 0, NULL));
__wt_evict_page_clean_update(session, ref, 1);
- F_CLR(session, WT_SESSION_DISCARD_FORCE);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 6aa61b4137b..ce61aa2c798 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -10,14 +10,13 @@
static int __evict_clear_all_walks(WT_SESSION_IMPL *);
static int __evict_clear_walks(WT_SESSION_IMPL *);
-static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *);
static int WT_CDECL __evict_lru_cmp(const void *, const void *);
static int __evict_lru_pages(WT_SESSION_IMPL *, int);
-static int __evict_lru_walk(WT_SESSION_IMPL *, uint32_t);
+static int __evict_lru_walk(WT_SESSION_IMPL *);
static int __evict_page(WT_SESSION_IMPL *, int);
static int __evict_pass(WT_SESSION_IMPL *);
-static int __evict_walk(WT_SESSION_IMPL *, uint32_t);
-static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t);
+static int __evict_walk(WT_SESSION_IMPL *);
+static int __evict_walk_file(WT_SESSION_IMPL *, u_int *);
static WT_THREAD_RET __evict_worker(void *);
static int __evict_server_work(WT_SESSION_IMPL *);
@@ -107,7 +106,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_spin_lock(session, &cache->evict_lock);
elem = cache->evict_max;
- for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
if (evict->ref == ref) {
__evict_list_clear(session, evict);
break;
@@ -159,6 +158,7 @@ __evict_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ u_int spins;
session = arg;
conn = S2C(session);
@@ -176,7 +176,27 @@ __evict_server(void *arg)
* otherwise we can block applications evicting large pages.
*/
if (!F_ISSET(cache, WT_CACHE_STUCK)) {
- WT_ERR(__evict_clear_walks(session));
+ for (spins = 0; (ret = __wt_spin_trylock(
+ session, &conn->dhandle_lock)) == EBUSY &&
+ !F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
+ spins++) {
+ if (spins < 1000)
+ __wt_yield();
+ else
+ __wt_sleep(0, 1000);
+ }
+ /*
+ * If we gave up acquiring the lock, that indicates a
+ * session is waiting for us to clear walks. Do that
+ * as part of a normal pass (without the handle list
+ * lock) to avoid deadlock.
+ */
+ if (ret == EBUSY)
+ continue;
+ WT_ERR(ret);
+ ret = __evict_clear_all_walks(session);
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+ WT_ERR(ret);
/* Next time we wake up, reverse the sweep direction. */
cache->flags ^= WT_CACHE_WALK_REVERSE;
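[Editor's note: the eviction server above no longer blocks unconditionally on the handle-list lock. It spins on a trylock, yielding for the first thousand attempts and then sleeping a millisecond at a time, and gives up entirely if a session is waiting for walks to be cleared. A minimal sketch of that bounded spin-then-sleep backoff follows, with a hypothetical lock_with_backoff over a plain pthread mutex.]

	#include <errno.h>
	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	/*
	 * Hypothetical backoff loop around a try-lock: spin (yielding) for
	 * a while, then fall back to short sleeps so a long-held lock
	 * doesn't burn a CPU. The caller can also ask us to give up.
	 */
	static int
	lock_with_backoff(pthread_mutex_t *lock, int (*should_give_up)(void))
	{
		int ret;
		unsigned spins;

		for (spins = 0;
		    (ret = pthread_mutex_trylock(lock)) == EBUSY; ++spins) {
			if (should_give_up())
				return (EBUSY);
			if (spins < 1000)
				sched_yield();
			else
				usleep(1000);	/* 1ms */
		}
		return (ret);
	}

	static int
	never(void)
	{
		return (0);
	}

	int
	main(void)
	{
		pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

		if (lock_with_backoff(&lock, never) == 0) {
			printf("locked\n");
			pthread_mutex_unlock(&lock);
		}
		return (0);
	}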
@@ -227,9 +247,16 @@ __evict_workers_resize(WT_SESSION_IMPL *session)
for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) {
WT_ERR(__wt_open_internal_session(conn,
- "eviction-worker", 0, 0, &workers[i].session));
+ "eviction-worker", 1, 0, &workers[i].session));
workers[i].id = i;
- F_SET(workers[i].session, WT_SESSION_CAN_WAIT);
+
+ /*
+ * Eviction worker threads get their own lookaside table cursor.
+ * Eviction worker threads may be called upon to perform slow
+ * operations for the block manager.
+ */
+ F_SET(workers[i].session,
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT);
if (i < conn->evict_workers_min) {
++conn->evict_workers;
@@ -259,7 +286,7 @@ __wt_evict_create(WT_SESSION_IMPL *session)
/* We need a session handle because we're reading/writing pages. */
WT_RET(__wt_open_internal_session(
- conn, "eviction-server", 0, 0, &conn->evict_session));
+ conn, "eviction-server", 1, 0, &conn->evict_session));
session = conn->evict_session;
/*
@@ -276,6 +303,9 @@ __wt_evict_create(WT_SESSION_IMPL *session)
else
F_SET(session, WT_SESSION_CAN_WAIT);
+ /* The eviction server gets its own lookaside table cursor. */
+ F_SET(session, WT_SESSION_LOOKASIDE_CURSOR);
+
/*
* Start the primary eviction server thread after the worker threads
* have started to avoid it starting additional worker threads before
@@ -385,47 +415,62 @@ err: WT_PANIC_MSG(session, ret, "cache eviction worker error");
}
/*
- * __evict_has_work --
- * Find out if there is eviction work to be done.
+ * __evict_update_work --
+ * Configure eviction work state.
*/
-static int
-__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
+static bool
+__evict_update_work(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- uint32_t flags;
- int evict, dirty;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
conn = S2C(session);
cache = conn->cache;
- *flagsp = flags = 0;
+
+ /* Clear previous state. */
+ cache->state = 0;
if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
- return (0);
+ return (false);
- /* Check to see if the eviction server should run. */
- __wt_cache_status(session, &evict, &dirty);
- if (evict)
- /* The cache is too small. */
- LF_SET(WT_EVICT_PASS_ALL);
- else if (dirty)
- /* Too many dirty pages, ignore clean pages. */
- LF_SET(WT_EVICT_PASS_DIRTY);
- else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
- /*
- * Evict pages with oldest generation (which would otherwise
- * block application threads) set regardless of whether we have
- * reached the eviction trigger.
- */
- LF_SET(WT_EVICT_PASS_WOULD_BLOCK);
- F_CLR(cache, WT_CACHE_WOULD_BLOCK);
+ /*
+ * Page eviction overrides the dirty target and other types of eviction;
+ * that is, we don't care where we are with respect to the dirty target
+ * if page eviction is configured.
+ *
+ * Avoid division by zero if the cache size has not yet been set in a
+ * shared cache.
+ */
+ bytes_max = conn->cache_size + 1;
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) {
+ FLD_SET(cache->state, WT_EVICT_PASS_ALL);
+ goto done;
}
- if (F_ISSET(cache, WT_CACHE_STUCK))
- LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+ dirty_inuse = __wt_cache_dirty_inuse(cache);
+ if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) {
+ FLD_SET(cache->state, WT_EVICT_PASS_DIRTY);
+ goto done;
+ }
- *flagsp = flags;
- return (0);
+ /*
+ * Evict pages with oldest generation (which would otherwise block
+ * application threads), set regardless of whether we have reached
+ * the eviction trigger.
+ */
+ if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
+ FLD_SET(cache->state, WT_EVICT_PASS_WOULD_BLOCK);
+
+ F_CLR(cache, WT_CACHE_WOULD_BLOCK);
+ goto done;
+ }
+ return (false);
+
+done: if (F_ISSET(cache, WT_CACHE_STUCK))
+ FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE);
+ return (true);
}
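[Editor's note: __evict_update_work above replaces the per-call flags word with cache->state, computed from cache usage: if total bytes in use exceed the eviction target percentage, evict everything; otherwise, if dirty bytes exceed the dirty target, evict only dirty pages. The maximum is padded by one byte to avoid dividing by zero before a shared cache size is set. A minimal sketch of that decision follows, with hypothetical flag values and percentages.]

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical eviction pass flags. */
	#define EVICT_PASS_ALL		0x1
	#define EVICT_PASS_DIRTY	0x2

	/*
	 * Decide what kind of eviction pass is needed from cache usage.
	 * Targets are percentages of the maximum cache size; pad the
	 * maximum by one byte to avoid dividing by zero before it is set.
	 */
	static bool
	update_work(uint64_t bytes_max, uint64_t bytes_inuse,
	    uint64_t dirty_inuse, unsigned target_pct, unsigned dirty_pct,
	    uint32_t *statep)
	{
		*statep = 0;
		bytes_max += 1;
		if (bytes_inuse > target_pct * bytes_max / 100) {
			*statep = EVICT_PASS_ALL;
			return (true);
		}
		if (dirty_inuse > dirty_pct * bytes_max / 100) {
			*statep = EVICT_PASS_DIRTY;
			return (true);
		}
		return (false);
	}

	int
	main(void)
	{
		uint32_t state;

		/* 100 bytes of cache, 85 in use, 80% target: evict all. */
		if (update_work(100, 85, 10, 80, 5, &state))
			printf("state 0x%x\n", (unsigned)state);
		return (0);
	}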
/*
@@ -439,7 +484,6 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_EVICT_WORKER *worker;
uint64_t pages_evicted;
- uint32_t flags;
int loop;
conn = S2C(session);
@@ -462,25 +506,36 @@ __evict_pass(WT_SESSION_IMPL *session)
}
/*
- * Increment the shared read generation. We do this
- * occasionally even if eviction is not currently required, so
- * that pages have some relative read generation when the
- * eviction server does need to do some work.
+ * Increment the shared read generation. Do this occasionally
+ * even if eviction is not currently required, so that pages
+ * have some relative read generation when the eviction server
+ * does need to do some work.
*/
__wt_cache_read_gen_incr(session);
- WT_RET(__evict_has_work(session, &flags));
- if (flags == 0)
+ /*
+ * Update the oldest ID: we use it to decide whether pages are
+ * candidates for eviction. Without this, if all threads are
+ * blocked after a long-running transaction (such as a
+ * checkpoint) completes, we may never start evicting again.
+ *
+ * Do this every time the eviction server wakes up, regardless
+ * of whether the cache is full, to prevent the oldest ID
+ * falling too far behind.
+ */
+ __wt_txn_update_oldest(session, 1);
+
+ if (!__evict_update_work(session))
break;
if (loop > 10)
- LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+ FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE);
/*
* Start a worker if we have capacity and we haven't reached
* the eviction targets.
*/
- if (LF_ISSET(WT_EVICT_PASS_ALL |
+ if (FLD_ISSET(cache->state, WT_EVICT_PASS_ALL |
WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
@@ -499,7 +554,7 @@ __evict_pass(WT_SESSION_IMPL *session)
" In use: %" PRIu64 " Dirty: %" PRIu64,
conn->cache_size, cache->bytes_inmem, cache->bytes_dirty));
- WT_RET(__evict_lru_walk(session, flags));
+ WT_RET(__evict_lru_walk(session));
WT_RET(__evict_server_work(session));
/*
@@ -520,7 +575,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* Mark the cache as stuck if we need space
* and aren't evicting any pages.
*/
- if (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK)) {
+ if (!FLD_ISSET(cache->state,
+ WT_EVICT_PASS_WOULD_BLOCK)) {
F_SET(cache, WT_CACHE_STUCK);
WT_STAT_FAST_CONN_INCR(
session, cache_eviction_slow);
@@ -546,9 +602,14 @@ static int
__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
+ WT_CACHE *cache;
WT_REF *ref;
btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ if (session->dhandle == cache->evict_file_next)
+ cache->evict_file_next = NULL;
if ((ref = btree->evict_ref) == NULL)
return (0);
@@ -568,21 +629,17 @@ __evict_clear_walk(WT_SESSION_IMPL *session)
static int
__evict_clear_walks(WT_SESSION_IMPL *session)
{
- WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *s;
u_int i, session_cnt;
conn = S2C(session);
- cache = conn->cache;
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
if (!s->active || !F_ISSET(s, WT_SESSION_CLEAR_EVICT_WALK))
continue;
- if (s->dhandle == cache->evict_file_next)
- cache->evict_file_next = NULL;
WT_WITH_DHANDLE(
session, s->dhandle, WT_TRET(__evict_clear_walk(session)));
}
@@ -606,7 +663,8 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session)
F_SET(session, WT_SESSION_CLEAR_EVICT_WALK);
- while (btree->evict_ref != NULL && ret == 0) {
+ while (ret == 0 && (btree->evict_ref != NULL ||
+ cache->evict_file_next == session->dhandle)) {
F_SET(cache, WT_CACHE_CLEAR_WALKS);
ret = __wt_cond_wait(
session, cache->evict_waiter_cond, 100000);
@@ -630,7 +688,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
conn = S2C(session);
- SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session,
dhandle, WT_TRET(__evict_clear_walk(session)));
@@ -638,44 +696,6 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
}
/*
- * __wt_evict_page --
- * Evict a given page.
- */
-int
-__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_TXN *txn;
- WT_TXN_ISOLATION saved_iso;
-
- /*
- * We have to take care when evicting pages not to write a change that:
- * (a) is not yet committed; or
- * (b) is committed more recently than an in-progress checkpoint.
- *
- * We handle both of these cases by setting up the transaction context
- * before evicting, using a special "eviction" isolation level, where
- * only globally visible updates can be evicted.
- */
- __wt_txn_update_oldest(session, 1);
- txn = &session->txn;
- saved_iso = txn->isolation;
- txn->isolation = WT_ISO_EVICTION;
-
- /*
- * Sanity check: if a transaction has updates, its updates should not
- * be visible to eviction.
- */
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_ID) ||
- !__wt_txn_visible(session, txn->id));
-
- ret = __wt_evict(session, ref, 0);
- txn->isolation = saved_iso;
-
- return (ret);
-}
-
-/*
* __wt_evict_file_exclusive_on --
* Get exclusive eviction access to a file and discard any of the file's
* blocks queued for eviction.
@@ -719,7 +739,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp)
* clear it.
*/
elem = cache->evict_max;
- for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
if (evict->btree == btree)
__evict_list_clear(session, evict);
__wt_spin_unlock(session, &cache->evict_lock);
@@ -773,7 +793,7 @@ __evict_lru_pages(WT_SESSION_IMPL *session, int is_server)
* Add pages to the LRU queue to be evicted from cache.
*/
static int
-__evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
+__evict_lru_walk(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_DECL_RET;
@@ -784,17 +804,17 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
cache = S2C(session)->cache;
/* Get some more pages to consider for eviction. */
- if ((ret = __evict_walk(session, flags)) != 0)
+ if ((ret = __evict_walk(session)) != 0)
return (ret == EBUSY ? 0 : ret);
/* Sort the list into LRU order and restart. */
__wt_spin_lock(session, &cache->evict_lock);
entries = cache->evict_entries;
- qsort(cache->evict,
+ qsort(cache->evict_queue,
entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
- while (entries > 0 && cache->evict[entries - 1].ref == NULL)
+ while (entries > 0 && cache->evict_queue[entries - 1].ref == NULL)
--entries;
cache->evict_entries = entries;
@@ -811,12 +831,13 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
return (0);
}
- WT_ASSERT(session, cache->evict[0].ref != NULL);
+ WT_ASSERT(session, cache->evict_queue[0].ref != NULL);
/* Track the oldest read generation we have in the queue. */
- cache->read_gen_oldest = cache->evict[0].ref->page->read_gen;
+ cache->read_gen_oldest = cache->evict_queue[0].ref->page->read_gen;
- if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
+ if (FLD_ISSET(cache->state,
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
/*
* Take all candidates if we only gathered pages with an oldest
* read generation set.
@@ -824,8 +845,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
cache->evict_candidates = entries;
else {
/* Find the bottom 25% of read generations. */
- cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
- __evict_read_gen(&cache->evict[entries - 1])) / 4;
+ cutoff = (3 * __evict_read_gen(&cache->evict_queue[0]) +
+ __evict_read_gen(&cache->evict_queue[entries - 1])) / 4;
/*
* Don't take less than 10% or more than 50% of entries,
* regardless. That said, if there is only one entry, which is
@@ -835,21 +856,21 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
candidates < entries / 2;
candidates++)
if (__evict_read_gen(
- &cache->evict[candidates]) > cutoff)
+ &cache->evict_queue[candidates]) > cutoff)
break;
cache->evict_candidates = candidates;
}
/* If we have more than the minimum number of entries, clear them. */
if (cache->evict_entries > WT_EVICT_WALK_BASE) {
- for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i;
+ for (i = WT_EVICT_WALK_BASE, evict = cache->evict_queue + i;
i < cache->evict_entries;
i++, evict++)
__evict_list_clear(session, evict);
cache->evict_entries = WT_EVICT_WALK_BASE;
}
- cache->evict_current = cache->evict;
+ cache->evict_current = cache->evict_queue;
__wt_spin_unlock(session, &cache->evict_lock);
/*
@@ -894,7 +915,7 @@ __evict_server_work(WT_SESSION_IMPL *session)
* Fill in the array by walking the next set of pages.
*/
static int
-__evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
+__evict_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -910,14 +931,6 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
incr = dhandle_locked = 0;
retries = 0;
- /*
- * Update the oldest ID: we use it to decide whether pages are
- * candidates for eviction. Without this, if all threads are blocked
- * after a long-running transaction (such as a checkpoint) completes,
- * we may never start evicting again.
- */
- __wt_txn_update_oldest(session, 1);
-
if (cache->evict_current == NULL)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
else
@@ -957,15 +970,24 @@ retry: while (slot < max_entries && ret == 0) {
dhandle_locked = 1;
}
- if (dhandle == NULL)
- dhandle = SLIST_FIRST(&conn->dhlh);
- else {
+ if (dhandle == NULL) {
+ /*
+ * On entry, continue from wherever we got to in the
+ * scan last time through. If we don't have a saved
+ * handle, start from the beginning of the list.
+ */
+ if ((dhandle = cache->evict_file_next) != NULL)
+ cache->evict_file_next = NULL;
+ else
+ dhandle = TAILQ_FIRST(&conn->dhqh);
+ } else {
if (incr) {
WT_ASSERT(session, dhandle->session_inuse > 0);
- (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1);
+ (void)__wt_atomic_subi32(
+ &dhandle->session_inuse, 1);
incr = 0;
}
- dhandle = SLIST_NEXT(dhandle, l);
+ dhandle = TAILQ_NEXT(dhandle, q);
}
/* If we reach the end of the list, we're done. */
@@ -977,15 +999,6 @@ retry: while (slot < max_entries && ret == 0) {
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- /*
- * Each time we reenter this function, start at the next handle
- * on the list.
- */
- if (cache->evict_file_next != NULL &&
- cache->evict_file_next != dhandle)
- continue;
- cache->evict_file_next = NULL;
-
/* Skip files that don't allow eviction. */
btree = dhandle->handle;
if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
@@ -996,7 +1009,7 @@ retry: while (slot < max_entries && ret == 0) {
* stick in cache until we get aggressive.
*/
if ((btree->checkpointing || btree->evict_priority != 0) &&
- !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE))
continue;
/* Skip files if we have used all available hazard pointers. */
@@ -1015,7 +1028,7 @@ retry: while (slot < max_entries && ret == 0) {
btree->evict_walk_skips = 0;
prev_slot = slot;
- (void)WT_ATOMIC_ADD4(dhandle->session_inuse, 1);
+ (void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
incr = 1;
__wt_spin_unlock(session, &conn->dhandle_lock);
dhandle_locked = 0;
@@ -1028,7 +1041,7 @@ retry: while (slot < max_entries && ret == 0) {
*/
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
WT_WITH_DHANDLE(session, dhandle,
- ret = __evict_walk_file(session, &slot, flags));
+ ret = __evict_walk_file(session, &slot));
WT_ASSERT(session, session->split_gen == 0);
}
@@ -1046,8 +1059,11 @@ retry: while (slot < max_entries && ret == 0) {
}
if (incr) {
+ /* Remember the file we should visit first, next loop. */
+ cache->evict_file_next = dhandle;
+
WT_ASSERT(session, dhandle->session_inuse > 0);
- (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1);
+ (void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
incr = 0;
}
@@ -1059,21 +1075,18 @@ retry: while (slot < max_entries && ret == 0) {
/*
* Walk the list of files a few times if we don't find enough pages.
* Try two passes through all the files, give up when we have some
- * candidates and we aren't finding more. Take care not to skip files
- * on subsequent passes.
+ * candidates and we aren't finding more.
*/
if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 &&
slot < max_entries && (retries < 2 ||
- (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 &&
+ (retries < 10 &&
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
(slot == cache->evict_entries || slot > start_slot)))) {
- cache->evict_file_next = NULL;
start_slot = slot;
++retries;
goto retry;
}
- /* Remember the file we should visit first, next loop. */
- cache->evict_file_next = dhandle;
cache->evict_entries = slot;
return (ret);
}
@@ -1092,7 +1105,7 @@ __evict_init_candidate(
cache = S2C(session)->cache;
/* Keep track of the maximum slot we are using. */
- slot = (u_int)(evict - cache->evict);
+ slot = (u_int)(evict - cache->evict_queue);
if (slot >= cache->evict_max)
cache->evict_max = slot + 1;
@@ -1110,10 +1123,11 @@ __evict_init_candidate(
* Get a few page eviction candidates from a single underlying file.
*/
static int
-__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
+__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
{
WT_BTREE *btree;
WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_EVICT_ENTRY *end, *evict, *start;
WT_PAGE *page;
@@ -1123,11 +1137,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
uint32_t walk_flags;
int enough, internal_pages, modified, restarts;
+ conn = S2C(session);
btree = S2BT(session);
- cache = S2C(session)->cache;
- start = cache->evict + *slotp;
+ cache = conn->cache;
+ start = cache->evict_queue + *slotp;
end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
- cache->evict + cache->evict_slots);
+ cache->evict_queue + cache->evict_slots);
enough = internal_pages = restarts = 0;
walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
@@ -1178,21 +1193,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
goto fast;
/* Optionally ignore clean pages. */
- if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY))
continue;
/*
* If we are only trickling out pages marked for definite
* eviction, skip anything that isn't marked.
*/
- if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) &&
+ if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
page->read_gen != WT_READGEN_OLDEST)
continue;
/* Limit internal pages to 50% unless we get aggressive. */
if (WT_PAGE_IS_INTERNAL(page) &&
++internal_pages > WT_EVICT_WALK_PER_FILE / 2 &&
- !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE))
continue;
/*
@@ -1207,36 +1222,44 @@ fast: /* If the page can't be evicted, give up. */
continue;
/*
- * If the page is clean but has modifications that appear too
- * new to evict, skip it.
+ * Additional tests of whether eviction is likely to succeed.
*
- * Note: take care with ordering: if we detected that the page
- * is modified above, we expect mod != NULL.
+ * If eviction is stuck or we are helping with forced eviction,
+ * try anyway: maybe a transaction that was running last time
+ * we wrote the page has since rolled back, or we can help the
+ * checkpoint complete sooner. Additionally, being stuck will
+ * configure lookaside table writes in reconciliation, allowing
+ * us to evict pages we can't usually evict.
*/
- mod = page->modify;
- if (!modified && mod != NULL && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- !__wt_txn_visible_all(session, mod->rec_max_txn))
- continue;
+ if (!FLD_ISSET(cache->state,
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) {
+ /*
+ * Note: take care with ordering: if we detected that
+ * the page is modified above, we expect mod != NULL.
+ */
+ mod = page->modify;
- /*
- * If the oldest transaction hasn't changed since the last time
- * this page was written, it's unlikely that we can make
- * progress. Similarly, if the most recent update on the page
- * is not yet globally visible, eviction will fail. These
- * heuristics attempt to avoid repeated attempts to evict the
- * same page.
- *
- * That said, if eviction is stuck, or we are helping with
- * forced eviction, try anyway: maybe a transaction that was
- * running last time we wrote the page has since rolled back,
- * or we can help get the checkpoint completed sooner.
- */
- if (modified && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
- !__wt_txn_visible_all(session, mod->update_txn)))
- continue;
+ /*
+ * If the page is clean but has modifications that
+ * appear too new to evict, skip it.
+ */
+ if (!modified && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely we
+ * can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ */
+ if (modified &&
+ (mod->disk_snap_min == conn->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ continue;
+ }
WT_ASSERT(session, evict->ref == NULL);
__evict_init_candidate(session, evict, ref);
@@ -1245,28 +1268,28 @@ fast: /* If the page can't be evicted, give up. */
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" PRIu64, page, page->memory_footprint));
}
+ WT_RET_NOTFOUND_OK(ret);
+
+ *slotp += (u_int)(evict - start);
/*
* If we happen to end up on the root page, clear it. We have to track
* hazard pointers, and the root page complicates that calculation.
*
- * Also clear the walk if we land on a page requiring forced eviction.
- * The eviction server may go to sleep, and we want this page evicted
- * as quickly as possible.
+ * If we land on a page requiring forced eviction, move on to the next
+ * page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL && (__wt_ref_is_root(ref) ||
- ref->page->read_gen == WT_READGEN_OLDEST)) {
- btree->evict_ref = NULL;
- __wt_page_release(session, ref, WT_READ_NO_EVICT);
+ if ((ref = btree->evict_ref) != NULL) {
+ if (__wt_ref_is_root(ref))
+ WT_RET(__evict_clear_walk(session));
+ else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ WT_RET_NOTFOUND_OK(__wt_tree_walk(session,
+ &btree->evict_ref, &pages_walked, walk_flags));
}
- /* If the walk was interrupted by a locked page, that's okay. */
- if (ret == WT_NOTFOUND)
- ret = 0;
-
- *slotp += (u_int)(evict - start);
WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
- return (ret);
+
+ return (0);
}
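
The two skip heuristics above (unchanged oldest transaction ID, not-yet-visible newest update) can be summarized in a hedged sketch; the types and the visible-to-all rule below are simplified stand-ins for WiredTiger's transaction code:

#include <stdbool.h>
#include <stdint.h>

struct txn_state {
	uint64_t oldest_id;	/* oldest transaction still running */
};

struct page_modify {
	uint64_t disk_snap_min;	/* oldest ID when the page was last written */
	uint64_t update_txn;	/* newest update on the page */
};

/* Simplified: globally visible means older than every running transaction. */
static bool
txn_visible_all(const struct txn_state *txn, uint64_t id)
{
	return (id < txn->oldest_id);
}

static bool
worth_evicting(const struct txn_state *txn, const struct page_modify *mod)
{
	if (mod->disk_snap_min == txn->oldest_id)
		return (false);	/* no progress since the last write */
	if (!txn_visible_all(txn, mod->update_txn))
		return (false);	/* newest update still pins the page */
	return (true);
}

int
main(void)
{
	struct txn_state txn = { .oldest_id = 100 };
	struct page_modify mod = { .disk_snap_min = 90, .update_txn = 80 };

	return (worth_evicting(&txn, &mod) ? 0 : 1);	/* evictable here */
}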
/*
@@ -1310,7 +1333,7 @@ __evict_get_ref(
/* Get the next page queued for eviction. */
while ((evict = cache->evict_current) != NULL &&
- evict < cache->evict + candidates && evict->ref != NULL) {
+ evict < cache->evict_queue + candidates && evict->ref != NULL) {
WT_ASSERT(session, evict->btree != NULL);
/* Move to the next item. */
@@ -1321,8 +1344,8 @@ __evict_get_ref(
* multiple attempts to evict it. For pages that are already
* being evicted, this operation will fail and we will move on.
*/
- if (!WT_ATOMIC_CAS4(
- evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ if (!__wt_atomic_casv32(
+ &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
__evict_list_clear(session, evict);
continue;
}
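
The queue-claiming step is a single compare-and-swap on the page state. A self-contained sketch of the same idea using C11 atomics instead of the __wt_atomic_casv32 wrapper; the numeric values match the WT_REF_* defines introduced later in this patch:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define REF_LOCKED	2u	/* page locked for exclusive access */
#define REF_MEM		3u	/* page is in cache and valid */

int
main(void)
{
	_Atomic uint32_t state = REF_MEM;
	uint32_t expected = REF_MEM;

	/* Succeeds only if the page is still in cache and unlocked. */
	if (atomic_compare_exchange_strong(&state, &expected, REF_LOCKED))
		printf("locked for eviction\n");
	else
		printf("state is %u, skip this candidate\n", expected);
	return (0);
}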
@@ -1331,7 +1354,7 @@ __evict_get_ref(
* Increment the busy count in the btree handle to prevent it
* from being closed under us.
*/
- (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1);
+ (void)__wt_atomic_addv32(&evict->btree->evict_busy, 1);
*btreep = evict->btree;
*refp = evict->ref;
@@ -1345,7 +1368,7 @@ __evict_get_ref(
}
/* Clear the current pointer if there are no more candidates. */
- if (evict >= cache->evict + cache->evict_candidates)
+ if (evict >= cache->evict_queue + cache->evict_candidates)
cache->evict_current = NULL;
__wt_spin_unlock(session, &cache->evict_lock);
@@ -1402,15 +1425,12 @@ __evict_page(WT_SESSION_IMPL *session, int is_server)
* page-discard function assert that no dirty pages are ever
* discarded.
*/
- if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) &&
- __wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
+ __wt_page_modify_clear(session, page);
- WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+ WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0));
- (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+ (void)__wt_atomic_subv32(&btree->evict_busy, 1);
WT_RET(ret);
@@ -1427,7 +1447,7 @@ __evict_page(WT_SESSION_IMPL *session, int is_server)
* crosses its boundaries.
*/
int
-__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full)
+__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -1544,29 +1564,31 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full)
 * NOTE: this function is not called anywhere; it is intended to be called
* from a debugger.
*/
-void
-__wt_cache_dump(WT_SESSION_IMPL *session)
+int
+__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
{
- WT_BTREE *btree;
+ FILE *fp;
WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle;
- WT_REF *next_walk;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
WT_PAGE *page;
+ WT_REF *next_walk;
uint64_t file_intl_pages, file_leaf_pages;
uint64_t file_bytes, file_dirty, total_bytes;
conn = S2C(session);
total_bytes = 0;
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (ofile == NULL)
+ fp = stdout;
+ else
+ WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp));
+
+ saved_dhandle = session->dhandle;
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- btree = dhandle->handle;
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
- continue;
-
file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
next_walk = NULL;
session->dhandle = dhandle;
@@ -1581,12 +1603,14 @@ __wt_cache_dump(WT_SESSION_IMPL *session)
file_bytes += page->memory_footprint;
if (__wt_page_is_modified(page))
file_dirty += page->memory_footprint;
+ (void)__wt_fprintf(fp,
+ "%" WT_SIZET_FMT ", ", page->memory_footprint);
}
session->dhandle = NULL;
- printf("cache dump: %s%s%s%s:"
- " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
- " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+ (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t"
+ " %" PRIu64 " internal pages, %" PRIu64 " leaf pages,"
+ " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n",
dhandle->name,
dhandle->checkpoint == NULL ? "" : " [",
dhandle->checkpoint == NULL ? "" : dhandle->checkpoint,
@@ -1596,9 +1620,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session)
total_bytes += file_bytes;
}
- printf("cache dump: total found = %" PRIu64 "MB"
+ session->dhandle = saved_dhandle;
+
+ (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB"
" vs tracked inuse %" PRIu64 "MB\n",
total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
- fflush(stdout);
+ if (fp != stdout)
+ WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE));
+ return (0);
}
#endif
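
The dump's stdout-or-file plumbing follows a standard stdio pattern; a sketch using plain fopen/fclose in place of the __wt_fopen/__wt_fclose wrappers:

#include <stdio.h>

static int
dump(const char *ofile)
{
	FILE *fp;

	if (ofile == NULL)
		fp = stdout;
	else if ((fp = fopen(ofile, "w")) == NULL)
		return (-1);

	fprintf(fp, "cache dump: ...\n");

	/* Only close the stream we opened ourselves. */
	if (fp != stdout && fclose(fp) != 0)
		return (-1);
	return (0);
}

int
main(void)
{
	return (dump(NULL));	/* NULL means stdout, as in the patch */
}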
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 1e5faf45de2..11284ce7b21 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -150,17 +150,12 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
int
__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
- int evict;
-
/*
* If doing normal system eviction, but only in the service of reducing
* the number of dirty pages, leave the clean page in cache.
*/
- if (!closing) {
- __wt_cache_status(session, &evict, NULL);
- if (!evict)
- return (EBUSY);
- }
+ if (!closing && __wt_eviction_dirty_target(session))
+ return (EBUSY);
/*
* Discard the page and update the reference structure; if the page has
@@ -184,7 +179,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_ADDR *addr;
WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
- int evict;
parent = ref->home;
mod = ref->page->modify;
@@ -229,11 +223,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* push it out of cache (and read it back in, when needed), we
* would rather have more, smaller pages than fewer large pages.
*/
- if (!closing) {
- __wt_cache_status(session, &evict, NULL);
- if (!evict)
- return (EBUSY);
- }
+ if (!closing && __wt_eviction_dirty_target(session))
+ return (EBUSY);
/* Discard the parent's address. */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
@@ -309,8 +300,7 @@ __evict_review(
{
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- uint32_t reconcile_flags;
+ uint32_t flags;
/*
* Get exclusive access to the page if our caller doesn't have the tree
@@ -331,7 +321,6 @@ __evict_review(
/* Now that we have exclusive access, review the page. */
page = ref->page;
- mod = page->modify;
/*
	 * Fail if an internal page has active children; the children must be evicted
@@ -347,6 +336,13 @@ __evict_review(
/* Check if the page can be evicted. */
if (!closing) {
+ /*
+ * Update the oldest ID to avoid wasted effort should it have
+ * fallen behind current.
+ */
+ if (__wt_page_is_modified(page))
+ __wt_txn_update_oldest(session, 1);
+
if (!__wt_page_can_evict(session, page, 0, inmem_splitp))
return (EBUSY);
@@ -361,9 +357,12 @@ __evict_review(
return (__wt_split_insert(session, ref));
}
+ /* If the page is clean, we're done and we can evict. */
+ if (!__wt_page_is_modified(page))
+ return (0);
+
/*
- * If the page is dirty and can possibly change state, reconcile it to
- * determine the final state.
+ * If the page is dirty, reconcile it to decide if we can evict it.
*
* If we have an exclusive lock (we're discarding the tree), assert
* there are no updates we cannot read.
@@ -377,30 +376,38 @@ __evict_review(
* in-memory pages, (restoring the updates that stopped us from writing
* the block), and inserting the whole mess into the page's parent.
*
- * Don't set the update-restore flag for internal pages, they don't have
- * updates that can be saved and restored.
+ * Otherwise, if eviction is getting pressed, configure reconciliation
+ * to write not-yet-globally-visible updates to the lookaside table,
+ * allowing the eviction of pages we'd otherwise have to retain in cache
+ * to support older readers.
+ *
+ * Don't set the update-restore or lookaside table flags for internal
+ * pages, they don't have update lists that can be saved and restored.
*/
- reconcile_flags = WT_EVICTING;
- if (__wt_page_is_modified(page)) {
- if (closing)
- FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR);
- else if (!WT_PAGE_IS_INTERNAL(page) &&
- page->read_gen == WT_READGEN_OLDEST)
- FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE);
- WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags));
- WT_ASSERT(session,
- !__wt_page_is_modified(page) ||
- FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE));
+ flags = WT_EVICTING;
+ if (closing)
+ LF_SET(WT_VISIBILITY_ERR);
+ else if (!WT_PAGE_IS_INTERNAL(page)) {
+ if (page->read_gen == WT_READGEN_OLDEST)
+ LF_SET(WT_EVICT_UPDATE_RESTORE);
+ else if (__wt_eviction_aggressive(session))
+ LF_SET(WT_EVICT_LOOKASIDE);
}
+ WT_RET(__wt_reconcile(session, ref, NULL, flags));
+
/*
- * If the page was ever modified, make sure all of the updates
- * on the page are old enough they can be discarded from cache.
+ * Success: assert the page is clean or reconciliation was configured
+ * for an update/restore split, and if the page is clean, reconciliation
+ * was configured for a lookaside table or all updates on the page are
+ * globally visible.
*/
- if (!closing && mod != NULL &&
- !__wt_txn_visible_all(session, mod->rec_max_txn) &&
- !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE))
- return (EBUSY);
+ WT_ASSERT(session,
+ LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
+ WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_LOOKASIDE) ||
+ __wt_page_is_modified(page) ||
+ __wt_txn_visible_all(session, page->modify->rec_max_txn));
return (0);
}
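
The rewritten flag selection is easiest to see in isolation. A condensed sketch with invented flag values, mirroring the branch structure above: closing trees demand full visibility, and only leaf pages may get update/restore or lookaside treatment:

#include <stdbool.h>
#include <stdint.h>

#define EVICTING	0x01u
#define VISIBILITY_ERR	0x02u
#define UPDATE_RESTORE	0x04u
#define LOOKASIDE	0x08u

static uint32_t
choose_flags(bool closing, bool internal, bool oldest, bool aggressive)
{
	uint32_t flags = EVICTING;

	if (closing)
		flags |= VISIBILITY_ERR;
	else if (!internal) {
		if (oldest)
			flags |= UPDATE_RESTORE;
		else if (aggressive)
			flags |= LOOKASIDE;
	}
	return (flags);
}

int
main(void)
{
	/* Aggressive eviction of a leaf page enables lookaside writes. */
	return (choose_flags(false, false, false, true) ==
	    (EVICTING | LOOKASIDE) ? 0 : 1);
}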
diff --git a/src/include/async.h b/src/include/async.h
index 88ecad6eb2c..fb9a64e774d 100644
--- a/src/include/async.h
+++ b/src/include/async.h
@@ -6,20 +6,6 @@
* See the file LICENSE for redistribution information.
*/
-typedef enum {
- WT_ASYNCOP_ENQUEUED, /* Placed on the work queue */
- WT_ASYNCOP_FREE, /* Able to be allocated to user */
- WT_ASYNCOP_READY, /* Allocated and ready for user to use */
- WT_ASYNCOP_WORKING /* Operation in progress by worker */
-} WT_ASYNC_STATE;
-
-typedef enum {
- WT_ASYNC_FLUSH_NONE=0, /* No flush in progress */
- WT_ASYNC_FLUSH_COMPLETE, /* Notify flush caller it's done */
- WT_ASYNC_FLUSH_IN_PROGRESS, /* Prevent other callers */
- WT_ASYNC_FLUSHING /* Notify workers */
-} WT_ASYNC_FLUSH_STATE;
-
#define MAX_ASYNC_SLEEP_USECS 100000 /* Maximum sleep waiting for work */
#define MAX_ASYNC_YIELD 200 /* Maximum number of yields for work */
@@ -31,7 +17,7 @@ typedef enum {
* The URI/config/format cache.
*/
struct __wt_async_format {
- STAILQ_ENTRY(__wt_async_format) q;
+ TAILQ_ENTRY(__wt_async_format) q;
const char *config;
uint64_t cfg_hash; /* Config hash */
const char *uri;
@@ -53,7 +39,13 @@ struct __wt_async_op_impl {
uint64_t unique_id; /* Unique identifier. */
WT_ASYNC_FORMAT *format; /* Format structure */
- WT_ASYNC_STATE state; /* Op state */
+
+#define WT_ASYNCOP_ENQUEUED 0 /* Placed on the work queue */
+#define WT_ASYNCOP_FREE 1 /* Able to be allocated to user */
+#define WT_ASYNCOP_READY 2 /* Allocated, ready for user to use */
+#define WT_ASYNCOP_WORKING 3 /* Operation in progress by worker */
+ uint32_t state;
+
WT_ASYNC_OPTYPE optype; /* Operation type */
};
@@ -88,10 +80,16 @@ struct __wt_async {
uint64_t alloc_tail; /* Next slot to dequeue */
uint64_t tail_slot; /* Worker slot consumed */
- STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
- int cur_queue; /* Currently enqueued */
- int max_queue; /* Maximum enqueued */
- WT_ASYNC_FLUSH_STATE flush_state; /* Queue flush state */
+ TAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
+ uint32_t cur_queue; /* Currently enqueued */
+ uint32_t max_queue; /* Maximum enqueued */
+
+#define WT_ASYNC_FLUSH_NONE 0 /* No flush in progress */
+#define WT_ASYNC_FLUSH_COMPLETE 1 /* Notify flush caller done */
+#define WT_ASYNC_FLUSH_IN_PROGRESS 2 /* Prevent other callers */
+#define WT_ASYNC_FLUSHING 3 /* Notify workers */
+ uint32_t flush_state;
+
/* Notify any waiting threads when flushing is done. */
WT_CONDVAR *flush_cond;
WT_ASYNC_OP_IMPL flush_op; /* Special flush op */
@@ -112,7 +110,7 @@ struct __wt_async {
* has a cache of async cursors to reuse for operations.
*/
struct __wt_async_cursor {
- STAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
+ TAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
uint64_t cfg_hash; /* Config hash */
uint64_t uri_hash; /* URI hash */
WT_CURSOR *c; /* WT cursor */
@@ -124,6 +122,6 @@ struct __wt_async_cursor {
*/
struct __wt_async_worker_state {
uint32_t id;
- STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
+ TAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
uint32_t num_cursors;
};
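
The STAILQ-to-TAILQ conversions throughout this patch buy O(1) removal from anywhere in a list (STAILQ_REMOVE must scan from the head to find the predecessor). A self-contained sys/queue.h example, not WiredTiger source:

#include <sys/queue.h>
#include <stdio.h>

struct cursor_cache {
	int id;
	TAILQ_ENTRY(cursor_cache) q;	/* forward and back pointers */
};

int
main(void)
{
	TAILQ_HEAD(, cursor_cache) qh = TAILQ_HEAD_INITIALIZER(qh);
	struct cursor_cache a = { .id = 1 }, b = { .id = 2 };

	TAILQ_INSERT_TAIL(&qh, &a, q);
	TAILQ_INSERT_TAIL(&qh, &b, q);
	TAILQ_REMOVE(&qh, &a, q);	/* O(1), no list scan needed */

	printf("head id = %d\n", TAILQ_FIRST(&qh)->id);
	return (0);
}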
diff --git a/src/include/bitstring.i b/src/include/bitstring.i
index c548c12761d..5449ffe6209 100644
--- a/src/include/bitstring.i
+++ b/src/include/bitstring.i
@@ -84,10 +84,10 @@ __bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
* __bit_test --
* Test one bit in name.
*/
-static inline int
+static inline bool
__bit_test(uint8_t *bitf, uint64_t bit)
{
- return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+ return ((bitf[__bit_byte(bit)] & __bit_mask(bit)) != 0);
}
/*
diff --git a/src/include/block.h b/src/include/block.h
index 795d646db1e..ce33b331e76 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -215,8 +215,8 @@ struct __wt_block {
/* A list of block manager handles, sharing a file descriptor. */
uint32_t ref; /* References */
WT_FH *fh; /* Backing file handle */
- SLIST_ENTRY(__wt_block) l; /* Linked list of handles */
- SLIST_ENTRY(__wt_block) hashl; /* Hashed list of handles */
+ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
+ TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */
/* Configuration information, set when the file is opened. */
uint32_t allocfirst; /* Allocation is first-fit */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index f13504d66ca..f214ddb1dc3 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -6,6 +6,8 @@
* See the file LICENSE for redistribution information.
*/
+#define WT_RECNO_OOB 0 /* Illegal record number */
+
/*
* WT_PAGE_HEADER --
* Blocks have a common header, a WT_PAGE_HEADER structure followed by a
@@ -43,6 +45,7 @@ struct __wt_page_header {
#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */
+#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */
uint8_t flags; /* 25: flags */
/*
@@ -168,6 +171,29 @@ struct __wt_ovfl_txnc {
};
/*
+ * Lookaside table support: when a page is being reconciled for eviction and has
+ * updates that might be required by earlier readers in the system, the updates
+ * are written into a lookaside table, and restored as necessary if the page is
+ * read. The key is a unique marker for the page (a file ID plus an address),
+ * a counter (used to ensure the update records remain in the original order),
+ * the on-page item's transaction ID (so we can discard any update records from
+ * the lookaside table once the on-page item's transaction is globally visible),
+ * and the page key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, update size and value.
+ *
+ * As the key for the lookaside table is different for row- and column-store, we
+ * store both key types in a WT_ITEM, building/parsing them in the code, because
+ * otherwise we'd need two lookaside files with different key formats. We could
+ * make the lookaside table's key standard by moving the source key into the
+ * lookaside table value, but that doesn't make the coding any simpler, and it
+ * makes the lookaside table's value more likely to overflow the page size when
+ * the row-store key is relatively large.
+ */
+#define WT_LAS_FORMAT \
+ "key_format=" WT_UNCHECKED_STRING(IuQQu) \
+ ",value_format=" WT_UNCHECKED_STRING(QIu)
+
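
For illustration only, the WT_LAS_FORMAT strings could be handed to the public WT_SESSION::create API as below; the table name is invented, and this is not how the internal lookaside table is actually created:

#include <wiredtiger.h>

static int
create_lookaside_like_table(WT_SESSION *session)
{
	/* IuQQu key: file ID, address, counter, txn ID, source key. */
	/* QIu value: update txn ID, update size, update data. */
	return (session->create(session, "table:las_demo",
	    "key_format=IuQQu,value_format=QIu"));
}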
+/*
* WT_PAGE_MODIFY --
* When a page is modified, there's additional information to maintain.
*/
@@ -238,15 +264,17 @@ struct __wt_page_modify {
* Eviction, but block wasn't written: unresolved updates and
* associated disk image.
*
- * Skipped updates are either a WT_INSERT, or a row-store leaf
- * page entry.
+ * Saved updates are either a WT_INSERT, or a row-store leaf
+ * page entry; in the case of creating lookaside records, there
+ * is an additional value, the committed item's transaction ID.
*/
- struct __wt_upd_skipped {
+ struct __wt_save_upd {
WT_INSERT *ins;
WT_ROW *rip;
- } *skip;
- uint32_t skip_entries;
- void *skip_dsk;
+ uint64_t onpage_txn;
+ } *supd;
+ uint32_t supd_entries;
+ void *supd_dsk;
/*
* Block was written: address, size and checksum.
@@ -556,9 +584,8 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
@@ -656,14 +683,6 @@ struct __wt_page {
* to the readers. If the evicting thread does not find a hazard pointer,
* the page is evicted.
*/
-typedef enum __wt_page_state {
- WT_REF_DISK=0, /* Page is on disk */
- WT_REF_DELETED, /* Page is on disk, but deleted */
- WT_REF_LOCKED, /* Page locked for exclusive access */
- WT_REF_MEM, /* Page is in cache and valid */
- WT_REF_READING, /* Page being read */
- WT_REF_SPLIT /* Parent page split (WT_REF dead) */
-} WT_PAGE_STATE;
/*
* WT_PAGE_DELETED --
@@ -691,7 +710,13 @@ struct __wt_ref {
WT_PAGE * volatile home; /* Reference page */
uint32_t pindex_hint; /* Reference page index hint */
- volatile WT_PAGE_STATE state; /* Page state */
+#define WT_REF_DISK 0 /* Page is on disk */
+#define WT_REF_DELETED 1 /* Page is on disk, but deleted */
+#define WT_REF_LOCKED 2 /* Page locked for exclusive access */
+#define WT_REF_MEM 3 /* Page is in cache and valid */
+#define WT_REF_READING 4 /* Page being read */
+#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */
+ volatile uint32_t state; /* Page state */
/*
* Address: on-page cell if read from backing block, off-page WT_ADDR
@@ -871,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update)
* store 4GB objects; I'd rather do that than increase the size of this
* structure for a flag bit.
*/
-#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
-#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+#define WT_UPDATE_DELETED_VALUE UINT32_MAX
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE)
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE)
uint32_t size; /* update length */
/* The untyped value immediately follows the WT_UPDATE structure. */
@@ -958,7 +984,7 @@ struct __wt_insert {
#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
if (((v) = (dest)) == NULL) { \
WT_ERR(__wt_calloc_def(s, count, &(v))); \
- if (WT_ATOMIC_CAS8(dest, NULL, v)) \
+ if (__wt_atomic_cas_ptr(&dest, NULL, v)) \
__wt_cache_page_inmem_incr( \
s, page, (count) * sizeof(*(v))); \
else \
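
WT_PAGE_ALLOC_AND_SWAP is the classic allocate-then-publish idiom: allocate outside the race, install with a compare-and-swap against NULL, and free the loser's allocation. A generic sketch in C11 atomics, with invented names:

#include <stdatomic.h>
#include <stdlib.h>

static int
alloc_and_swap(void *_Atomic *slot, size_t size)
{
	void *expected, *v;

	if (atomic_load(slot) != NULL)
		return (0);		/* already populated */
	if ((v = calloc(1, size)) == NULL)
		return (-1);
	expected = NULL;
	if (!atomic_compare_exchange_strong(slot, &expected, v))
		free(v);		/* another thread won the race */
	return (0);
}

int
main(void)
{
	static void *_Atomic slot;

	return (alloc_and_swap(&slot, 64));
}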
diff --git a/src/include/btree.h b/src/include/btree.h
index deecd8f6d88..98ce4c22c10 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -146,12 +146,14 @@ struct __wt_btree {
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
#define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */
-#define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */
-#define WT_BTREE_NO_LOGGING 0x00800 /* Disable logging */
-#define WT_BTREE_SALVAGE 0x01000 /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x02000 /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x04000 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x08000 /* Handle is for verify */
+#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */
+#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */
+#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */
+#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */
+#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/include/btree.i b/src/include/btree.i
index d13ec1972fb..b54cecb6ce0 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -10,17 +10,17 @@
* __wt_ref_is_root --
* Return if the page reference is for the root page.
*/
-static inline int
+static inline bool
__wt_ref_is_root(WT_REF *ref)
{
- return (ref->home == NULL ? 1 : 0);
+ return (ref->home == NULL);
}
/*
* __wt_page_is_empty --
* Return if the page is empty.
*/
-static inline int
+static inline bool
__wt_page_is_empty(WT_PAGE *page)
{
return (page->modify != NULL &&
@@ -31,10 +31,10 @@ __wt_page_is_empty(WT_PAGE *page)
* __wt_page_is_modified --
* Return if the page is dirty.
*/
-static inline int
+static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0);
+ return (page->modify != NULL && page->modify->write_gen != 0);
}
/*
@@ -49,46 +49,74 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
WT_ASSERT(session, size < WT_EXABYTE);
cache = S2C(session)->cache;
- (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size);
- (void)WT_ATOMIC_ADD8(page->memory_footprint, size);
+ (void)__wt_atomic_add64(&cache->bytes_inmem, size);
+ (void)__wt_atomic_addsize(&page->memory_footprint, size);
if (__wt_page_is_modified(page)) {
- (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
- (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ (void)__wt_atomic_add64(&cache->bytes_dirty, size);
+ (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
}
/* Track internal and overflow size in cache. */
if (WT_PAGE_IS_INTERNAL(page))
- (void)WT_ATOMIC_ADD8(cache->bytes_internal, size);
+ (void)__wt_atomic_add64(&cache->bytes_internal, size);
else if (page->type == WT_PAGE_OVFL)
- (void)WT_ATOMIC_ADD8(cache->bytes_overflow, size);
+ (void)__wt_atomic_add64(&cache->bytes_overflow, size);
}
-/*
- * WT_CACHE_DECR --
- * Macro to decrement a field by a size.
- *
- * Be defensive and don't underflow: a band-aid on a gaping wound, but underflow
- * won't make things better no matter the problem (specifically, underflow makes
- * eviction crazy trying to evict non-existent memory).
+/*
+ * __wt_cache_decr_check_size --
+ * Decrement a size_t cache value and check for underflow.
*/
+static inline void
+__wt_cache_decr_check_size(
+ WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld)
+{
+ if (__wt_atomic_subsize(vp, v) < WT_EXABYTE)
+ return;
+
#ifdef HAVE_DIAGNOSTIC
-#define WT_CACHE_DECR(session, f, sz) do { \
- static int __first = 1; \
- if (WT_ATOMIC_SUB8(f, sz) > WT_EXABYTE) { \
- (void)WT_ATOMIC_ADD8(f, sz); \
- if (__first) { \
- __wt_errx(session, \
- "%s underflow: decrementing %" WT_SIZET_FMT,\
- #f, sz); \
- __first = 0; \
- } \
- } \
-} while (0)
+ (void)__wt_atomic_addsize(vp, v);
+
+ {
+ static int first = 1;
+
+ if (!first)
+ return;
+ __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
+ first = 0;
+ }
#else
-#define WT_CACHE_DECR(s, f, sz) do { \
- if (WT_ATOMIC_SUB8(f, sz) > WT_EXABYTE) \
- (void)WT_ATOMIC_ADD8(f, sz); \
-} while (0)
+ WT_UNUSED(fld);
+ WT_UNUSED(session);
#endif
+}
+
+/*
+ * __wt_cache_decr_check_uint64 --
+ * Decrement a uint64_t cache value and check for underflow.
+ */
+static inline void
+__wt_cache_decr_check_uint64(
+ WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld)
+{
+ if (__wt_atomic_sub64(vp, v) < WT_EXABYTE)
+ return;
+
+#ifdef HAVE_DIAGNOSTIC
+ (void)__wt_atomic_add64(vp, v);
+
+ {
+ static int first = 1;
+
+ if (!first)
+ return;
+ __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
+ first = 0;
+ }
+#else
+ WT_UNUSED(fld);
+ WT_UNUSED(session);
+#endif
+}
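
Both helpers share one trick: after an atomic subtract, any result at or above an impossible threshold (an exabyte) means the unsigned counter wrapped below zero. A standalone sketch of the guard, assuming C11 atomics rather than the __wt_atomic wrappers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define EXABYTE	(1152921504606846976ULL)	/* 2^60: no cache is this big */

static void
decr_check(_Atomic uint64_t *vp, uint64_t v, const char *fld)
{
	/* atomic_fetch_sub returns the old value; old - v is the new one. */
	if (atomic_fetch_sub(vp, v) - v < EXABYTE)
		return;
	(void)atomic_fetch_add(vp, v);		/* undo the wrap */
	fprintf(stderr, "%s underflow: decrementing %llu\n",
	    fld, (unsigned long long)v);
}

int
main(void)
{
	_Atomic uint64_t bytes = 10;

	decr_check(&bytes, 100, "bytes_inmem");	/* triggers the guard */
	return (0);
}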
/*
* __wt_cache_page_byte_dirty_decr --
@@ -128,9 +156,10 @@ __wt_cache_page_byte_dirty_decr(
*/
orig = page->modify->bytes_dirty;
decr = WT_MIN(size, orig);
- if (WT_ATOMIC_CAS8(
- page->modify->bytes_dirty, orig, orig - decr)) {
- WT_CACHE_DECR(session, cache->bytes_dirty, decr);
+ if (__wt_atomic_cassize(
+ &page->modify->bytes_dirty, orig, orig - decr)) {
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_dirty, decr, "WT_CACHE.bytes_dirty");
break;
}
}
@@ -149,15 +178,19 @@ __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
WT_ASSERT(session, size < WT_EXABYTE);
- WT_CACHE_DECR(session, cache->bytes_inmem, size);
- WT_CACHE_DECR(session, page->memory_footprint, size);
+ __wt_cache_decr_check_uint64(
+ session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem");
+ __wt_cache_decr_check_size(
+ session, &page->memory_footprint, size, "WT_PAGE.memory_footprint");
if (__wt_page_is_modified(page))
__wt_cache_page_byte_dirty_decr(session, page, size);
/* Track internal and overflow size in cache. */
if (WT_PAGE_IS_INTERNAL(page))
- WT_CACHE_DECR(session, cache->bytes_internal, size);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_internal, size, "WT_CACHE.bytes_internal");
else if (page->type == WT_PAGE_OVFL)
- WT_CACHE_DECR(session, cache->bytes_overflow, size);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_overflow, size, "WT_CACHE.bytes_overflow");
}
/*
@@ -172,15 +205,15 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
size_t size;
cache = S2C(session)->cache;
- (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1);
+ (void)__wt_atomic_add64(&cache->pages_dirty, 1);
/*
* Take care to read the memory_footprint once in case we are racing
* with updates.
*/
size = page->memory_footprint;
- (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
- (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ (void)__wt_atomic_add64(&cache->bytes_dirty, size);
+ (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
}
/*
@@ -202,7 +235,7 @@ __wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
"count went negative");
cache->pages_dirty = 0;
} else
- (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1);
+ (void)__wt_atomic_sub64(&cache->pages_dirty, 1);
modify = page->modify;
if (modify != NULL && modify->bytes_dirty != 0)
@@ -224,12 +257,15 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
modify = page->modify;
/* Update the bytes in-memory to reflect the eviction. */
- WT_CACHE_DECR(session, cache->bytes_inmem, page->memory_footprint);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_inmem,
+ page->memory_footprint, "WT_CACHE.bytes_inmem");
/* Update the bytes_internal value to reflect the eviction */
if (WT_PAGE_IS_INTERNAL(page))
- WT_CACHE_DECR(session,
- cache->bytes_internal, page->memory_footprint);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_internal,
+ page->memory_footprint, "WT_CACHE.bytes_internal");
/* Update the cache's dirty-byte count. */
if (modify != NULL && modify->bytes_dirty != 0) {
@@ -239,13 +275,14 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
"dirty byte count went negative");
cache->bytes_dirty = 0;
} else
- WT_CACHE_DECR(
- session, cache->bytes_dirty, modify->bytes_dirty);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_dirty,
+ modify->bytes_dirty, "WT_CACHE.bytes_dirty");
}
/* Update pages and bytes evicted. */
- (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint);
- (void)WT_ATOMIC_ADD8(cache->pages_evict, 1);
+ (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint);
+ (void)__wt_atomic_add64(&cache->pages_evict, 1);
}
/*
@@ -306,7 +343,7 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* Every time the page transitions from clean to dirty, update the cache
* and transactional information.
*/
- if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) {
+ if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
__wt_cache_dirty_incr(session, page);
/*
@@ -321,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* have committed in the meantime, and the last_running field
* been updated past it. That is all very unlikely, but not
* impossible, so we take care to read the global state before
- * the atomic increment. If we raced with reconciliation, just
- * leave the previous value here: at worst, we will write a
- * page in a checkpoint when not absolutely necessary.
+ * the atomic increment.
+ *
+ * If the page was dirty on entry, then last_running == 0. The
+ * page could have become clean since then, if reconciliation
+ * completed. In that case, we leave the previous value for
+ * first_dirty_txn rather than potentially racing to update it,
+ * at worst, we'll unnecessarily write a page in a checkpoint.
*/
if (last_running != 0)
page->modify->first_dirty_txn = last_running;
@@ -335,6 +376,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __wt_page_modify_clear --
+ * Clean a modified page.
+ */
+static inline void
+__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /*
+ * The page must be held exclusive when this call is made, this call
+ * can only be used when the page is owned by a single thread.
+ *
+ * Allow the call to be made on clean pages.
+ */
+ if (__wt_page_is_modified(page)) {
+ page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ }
+}
+
+/*
* __wt_page_modify_set --
* Mark the page and tree dirty.
*/
@@ -354,6 +414,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* shouldn't cause problems; regardless, let's play it safe.)
*/
if (S2BT(session)->modified == 0) {
+ /* Assert we never dirty a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
S2BT(session)->modified = 1;
WT_FULL_BARRIER();
}
@@ -395,7 +458,7 @@ __wt_page_parent_modify_set(
* __wt_off_page --
* Return if a pointer references off-page data.
*/
-static inline int
+static inline bool
__wt_off_page(WT_PAGE *page, const void *p)
{
/*
@@ -496,7 +559,12 @@ __wt_ref_key_instantiated(WT_REF *ref)
static inline void
__wt_ref_key_clear(WT_REF *ref)
{
- /* The key union has 2 fields, both of which are 8B. */
+ /*
+ * The key union has 2 8B fields; this is equivalent to:
+ *
+ * ref->key.recno = WT_RECNO_OOB;
+ * ref->key.ikey = NULL;
+ */
ref->key.recno = 0;
}
@@ -506,7 +574,7 @@ __wt_ref_key_clear(WT_REF *ref)
* had without unpacking a cell, and information about the cell, if the key
* isn't cheaply available.
*/
-static inline int
+static inline bool
__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
{
@@ -597,7 +665,7 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
if (cellp != NULL)
*cellp =
WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
- return (0);
+ return (false);
case WT_K_FLAG:
/* Encoded key: no instantiated key, no cell. */
if (cellp != NULL)
@@ -608,9 +676,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
*(void **)datap =
WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
*sizep = WT_K_DECODE_KEY_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
case WT_KV_FLAG:
/* Encoded key/value pair: no instantiated key, no cell. */
if (cellp != NULL)
@@ -621,9 +689,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
*(void **)datap = WT_PAGE_REF_OFFSET(
page, WT_KV_DECODE_KEY_OFFSET(v));
*sizep = WT_KV_DECODE_KEY_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
@@ -636,9 +704,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
if (datap != NULL) {
*(void **)datap = WT_IKEY_DATA(ikey);
*sizep = ikey->size;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -826,7 +894,7 @@ __wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
* __wt_row_leaf_value --
* Return the value for a row-store leaf page encoded key/value pair.
*/
-static inline int
+static inline bool
__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
{
uintptr_t v;
@@ -842,9 +910,9 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
value->data =
WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
value->size = WT_KV_DECODE_VALUE_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -903,11 +971,13 @@ __wt_ref_info(WT_SESSION_IMPL *session,
* __wt_page_can_split --
* Check whether a page can be split in memory.
*/
-static inline int
+static inline bool
__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_INSERT_HEAD *ins_head;
+ WT_INSERT *ins;
+ int i;
btree = S2BT(session);
@@ -916,58 +986,54 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* of the page could continually split without benefit.
*/
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
- return (0);
+ return (false);
/*
* Check for pages with append-only workloads. A common application
* pattern is to have multiple threads frantically appending to the
* tree. We want to reconcile and evict this page, but we'd like to
- * do it without making the appending threads wait. If we're not
- * discarding the tree, check and see if it's worth doing a split to
- * let the threads continue before doing eviction.
- *
- * Ignore anything other than large, dirty row-store leaf pages.
+ * do it without making the appending threads wait. See if it's worth
+ * doing a split to let the threads continue before doing eviction.
*
- * XXX KEITH
- * Need a better test for append-only workloads.
+ * Ignore anything other than large, dirty row-store leaf pages. The
+ * split code only supports row-store pages, and we depend on the page
+ * being dirty for correctness (the page must be reconciled again
+ * before being evicted after the split, information from a previous
+ * reconciliation will be wrong, so we can't evict immediately).
*/
if (page->type != WT_PAGE_ROW_LEAF ||
page->memory_footprint < btree->maxmempage ||
!__wt_page_is_modified(page))
- return (0);
-
- /* Don't split a page that is pending a multi-block split. */
- if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK))
- return (0);
+ return (false);
/*
	 * There is no point splitting if the list is small; no deep items is
- * our heuristic for that. (A 1/4 probability of adding a new skiplist
- * level means there will be a new 6th level for roughly each 4KB of
- * entries in the list. If we have at least two 6th level entries, the
- * list is at least large enough to work with.)
- *
- * The following code requires at least two items on the insert list,
- * this test serves the additional purpose of confirming that.
+ * our heuristic for that. A 1/4 probability of adding a new skiplist
+ * level, with level-0 always created, means there will be a 5th level
+ * entry for roughly every 1024 entries in the list. If there are at
+ * least 4 5th level entries (4K items), the list is large enough.
*/
-#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1)
+#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1)
ins_head = page->pg_row_entries == 0 ?
WT_ROW_INSERT_SMALLEST(page) :
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
- if (ins_head == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] ==
- ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH])
- return (0);
-
- return (1);
+ if (ins_head == NULL)
+ return (false);
+ for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH];
+ ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH])
+ if (++i == 4) {
+ WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable);
+ WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable);
+ return (true);
+ }
+ return (false);
}
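
The revised depth heuristic is plain skiplist arithmetic; a few lines verifying the 1-in-1024 figure quoted in the comment:

#include <stdio.h>

int
main(void)
{
	double p = 1.0;
	int level;

	/* Level 0 is always created; each further level has probability 1/4. */
	for (level = 1; level <= 5; ++level)
		p /= 4.0;
	printf("P(level 5) = %g, items per level-5 entry = %g\n",
	    p, 1.0 / p);	/* 1/1024 and 1024: 4 entries ~ 4K items */
	return (0);
}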
/*
* __wt_page_can_evict --
* Check whether a page can be evicted.
*/
-static inline int
+static inline bool
__wt_page_can_evict(WT_SESSION_IMPL *session,
WT_PAGE *page, int check_splits, int *inmem_splitp)
{
@@ -980,11 +1046,22 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
btree = S2BT(session);
mod = page->modify;
- txn_global = &S2C(session)->txn_global;
/* Pages that have never been modified can always be evicted. */
if (mod == NULL)
- return (1);
+ return (true);
+
+ /*
+ * Check for in-memory splits before other eviction tests. If the page
+ * should split in-memory, return success immediately and skip more
+ * detailed eviction tests. We don't need further tests since the page
+ * won't be written or discarded from the cache.
+ */
+ if (__wt_page_can_split(session, page)) {
+ if (inmem_splitp != NULL)
+ *inmem_splitp = 1;
+ return (true);
+ }
/*
* If the tree was deepened, there's a requirement that newly created
@@ -997,20 +1074,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
*/
if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
!__wt_txn_visible_all(session, mod->mod_split_txn))
- return (0);
-
- /*
- * Allow for the splitting of pages when a checkpoint is underway only
- * if the allow_splits flag has been passed, we know we are performing
- * a checkpoint, the page is larger than the stated maximum and there
- * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK
- * flag is unset.
- */
- if (__wt_page_can_split(session, page)) {
- if (inmem_splitp != NULL)
- *inmem_splitp = 1;
- return (1);
- }
+ return (false);
/*
* If the file is being checkpointed, we can't evict dirty pages:
@@ -1018,48 +1082,27 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* previous version might be referenced by an internal page already
* been written in the checkpoint, leaving the checkpoint inconsistent.
*/
- if (btree->checkpointing &&
- (__wt_page_is_modified(page) ||
- F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
+ if (btree->checkpointing && __wt_page_is_modified(page)) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
- return (0);
+ return (false);
}
/*
- * If we aren't (potentially) doing eviction that can restore updates
- * and the updates on this page are too recent, give up.
- *
- * Don't rely on new updates being skipped by the transaction used
- * for transaction reads: (1) there are paths that dirty pages for
- * artificial reasons; (2) internal pages aren't transactional; and
- * (3) if an update was skipped during the checkpoint (leaving the page
- * dirty), then rolled back, we could still successfully overwrite a
- * page and corrupt the checkpoint.
+ * If the page was recently split in-memory, don't evict it immediately:
+ * we want to give application threads that are appending a chance to
+ * move to the new leaf page created by the split.
*
- * Further, we can't race with the checkpoint's reconciliation of
- * an internal page as we evict a clean child from the page's subtree.
- * This works in the usual way: eviction locks the page and then checks
- * for existing hazard pointers, the checkpoint thread reconciling an
- * internal page acquires hazard pointers on child pages it reads, and
- * is blocked by the exclusive lock.
- */
- if (page->read_gen != WT_READGEN_OLDEST &&
- !__wt_txn_visible_all(session, __wt_page_is_modified(page) ?
- mod->update_txn : mod->rec_max_txn))
- return (0);
-
- /*
- * If the page was recently split in-memory, don't force it out: we
- * hope an eviction thread will find it first. The check here is
- * similar to __wt_txn_visible_all, but ignores the checkpoint's
- * transaction.
+ * Note the check here is similar to __wt_txn_visible_all, but ignores
+ * the checkpoint's transaction.
*/
- if (check_splits &&
- WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
- return (0);
+ if (check_splits) {
+ txn_global = &S2C(session)->txn_global;
+ if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
+ return (false);
+ }
- return (1);
+ return (true);
}
/*
@@ -1082,17 +1125,17 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
* reference without first locking the page, it could be evicted in
* between.
*/
- locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED);
+ locked = __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED);
if ((ret = __wt_hazard_clear(session, page)) != 0 || !locked) {
if (locked)
ref->state = WT_REF_MEM;
return (ret == 0 ? EBUSY : ret);
}
- (void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
+ (void)__wt_atomic_addv32(&btree->evict_busy, 1);
too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0;
- if ((ret = __wt_evict_page(session, ref)) == 0) {
+ if ((ret = __wt_evict(session, ref, 0)) == 0) {
if (too_big)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
else
@@ -1106,7 +1149,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
} else
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
- (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+ (void)__wt_atomic_subv32(&btree->evict_busy, 1);
return (ret);
}
@@ -1143,12 +1186,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
* memory_page_max setting, when we see many deleted items, and when we
* are attempting to scan without trashing the cache.
*
- * Fast checks if eviction is disabled for this operation or this tree,
- * then perform a general check if eviction will be possible.
+ * Fast checks if eviction is disabled for this handle, operation or
+ * tree, then perform a general check if eviction will be possible.
*/
page = ref->page;
if (page->read_gen != WT_READGEN_OLDEST ||
LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION) ||
F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
!__wt_page_can_evict(session, page, 1, NULL))
return (__wt_hazard_clear(session, page));
@@ -1264,13 +1308,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session)
}
/*
- * __wt_btree_lsm_size --
+ * __wt_btree_lsm_over_size --
* Return if the size of an in-memory tree with a single leaf page is over
* a specified maximum. If called on anything other than a simple tree with a
* single leaf page, returns true so our LSM caller will switch to a new tree.
*/
-static inline int
-__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
+static inline bool
+__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
{
WT_BTREE *btree;
WT_PAGE *child, *root;
@@ -1282,20 +1326,20 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
/* Check for a non-existent tree. */
if (root == NULL)
- return (0);
+ return (false);
/* A tree that can be evicted always requires a switch. */
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
- return (1);
+ return (true);
/* Check for a tree with a single leaf page. */
WT_INTL_INDEX_GET(session, root, pindex);
if (pindex->entries != 1) /* > 1 child page, switch */
- return (1);
+ return (true);
first = pindex->index[0];
if (first->state != WT_REF_MEM) /* no child page, ignore */
- return (0);
+ return (false);
/*
* We're reaching down into the page without a hazard pointer, but
@@ -1304,7 +1348,7 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
*/
child = first->page;
if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */
- return (1);
+ return (true);
return (child->memory_footprint > maxsize);
}
diff --git a/src/include/cache.h b/src/include/cache.h
index ed93f82538c..f199372ea5e 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -18,11 +18,6 @@
#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
-#define WT_EVICT_PASS_AGGRESSIVE 0x01
-#define WT_EVICT_PASS_ALL 0x02
-#define WT_EVICT_PASS_DIRTY 0x04
-#define WT_EVICT_PASS_WOULD_BLOCK 0x08
-
/*
* WT_EVICT_ENTRY --
* Encapsulation of an eviction candidate.
@@ -96,7 +91,7 @@ struct __wt_cache {
/*
* LRU eviction list information.
*/
- WT_EVICT_ENTRY *evict; /* LRU pages being tracked */
+ WT_EVICT_ENTRY *evict_queue; /* LRU pages being tracked */
WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
uint32_t evict_candidates; /* LRU list pages to evict */
uint32_t evict_entries; /* LRU entries in the queue */
@@ -109,6 +104,7 @@ struct __wt_cache {
* Cache pool information.
*/
uint64_t cp_pass_pressure; /* Calculated pressure from this pass */
+ uint64_t cp_quota; /* Maximum size for this cache */
uint64_t cp_reserved; /* Base size for this cache */
WT_SESSION_IMPL *cp_session; /* May be used for cache management */
uint32_t cp_skip_count; /* Post change stabilization */
@@ -119,6 +115,15 @@ struct __wt_cache {
uint64_t cp_saved_read; /* Read count at last review */
/*
+ * Work state.
+ */
+#define WT_EVICT_PASS_AGGRESSIVE 0x01
+#define WT_EVICT_PASS_ALL 0x02
+#define WT_EVICT_PASS_DIRTY 0x04
+#define WT_EVICT_PASS_WOULD_BLOCK 0x08
+ uint32_t state;
+
+ /*
* Flags.
*/
#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
@@ -140,6 +145,7 @@ struct __wt_cache_pool {
const char *name;
uint64_t size;
uint64_t chunk;
+ uint64_t quota;
uint64_t currently_used;
uint32_t refs; /* Reference count for structure. */
/* Locked: List of connections participating in the cache pool. */
diff --git a/src/include/cache.i b/src/include/cache.i
index 87f8c5543d1..bc33f82d927 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -104,48 +104,6 @@ __wt_cache_dirty_inuse(WT_CACHE *cache)
}
/*
- * __wt_cache_status --
- * Return if the cache usage exceeds the eviction or dirty targets.
- */
-static inline void
-__wt_cache_status(WT_SESSION_IMPL *session, int *evictp, int *dirtyp)
-{
- WT_CONNECTION_IMPL *conn;
- WT_CACHE *cache;
- uint64_t bytes_inuse, bytes_max, dirty_inuse;
-
- conn = S2C(session);
- cache = conn->cache;
-
- /*
- * There's an assumption "evict" overrides "dirty", that is, if eviction
- * is required, we no longer care where we are with respect to the dirty
- * target.
- *
- * Avoid division by zero if the cache size has not yet been set in a
- * shared cache.
- */
- bytes_max = conn->cache_size + 1;
- if (evictp != NULL) {
- bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) {
- *evictp = 1;
- return;
- }
- *evictp = 0;
- }
- if (dirtyp != NULL) {
- dirty_inuse = __wt_cache_dirty_inuse(cache);
- if (dirty_inuse >
- (cache->eviction_dirty_target * bytes_max) / 100) {
- *dirtyp = 1;
- return;
- }
- *dirtyp = 0;
- }
-}
-
-/*
* __wt_session_can_wait --
 * Return if a session is available for a potentially slow operation.
*/
@@ -161,29 +119,52 @@ __wt_session_can_wait(WT_SESSION_IMPL *session)
return (0);
/*
- * LSM sets the no-cache-check flag when holding the LSM tree lock,
+ * LSM sets the no-eviction flag when holding the LSM tree lock,
* in that case, or when holding the schema lock, we don't want to
 * hijack the thread for eviction.
*/
if (F_ISSET(session,
- WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA))
+ WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA))
return (0);
return (1);
}
/*
+ * __wt_eviction_aggressive --
+ * Return if the eviction server is running in aggressive mode.
+ */
+static inline int
+__wt_eviction_aggressive(WT_SESSION_IMPL *session)
+{
+ return (FLD_ISSET(
+ S2C(session)->cache->state, WT_EVICT_PASS_AGGRESSIVE) ? 1 : 0);
+}
+
+/*
+ * __wt_eviction_dirty_target --
+ * Return if the eviction server is running to reduce the number of dirty
+ * pages (versus running to discard pages from the cache).
+ */
+static inline int
+__wt_eviction_dirty_target(WT_SESSION_IMPL *session)
+{
+ return (FLD_ISSET(
+ S2C(session)->cache->state, WT_EVICT_PASS_DIRTY) ? 1 : 0);
+}
+
+/*
* __wt_eviction_needed --
* Return if an application thread should do eviction, and the cache full
* percentage as a side-effect.
*/
-static inline int
-__wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp)
+static inline bool
+__wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp)
{
WT_CONNECTION_IMPL *conn;
WT_CACHE *cache;
uint64_t bytes_inuse, bytes_max;
- int pct_full;
+ u_int pct_full;
conn = S2C(session);
cache = conn->cache;
@@ -196,25 +177,20 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp)
bytes_max = conn->cache_size + 1;
/*
- * Return the cache full percentage; anything over 95% means we involve
- * the application thread.
+ * Calculate the cache full percentage; anything over the trigger means
+ * we involve the application thread.
*/
- pct_full = (int)((100 * bytes_inuse) / bytes_max);
+ pct_full = (u_int)((100 * bytes_inuse) / bytes_max);
if (pct_fullp != NULL)
*pct_fullp = pct_full;
- if (pct_full >= 95)
- return (1);
+ if (pct_full > cache->eviction_trigger)
+ return (true);
- /*
- * Return if we're over the trigger cache size or there are too many
- * dirty pages.
- */
- if (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100)
- return (1);
+ /* Return if there are too many dirty bytes in cache. */
if (__wt_cache_dirty_inuse(cache) >
(cache->eviction_dirty_trigger * bytes_max) / 100)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
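
A compact model of the revised trigger test, with the cache fields reduced to a toy struct; note the +1 on the divisor, kept from the original to avoid division by zero before the shared-cache size is set:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cache_cfg {
	uint64_t bytes_inuse;
	uint64_t cache_size;
	unsigned eviction_trigger;	/* percent, e.g. 95 */
};

static bool
eviction_needed(const struct cache_cfg *c, unsigned *pct_fullp)
{
	uint64_t bytes_max = c->cache_size + 1;
	unsigned pct_full = (unsigned)((100 * c->bytes_inuse) / bytes_max);

	if (pct_fullp != NULL)
		*pct_fullp = pct_full;	/* side-effect for the caller */
	return (pct_full > c->eviction_trigger);
}

int
main(void)
{
	struct cache_cfg c = { .bytes_inuse = 98, .cache_size = 100,
	    .eviction_trigger = 95 };
	unsigned pct;

	printf("needed=%d pct=%u\n", eviction_needed(&c, &pct), pct);
	return (0);
}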
/*
@@ -225,7 +201,7 @@ static inline int
__wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp)
{
WT_BTREE *btree;
- int pct_full;
+ u_int pct_full;
if (didworkp != NULL)
*didworkp = 0;
@@ -235,7 +211,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp)
* that case, or when holding the schema or handle list locks (which
	 * block eviction), we don't want to hijack the thread for eviction.
*/
- if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK |
+ if (F_ISSET(session, WT_SESSION_NO_EVICTION |
WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
return (0);
diff --git a/src/include/cell.i b/src/include/cell.i
index 20a4d214015..a517ac4a523 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -182,7 +182,7 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
p = cell->__chunk + 1;
- if (recno == 0)
+ if (recno == WT_RECNO_OOB)
cell->__chunk[0] = cell_type; /* Type */
else {
cell->__chunk[0] = cell_type | WT_CELL_64V;
@@ -547,7 +547,8 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
* Unpack a WT_CELL into a structure during verification.
*/
static inline int
-__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
+__wt_cell_unpack_safe(
+ WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *start, const void *end)
{
struct {
uint32_t len;
@@ -560,14 +561,15 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
copy.v = 0; /* -Werror=maybe-uninitialized */
/*
- * The verification code specifies an end argument, a pointer to 1 past
- * the end-of-page. In that case, make sure we don't go past the end
- * of the page when reading. If an error occurs, we simply return the
- * error code, the verification code takes care of complaining (and, in
- * the case of salvage, it won't complain at all, it's OK to fail).
+ * The verification code specifies start/end arguments, pointers to the
+ * start of the page and to 1 past the end-of-page. In which case, make
+ * sure all reads are inside the page image. If an error occurs, return
+ * an error code but don't output messages, our caller handles that.
*/
-#define WT_CELL_LEN_CHK(p, len) do { \
- if (end != NULL && (((uint8_t *)p) + (len)) > end) \
+#define WT_CELL_LEN_CHK(t, len) do { \
+ if (start != NULL && \
+ ((uint8_t *)t < (uint8_t *)start || \
+ (((uint8_t *)t) + (len)) > (uint8_t *)end)) \
return (WT_ERROR); \
} while (0)
@@ -630,7 +632,7 @@ restart:
*/
if (cell->__chunk[0] & WT_CELL_64V) /* skip value */
WT_RET(__wt_vunpack_uint(
- &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v));
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->v));
/*
* Handle special actions for a few different cell types and set the
@@ -647,7 +649,7 @@ restart:
* earlier cell.
*/
WT_RET(__wt_vunpack_uint(
- &p, end == NULL ? 0 : (size_t)(end - p), &v));
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
copy.len = WT_PTRDIFF32(p, cell);
copy.v = unpack->v;
cell = (WT_CELL *)((uint8_t *)cell - v);
@@ -675,7 +677,7 @@ restart:
* data.
*/
WT_RET(__wt_vunpack_uint(
- &p, end == NULL ? 0 : (size_t)(end - p), &v));
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
if (unpack->raw == WT_CELL_KEY ||
unpack->raw == WT_CELL_KEY_PFX ||
@@ -716,7 +718,7 @@ done: WT_CELL_LEN_CHK(cell, unpack->__len);
static inline void
__wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
{
- (void)__wt_cell_unpack_safe(cell, unpack, NULL);
+ (void)__wt_cell_unpack_safe(cell, unpack, NULL, NULL);
}
/*
diff --git a/src/include/connection.h b/src/include/connection.h
index cd55aadfc07..d8ff261cd82 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -38,8 +38,8 @@ struct __wt_keyed_encryptor {
size_t size_const; /* The result of the sizing callback */
WT_ENCRYPTOR *encryptor; /* User supplied callbacks */
/* Linked list of encryptors */
- SLIST_ENTRY(__wt_keyed_encryptor) hashl;
- SLIST_ENTRY(__wt_keyed_encryptor) l;
+ TAILQ_ENTRY(__wt_keyed_encryptor) hashq;
+ TAILQ_ENTRY(__wt_keyed_encryptor) q;
};
/*
@@ -82,9 +82,9 @@ struct __wt_named_encryptor {
const char *name; /* Name of encryptor */
WT_ENCRYPTOR *encryptor; /* User supplied callbacks */
/* Locked: list of encryptors by key */
- SLIST_HEAD(__wt_keyedhash, __wt_keyed_encryptor)
- keyedhashlh[WT_HASH_ARRAY_SIZE];
- SLIST_HEAD(__wt_keyed_lh, __wt_keyed_encryptor) keyedlh;
+ TAILQ_HEAD(__wt_keyedhash, __wt_keyed_encryptor)
+ keyedhashqh[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_keyed_qh, __wt_keyed_encryptor) keyedqh;
/* Linked list of encryptors */
TAILQ_ENTRY(__wt_named_encryptor) q;
};
@@ -100,10 +100,10 @@ struct __wt_named_extractor {
};
/*
- * Allocate some additional slots for internal sessions. There is a default
- * session for each connection, plus a session for each server thread.
+ * Allocate some additional slots for internal sessions so the user cannot
+ * configure too few sessions for us to run.
*/
-#define WT_NUM_INTERNAL_SESSIONS 10
+#define WT_EXTRA_INTERNAL_SESSIONS 10
/*
* WT_CONN_CHECK_PANIC --
@@ -119,14 +119,15 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \
- SLIST_INSERT_HEAD(&(conn)->dhlh, dhandle, l); \
- SLIST_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashl); \
+ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \
+ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \
+ ++conn->dhandle_count; \
} while (0)
#define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \
- SLIST_REMOVE(&(conn)->dhlh, dhandle, __wt_data_handle, l); \
- SLIST_REMOVE(&(conn)->dhhash[bucket], \
- dhandle, __wt_data_handle, hashl); \
+ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \
+ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \
+ --conn->dhandle_count; \
} while (0)
/*
@@ -134,14 +135,13 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_BLOCK_INSERT(conn, block, bucket) do { \
- SLIST_INSERT_HEAD(&(conn)->blocklh, block, l); \
- SLIST_INSERT_HEAD(&(conn)->blockhash[bucket], block, hashl); \
+ TAILQ_INSERT_HEAD(&(conn)->blockqh, block, q); \
+ TAILQ_INSERT_HEAD(&(conn)->blockhash[bucket], block, hashq); \
} while (0)
#define WT_CONN_BLOCK_REMOVE(conn, block, bucket) do { \
- SLIST_REMOVE(&(conn)->blocklh, block, __wt_block, l); \
- SLIST_REMOVE( \
- &(conn)->blockhash[bucket], block, __wt_block, hashl); \
+ TAILQ_REMOVE(&(conn)->blockqh, block, q); \
+ TAILQ_REMOVE(&(conn)->blockhash[bucket], block, hashq); \
} while (0)
/*
@@ -149,13 +149,13 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_FILE_INSERT(conn, fh, bucket) do { \
- SLIST_INSERT_HEAD(&(conn)->fhlh, fh, l); \
- SLIST_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashl); \
+ TAILQ_INSERT_HEAD(&(conn)->fhqh, fh, q); \
+ TAILQ_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashq); \
} while (0)
#define WT_CONN_FILE_REMOVE(conn, fh, bucket) do { \
- SLIST_REMOVE(&(conn)->fhlh, fh, __wt_fh, l); \
- SLIST_REMOVE(&(conn)->fhhash[bucket], fh, __wt_fh, hashl); \
+ TAILQ_REMOVE(&(conn)->fhqh, fh, q); \
+ TAILQ_REMOVE(&(conn)->fhhash[bucket], fh, hashq); \
} while (0)
/*
@@ -180,13 +180,17 @@ struct __wt_connection_impl {
WT_SPINLOCK table_lock; /* Table creation spinlock */
/*
- * We distribute the btree page locks across a set of spin locks; it
- * can't be an array, we impose cache-line alignment and gcc doesn't
- * support that for arrays. Don't use too many: they are only held for
- * very short operations, each one is 64 bytes, so 256 will fill the L1
- * cache on most CPUs.
+ * We distribute the btree page locks across a set of spin locks. Don't
+ * use too many: they are only held for very short operations, each one
+ * is 64 bytes, so 256 will fill the L1 cache on most CPUs.
+ *
+ * Use a prime number of buckets rather than assuming a good hash
+	 * (see Sedgewick, Algorithms in C, "Hash Functions").
+ *
+ * Note: this can't be an array, we impose cache-line alignment and gcc
+ * doesn't support that for arrays smaller than the alignment.
*/
-#define WT_PAGE_LOCKS(conn) 16
+#define WT_PAGE_LOCKS 17
WT_SPINLOCK *page_lock; /* Btree page spinlocks */
u_int page_lock_cnt; /* Next spinlock to use */
@@ -211,6 +215,8 @@ struct __wt_connection_impl {
WT_FH *lock_fh; /* Lock file handle */
volatile uint64_t split_gen; /* Generation number for splits */
+ uint64_t split_stashed_bytes; /* Atomic: split statistics */
+ uint64_t split_stashed_objects;
/*
* The connection keeps a cache of data handles. The set of handles
@@ -219,24 +225,26 @@ struct __wt_connection_impl {
* URI.
*/
/* Locked: data handle hash array */
- SLIST_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE];
/* Locked: data handle list */
- SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh;
+ TAILQ_HEAD(__wt_dhandle_qh, __wt_data_handle) dhqh;
/* Locked: LSM handle list. */
TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh;
/* Locked: file list */
- SLIST_HEAD(__wt_fhhash, __wt_fh) fhhash[WT_HASH_ARRAY_SIZE];
- SLIST_HEAD(__wt_fh_lh, __wt_fh) fhlh;
+ TAILQ_HEAD(__wt_fhhash, __wt_fh) fhhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
/* Locked: library list */
TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh;
WT_SPINLOCK block_lock; /* Locked: block manager list */
- SLIST_HEAD(__wt_blockhash, __wt_block) blockhash[WT_HASH_ARRAY_SIZE];
- SLIST_HEAD(__wt_block_lh, __wt_block) blocklh;
+ TAILQ_HEAD(__wt_blockhash, __wt_block) blockhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh;
+ u_int dhandle_count; /* Locked: handles in the queue */
u_int open_btree_count; /* Locked: open writable btree count */
uint32_t next_file_id; /* Locked: file ID counter */
uint32_t open_file_count; /* Atomic: open file handle count */
+ uint32_t open_cursor_count; /* Atomic: open cursor handle count */
/*
* WiredTiger allocates space for 50 simultaneous sessions (threads of
@@ -262,7 +270,9 @@ struct __wt_connection_impl {
uint32_t hazard_max; /* Hazard array size */
WT_CACHE *cache; /* Page cache */
- uint64_t cache_size; /* Configured cache size */
+ volatile uint64_t cache_size; /* Cache size (either statically
+ configured or the current size
+ within a cache pool). */
WT_TXN_GLOBAL txn_global; /* Global transaction state */
@@ -277,9 +287,12 @@ struct __wt_connection_impl {
#define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0)
wt_off_t ckpt_logsize; /* Checkpoint log size period */
uint32_t ckpt_signalled;/* Checkpoint signalled */
- uint64_t ckpt_usecs; /* Checkpoint period */
- int compact_in_memory_pass; /* Compaction serialization */
+ uint64_t ckpt_usecs; /* Checkpoint timer */
+ uint64_t ckpt_time_max; /* Checkpoint time min/max */
+ uint64_t ckpt_time_min;
+ uint64_t ckpt_time_recent; /* Checkpoint time recent/total */
+ uint64_t ckpt_time_total;
#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
@@ -289,7 +302,9 @@ struct __wt_connection_impl {
#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */
uint32_t stat_flags;
- WT_CONNECTION_STATS stats; /* Connection statistics */
+ /* Connection statistics */
+ WT_CONNECTION_STATS *stats[WT_COUNTER_SLOTS];
+ WT_CONNECTION_STATS stat_array[WT_COUNTER_SLOTS];
WT_ASYNC *async; /* Async structure */
int async_cfg; /* Global async configuration */
@@ -325,7 +340,8 @@ struct __wt_connection_impl {
#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */
-#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */
+#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */
+#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */
uint32_t log_flags; /* Global logging configuration */
WT_CONDVAR *log_cond; /* Log server wait mutex */
WT_SESSION_IMPL *log_session; /* Log server session */
@@ -354,6 +370,20 @@ struct __wt_connection_impl {
time_t sweep_interval;/* Handle sweep interval */
u_int sweep_handles_min;/* Handle sweep minimum open */
+ /*
+ * Shared lookaside lock, session and cursor, used by threads accessing
+ * the lookaside table (other than eviction server and worker threads
+ * and the sweep thread, all of which have their own lookaside cursors).
+ */
+ WT_SPINLOCK las_lock; /* Lookaside table spinlock */
+ WT_SESSION_IMPL *las_session; /* Lookaside table session */
+ WT_CURSOR *las_cursor; /* Lookaside table cursor */
+ bool las_written; /* Lookaside table has been written */
+
+ WT_ITEM las_sweep_key; /* Sweep server's saved key */
+ int las_sweep_call;/* Sweep server's call count */
+ uint64_t las_sweep_cnt; /* Sweep server's per-call row count */
+
/* Locked: collator list */
TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
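
A recurring pattern in this header is a structure linked into both a main queue and a hash bucket, with insert/remove macros keeping the two (and now a count, for the sweep server) in sync. The switch from SLIST to TAILQ makes removal O(1) instead of a list walk. A self-contained sketch of that bookkeeping with <sys/queue.h> TAILQs follows; the types, hash function, and bucket count are illustrative only.

#include <sys/queue.h>

#define NBUCKETS	17		/* Prime, per the comment above */

struct handle {
	const char *name;
	TAILQ_ENTRY(handle) q;		/* Main queue linkage */
	TAILQ_ENTRY(handle) hashq;	/* Hash bucket linkage */
};

static TAILQ_HEAD(, handle) mainq = TAILQ_HEAD_INITIALIZER(mainq);
static TAILQ_HEAD(, handle) hashqh[NBUCKETS];
static unsigned int handle_count;

static unsigned int
bucket(const char *name)
{
	unsigned int h;

	for (h = 0; *name != '\0'; ++name)
		h = h * 31 + (unsigned char)*name;
	return (h % NBUCKETS);
}

static void
handles_init(void)
{
	int i;

	for (i = 0; i < NBUCKETS; ++i)
		TAILQ_INIT(&hashqh[i]);
}

/* Insert/remove keep both queues and the count in sync. */
static void
handle_insert(struct handle *hp)
{
	TAILQ_INSERT_HEAD(&mainq, hp, q);
	TAILQ_INSERT_HEAD(&hashqh[bucket(hp->name)], hp, hashq);
	++handle_count;
}

static void
handle_remove(struct handle *hp)
{
	TAILQ_REMOVE(&mainq, hp, q);
	TAILQ_REMOVE(&hashqh[bucket(hp->name)], hp, hashq);
	--handle_count;
}
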
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 36f36f2c46c..2f55dfc8186 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -261,6 +261,7 @@ struct __wt_cursor_index {
WT_CURSOR *child;
WT_CURSOR **cg_cursors;
+ uint8_t *cg_needvalue;
};
struct __wt_cursor_json {
@@ -303,10 +304,10 @@ struct __wt_cursor_stat {
int notinitialized; /* Cursor not initialized */
int notpositioned; /* Cursor not positioned */
- WT_STATS *stats; /* Stats owned by the cursor */
- WT_STATS *stats_first; /* First stats reference */
- int stats_base; /* Base statistics value */
- int stats_count; /* Count of stats elements */
+ int64_t *stats; /* Statistics */
+ int stats_base; /* Base statistics value */
+ int stats_count; /* Count of statistics values */
+ const char *(*stats_desc)(int); /* Statistics descriptions */
union { /* Copies of the statistics */
WT_DSRC_STATS dsrc_stats;
@@ -325,12 +326,10 @@ struct __wt_cursor_stat {
/*
* WT_CURSOR_STATS --
- * Return a reference to a statistic cursor's stats structures; use the
- * WT_CURSOR.stats_first field instead of WT_CURSOR.stats because the latter
- * is NULL when non-cursor memory is used to hold the statistics.
+ * Return a reference to a statistic cursor's stats structures.
*/
#define WT_CURSOR_STATS(cursor) \
- (((WT_CURSOR_STAT *)cursor)->stats_first)
+ (((WT_CURSOR_STAT *)cursor)->stats)
struct __wt_cursor_table {
WT_CURSOR iface;
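
The statistics cursor now walks a bare array of int64_t counters and asks a callback for each slot's description, instead of carrying a structure of named fields. A toy version of that slot/description split (the statistic names here are hypothetical):

#include <inttypes.h>
#include <stdio.h>

#define STAT_CACHE_HITS		0
#define STAT_CACHE_MISSES	1
#define STAT_COUNT		2

/* Map a statistics slot to its printable description. */
static const char *
stats_desc(int slot)
{
	switch (slot) {
	case STAT_CACHE_HITS:	return ("cache: hits");
	case STAT_CACHE_MISSES:	return ("cache: misses");
	}
	return ("unknown");
}

int
main(void)
{
	int64_t stats[STAT_COUNT] = { 812, 44 };
	int i;

	/* A statistics-cursor walk pairs each value with its description. */
	for (i = 0; i < STAT_COUNT; ++i)
		printf("%s=%" PRId64 "\n", stats_desc(i), stats[i]);
	return (0);
}
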
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 9e592ede450..e7fed250251 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -32,7 +32,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt)
* and it's a minimal set of things we need to clear. It would be a
* lot simpler to clear everything, but we call this function a lot.
*/
- cbt->recno = 0;
+ cbt->recno = WT_RECNO_OOB;
cbt->ins = NULL;
cbt->ins_head = NULL;
@@ -150,7 +150,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
dhandle = session->dhandle;
/* If we open a handle with a time of death set, clear it. */
- if (WT_ATOMIC_ADD4(dhandle->session_inuse, 1) == 1 &&
+ if (__wt_atomic_addi32(&dhandle->session_inuse, 1) == 1 &&
dhandle->timeofdeath != 0)
dhandle->timeofdeath = 0;
}
@@ -168,7 +168,7 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session)
/* If we close a handle with a time of death set, clear it. */
WT_ASSERT(session, dhandle->session_inuse > 0);
- if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0 &&
+ if (__wt_atomic_subi32(&dhandle->session_inuse, 1) == 0 &&
dhandle->timeofdeath != 0)
dhandle->timeofdeath = 0;
}
@@ -187,6 +187,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
if (reenter)
WT_RET(__curfile_leave(cbt));
+ /*
+ * Any old insert position is now invalid. We rely on this being
+ * cleared to detect if a new skiplist is installed after a search.
+ */
+ cbt->ins_stack[0] = NULL;
+
/* If the transaction is idle, check that the cache isn't full. */
WT_RET(__wt_txn_idle_cache_check(session));
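
The in-use counter updates above rely on the atomic's return value to detect the first-user and last-user transitions, so the sweep server's time-of-death stamp is cleared exactly when a handle comes back into use. A sketch of that pattern using the GCC/clang __sync builtins the new __wt_atomic_* functions wrap (simplified structure, hypothetical names):

#include <stdint.h>
#include <time.h>

struct dhandle {
	int32_t session_inuse;		/* Concurrent users of the handle */
	time_t timeofdeath;		/* When the handle became idle */
};

/* First user in: a handle scheduled for sweeping gets a reprieve. */
static void
dhandle_incr_use(struct dhandle *d)
{
	if (__sync_add_and_fetch(&d->session_inuse, 1) == 1 &&
	    d->timeofdeath != 0)
		d->timeofdeath = 0;
}

/* Last user out: the same check on the way down. */
static void
dhandle_decr_use(struct dhandle *d)
{
	if (__sync_sub_and_fetch(&d->session_inuse, 1) == 0 &&
	    d->timeofdeath != 0)
		d->timeofdeath = 0;
}
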
diff --git a/src/include/dhandle.h b/src/include/dhandle.h
index d41631696b4..9a54b4ddb66 100644
--- a/src/include/dhandle.h
+++ b/src/include/dhandle.h
@@ -28,14 +28,19 @@
*/
#define WT_SAVE_DHANDLE(s, e) WT_WITH_DHANDLE(s, (s)->dhandle, e)
+/* Check if a handle is inactive. */
+#define WT_DHANDLE_INACTIVE(dhandle) \
+ (F_ISSET(dhandle, WT_DHANDLE_DEAD) || \
+ !F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN))
+
/*
* WT_DATA_HANDLE --
* A handle for a generic named data source.
*/
struct __wt_data_handle {
WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */
- SLIST_ENTRY(__wt_data_handle) l;
- SLIST_ENTRY(__wt_data_handle) hashl;
+ TAILQ_ENTRY(__wt_data_handle) q;
+ TAILQ_ENTRY(__wt_data_handle) hashq;
/*
* Sessions caching a connection's data handle will have a non-zero
@@ -64,7 +69,9 @@ struct __wt_data_handle {
*/
WT_SPINLOCK close_lock; /* Lock to close the handle */
- WT_DSRC_STATS stats; /* Data-source statistics */
+ /* Data-source statistics */
+ WT_DSRC_STATS *stats[WT_COUNTER_SLOTS];
+ WT_DSRC_STATS stat_array[WT_COUNTER_SLOTS];
/* Flags values over 0xff are reserved for WT_BTREE_* */
#define WT_DHANDLE_DEAD 0x01 /* Dead, awaiting discard */
diff --git a/src/include/error.h b/src/include/error.h
index fcb96b16361..abffc02945e 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -92,7 +92,8 @@
return (__wt_illegal_value(session, NULL))
#define WT_ILLEGAL_VALUE_ERR(session) \
default: \
- WT_ERR(__wt_illegal_value(session, NULL))
+ ret = __wt_illegal_value(session, NULL); \
+ goto err
#define WT_ILLEGAL_VALUE_SET(session) \
default: \
ret = __wt_illegal_value(session, NULL); \
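
WT_ILLEGAL_VALUE_ERR now expands to an explicit assignment plus goto rather than wrapping WT_ERR, keeping the expansion a plain statement sequence in a switch default. The underlying single-exit idiom, sketched standalone (a hypothetical function, not WiredTiger code):

#include <errno.h>

static int
apply_op(int op)
{
	int ret;

	switch (op) {
	case 0:
		/* ... the legal cases ... */
		break;
	default:			/* As the macro expands */
		ret = EINVAL;
		goto err;
	}

	return (0);			/* Success path */

err:	/* Shared cleanup runs for every error exit. */
	return (ret);
}
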
diff --git a/src/include/extern.h b/src/include/extern.h
index f0c1a0e310a..a8f11c8694f 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -63,7 +63,7 @@ extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max);
extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max);
extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
+extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]);
@@ -101,8 +101,9 @@ extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp);
extern int __wt_btcur_equals( WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp);
extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern void __wt_btcur_open(WT_CURSOR_BTREE *cbt);
-extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel);
extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v);
extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile);
@@ -115,12 +116,13 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
-extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages);
extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages);
+extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd);
extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
@@ -138,15 +140,15 @@ extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *add
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
+extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size);
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
, const char *file, int line
#endif
);
-extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
-extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
-extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
@@ -159,10 +161,10 @@ extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op);
extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
-extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok);
-extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf);
+extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok);
+extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf);
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
-extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove);
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
@@ -179,6 +181,14 @@ extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page,
extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_las_create(WT_SESSION_IMPL *session);
+extern int __wt_las_destroy(WT_SESSION_IMPL *session);
+extern void __wt_las_set_written(WT_SESSION_IMPL *session);
+extern bool __wt_las_is_written(WT_SESSION_IMPL *session);
+extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
+extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags);
+extern int __wt_las_sweep(WT_SESSION_IMPL *session);
extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
@@ -237,7 +247,7 @@ extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]);
-extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield);
+extern int __wt_log_wrlsn(WT_SESSION_IMPL *session);
extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_logmgr_open(WT_SESSION_IMPL *session);
extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
@@ -309,14 +319,14 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
extern int __wt_evict_create(WT_SESSION_IMPL *session);
extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
-extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp);
extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
-extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full);
-extern void __wt_cache_dump(WT_SESSION_IMPL *session);
+extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full);
+extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile);
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern int __wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec);
@@ -324,12 +334,14 @@ extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only);
extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
+extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot);
extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, int prealloc);
extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum);
extern int __wt_log_open(WT_SESSION_IMPL *session);
extern int __wt_log_close(WT_SESSION_IMPL *session);
-extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created);
+extern int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep);
extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie);
+extern int __wt_log_force_write(WT_SESSION_IMPL *session, int retry);
extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp);
@@ -355,14 +367,15 @@ extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logr
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_close( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced);
+extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int retry, int forced);
+extern int __wt_log_slot_new(WT_SESSION_IMPL *session);
extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
-extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp);
-extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
-extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot);
+extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size);
+extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
@@ -475,7 +488,7 @@ extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t siz
extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size);
extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp);
-extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs);
+extern int __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled);
extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name);
@@ -489,7 +502,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
extern int __wt_once(void (*init_routine)(void));
extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
-extern int __wt_absolute_path(const char *path);
+extern bool __wt_absolute_path(const char *path);
extern const char *__wt_path_separator(void);
extern int __wt_has_priv(void);
extern int __wt_remove(WT_SESSION_IMPL *session, const char *name);
@@ -577,6 +590,8 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f
extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers);
extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
+extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
@@ -639,7 +654,7 @@ extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, cons
extern uint32_t __wt_nlpo2_round(uint32_t v);
extern uint32_t __wt_nlpo2(uint32_t v);
extern uint32_t __wt_log2_int(uint32_t n);
-extern int __wt_ispo2(uint32_t v);
+extern bool __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
@@ -655,11 +670,19 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
extern void __wt_scr_discard(WT_SESSION_IMPL *session);
extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size);
extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p);
-extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats);
-extern void __wt_stat_refresh_dsrc_stats(void *stats_arg);
-extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent);
-extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
-extern void __wt_stat_refresh_connection_stats(void *stats_arg);
+extern const char *__wt_stat_dsrc_desc(int slot);
+extern void __wt_stat_dsrc_init_single(WT_DSRC_STATS *stats);
+extern void __wt_stat_dsrc_init(WT_DATA_HANDLE *handle);
+extern void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats);
+extern void __wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats);
+extern void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to);
+extern void __wt_stat_dsrc_aggregate( WT_DSRC_STATS **from, WT_DSRC_STATS *to);
+extern const char *__wt_stat_connection_desc(int slot);
+extern void __wt_stat_connection_init_single(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_connection_init(WT_CONNECTION_IMPL *handle);
+extern void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats);
+extern void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force);
diff --git a/src/include/flags.h b/src/include/flags.h
index 675ede9a8a0..ca3c3c38245 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -18,6 +18,8 @@
#define WT_CONN_SERVER_SWEEP 0x00002000
#define WT_CONN_WAS_BACKUP 0x00004000
#define WT_EVICTING 0x00000001
+#define WT_EVICT_LOOKASIDE 0x00000002
+#define WT_EVICT_UPDATE_RESTORE 0x00000004
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
#define WT_FILE_TYPE_DIRECTORY 0x00000004
@@ -42,27 +44,25 @@
#define WT_READ_WONT_NEED 0x00000100
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
-#define WT_SESSION_DISCARD_FORCE 0x00000004
-#define WT_SESSION_INTERNAL 0x00000008
-#define WT_SESSION_LOCKED_CHECKPOINT 0x00000010
-#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000020
-#define WT_SESSION_LOCKED_SCHEMA 0x00000040
+#define WT_SESSION_INTERNAL 0x00000004
+#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008
+#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010
+#define WT_SESSION_LOCKED_SCHEMA 0x00000020
+#define WT_SESSION_LOCKED_SLOT 0x00000040
#define WT_SESSION_LOCKED_TABLE 0x00000080
#define WT_SESSION_LOGGING_INMEM 0x00000100
-#define WT_SESSION_NO_CACHE 0x00000200
-#define WT_SESSION_NO_CACHE_CHECK 0x00000400
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200
+#define WT_SESSION_NO_CACHE 0x00000400
#define WT_SESSION_NO_DATA_HANDLES 0x00000800
-#define WT_SESSION_NO_LOGGING 0x00001000
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00002000
-#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00004000
-#define WT_SESSION_SERVER_ASYNC 0x00008000
-#define WT_SKIP_UPDATE_ERR 0x00000002
-#define WT_SKIP_UPDATE_RESTORE 0x00000004
+#define WT_SESSION_NO_EVICTION 0x00001000
+#define WT_SESSION_NO_LOGGING 0x00002000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000
+#define WT_SESSION_SERVER_ASYNC 0x00010000
#define WT_SYNC_CHECKPOINT 0x00000001
#define WT_SYNC_CLOSE 0x00000002
#define WT_SYNC_DISCARD 0x00000004
-#define WT_SYNC_DISCARD_FORCE 0x00000008
-#define WT_SYNC_WRITE_LEAVES 0x00000010
+#define WT_SYNC_WRITE_LEAVES 0x00000008
#define WT_TXN_LOG_CKPT_CLEANUP 0x00000001
#define WT_TXN_LOG_CKPT_PREPARE 0x00000002
#define WT_TXN_LOG_CKPT_START 0x00000004
@@ -92,6 +92,7 @@
#define WT_VERB_VERIFY 0x00200000
#define WT_VERB_VERSION 0x00400000
#define WT_VERB_WRITE 0x00800000
+#define WT_VISIBILITY_ERR 0x00000008
/*
* flags section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/gcc.h b/src/include/gcc.h
index 1c61768d372..01e33792d73 100644
--- a/src/include/gcc.h
+++ b/src/include/gcc.h
@@ -85,56 +85,71 @@
* In summary, locking > barriers > volatile.
*
* To avoid locking shared data structures such as statistics and to permit
- * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS
- * (compare and swap) operations.
+ * atomic state changes, we rely on the atomic-add and atomic-cas (compare and
+ * swap) operations.
*/
-#define __WT_ATOMIC_ADD(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val))
-#define __WT_ATOMIC_FETCH_ADD(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val))
+
#ifdef __clang__
/*
- * We avoid __sync_bool_compare_and_swap with due to problems with
- * optimization with some versions of clang. See
- * http://llvm.org/bugs/show_bug.cgi?id=21499 for details.
+ * We avoid __sync_bool_compare_and_swap due to problems with optimization
+ * in some versions of clang. See http://llvm.org/bugs/show_bug.cgi?id=21499
+ * for details.
*/
-#define __WT_ATOMIC_CAS(v, old, new, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- __sync_val_compare_and_swap(&(v), old, new) == (old))
+#define WT_ATOMIC_CAS(ptr, oldval, newval) \
+ (__sync_val_compare_and_swap(ptr, oldval, newval) == oldval)
#else
-#define __WT_ATOMIC_CAS(v, old, new, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- __sync_bool_compare_and_swap(&(v), old, new))
+#define WT_ATOMIC_CAS(ptr, oldval, newval) \
+ __sync_bool_compare_and_swap(ptr, oldval, newval)
#endif
-#define __WT_ATOMIC_STORE(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- __sync_lock_test_and_set(&(v), val))
-#define __WT_ATOMIC_SUB(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val))
-
-#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1)
-#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1)
-#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1)
-#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1)
-#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1)
-
-#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2)
-#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2)
-#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2)
-#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2)
-#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2)
-
-#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4)
-#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4)
-#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4)
-#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4)
-#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4)
-
-#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8)
-#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8)
-#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8)
-#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8)
-#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8)
+
+#define WT_ATOMIC_FUNC(name, ret, type) \
+static inline ret \
+__wt_atomic_add##name(type *vp, type v) \
+{ \
+ return (__sync_add_and_fetch(vp, v)); \
+} \
+static inline ret \
+__wt_atomic_fetch_add##name(type *vp, type v) \
+{ \
+ return (__sync_fetch_and_add(vp, v)); \
+} \
+static inline ret \
+__wt_atomic_store##name(type *vp, type v) \
+{ \
+ return (__sync_lock_test_and_set(vp, v)); \
+} \
+static inline ret \
+__wt_atomic_sub##name(type *vp, type v) \
+{ \
+ return (__sync_sub_and_fetch(vp, v)); \
+} \
+static inline bool \
+__wt_atomic_cas##name(type *vp, type old, type new) \
+{ \
+ return (WT_ATOMIC_CAS(vp, old, new)); \
+}
+
+WT_ATOMIC_FUNC(8, uint8_t, uint8_t)
+WT_ATOMIC_FUNC(16, uint16_t, uint16_t)
+WT_ATOMIC_FUNC(32, uint32_t, uint32_t)
+WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t)
+WT_ATOMIC_FUNC(i32, int32_t, int32_t)
+WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t)
+WT_ATOMIC_FUNC(64, uint64_t, uint64_t)
+WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t)
+WT_ATOMIC_FUNC(i64, int64_t, int64_t)
+WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t)
+WT_ATOMIC_FUNC(size, size_t, size_t)
+
+/*
+ * __wt_atomic_cas_ptr --
+ * Pointer compare and swap.
+ */
+static inline bool
+__wt_atomic_cas_ptr(void *vp, void *old, void *new)
+{
+ return (WT_ATOMIC_CAS((void **)vp, old, new));
+}
/* Compile read-write barrier */
#define WT_BARRIER() __asm__ volatile("" ::: "memory")
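
The gcc.h rewrite trades size-suffixed macros for one inline function family per type: the pointer argument's type now enforces the operand size, so the old WT_STATIC_ASSERT sizing checks disappear. The __sync builtins these functions wrap can be exercised in isolation; this is a sanity check of the builtins (GCC/clang only), not WiredTiger code.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t v = 10;
	uint64_t w = 0;

	assert(__sync_add_and_fetch(&v, 5) == 15);	/* add, new value */
	assert(__sync_fetch_and_add(&v, 1) == 15);	/* add, old value */
	assert(__sync_sub_and_fetch(&v, 6) == 10);	/* subtract, new value */

	/* Compare-and-swap succeeds only from the expected value. */
	assert(__sync_bool_compare_and_swap(&w, 0, 7));
	assert(!__sync_bool_compare_and_swap(&w, 0, 9));
	assert(w == 7);
	return (0);
}
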
diff --git a/src/include/hardware.h b/src/include/hardware.h
index e3c098826d0..32353072c5b 100644
--- a/src/include/hardware.h
+++ b/src/include/hardware.h
@@ -33,8 +33,8 @@
uint8_t __orig; \
do { \
__orig = (p)->flags_atomic; \
- } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
- __orig, __orig | (uint8_t)(mask))); \
+ } while (!__wt_atomic_cas8( \
+ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
} while (0)
#define F_CAS_ATOMIC(p, mask, ret) do { \
@@ -46,16 +46,30 @@
ret = EBUSY; \
break; \
} \
- } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
- __orig, __orig | (uint8_t)(mask))); \
+ } while (!__wt_atomic_cas8( \
+ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CAS_ATOMIC_WAIT(p, mask) do { \
+ int __ret; \
+ for (;;) { \
+ F_CAS_ATOMIC(p, mask, __ret); \
+ if (__ret == 0) \
+ break; \
+ __wt_yield(); \
+ } \
} while (0)
#define F_CLR_ATOMIC(p, mask) do { \
uint8_t __orig; \
do { \
__orig = (p)->flags_atomic; \
- } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
- __orig, __orig & ~(uint8_t)(mask))); \
+ } while (!__wt_atomic_cas8( \
+ &(p)->flags_atomic, __orig, __orig & ~(uint8_t)(mask))); \
} while (0)
#define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */
+#define WT_CACHE_LINE_ALIGNMENT_VERIFY(session, a) \
+ WT_ASSERT(session, \
+ WT_PTRDIFF(&(a)[1], &(a)[0]) >= WT_CACHE_LINE_ALIGNMENT && \
+ WT_PTRDIFF(&(a)[1], &(a)[0]) % WT_CACHE_LINE_ALIGNMENT == 0)
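
F_SET_ATOMIC and F_CLR_ATOMIC are read-modify-CAS loops over a one-byte flags field, and the new F_CAS_ATOMIC_WAIT simply retries with a yield until the flag wins. A compilable sketch of the set/clear loops using the __sync builtins directly (the structure name is hypothetical):

#include <stdint.h>

struct page {
	uint8_t flags_atomic;
};

static void
flag_set_atomic(struct page *p, uint8_t mask)
{
	uint8_t orig;

	do {		/* Retry until our read-modify-write wins */
		orig = p->flags_atomic;
	} while (!__sync_bool_compare_and_swap(
	    &p->flags_atomic, orig, (uint8_t)(orig | mask)));
}

static void
flag_clr_atomic(struct page *p, uint8_t mask)
{
	uint8_t orig;

	do {
		orig = p->flags_atomic;
	} while (!__sync_bool_compare_and_swap(
	    &p->flags_atomic, orig, (uint8_t)(orig & ~mask)));
}
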
diff --git a/src/include/lint.h b/src/include/lint.h
index 964aa5c118f..f288fb98683 100644
--- a/src/include/lint.h
+++ b/src/include/lint.h
@@ -18,40 +18,71 @@
#define WT_GCC_FUNC_ATTRIBUTE(x)
#define WT_GCC_FUNC_DECL_ATTRIBUTE(x)
-#define __WT_ATOMIC_ADD(v, val) \
- ((v) += (val))
-#define __WT_ATOMIC_FETCH_ADD(v, val) \
- ((v) += (val), (v))
-#define __WT_ATOMIC_CAS(v, old, new) \
- ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
-#define __WT_ATOMIC_STORE(v, val) \
- ((v) = (val))
-#define __WT_ATOMIC_SUB(v, val) \
- ((v) -= (val), (v))
-
-#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val)
-
-#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val)
-
-#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val)
-
-#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val)
+#define WT_ATOMIC_FUNC(name, ret, type) \
+static inline ret \
+__wt_atomic_add##name(type *vp, type v) \
+{ \
+ *vp += v; \
+ return (*vp); \
+} \
+static inline ret \
+__wt_atomic_fetch_add##name(type *vp, type v) \
+{ \
+ type orig; \
+ \
+	orig = *vp; \
+	*vp += v; \
+	return (orig); \
+} \
+static inline ret \
+__wt_atomic_store##name(type *vp, type v) \
+{ \
+ type orig; \
+ \
+ orig = *vp; \
+ *vp = v; \
+	return (orig); \
+} \
+static inline ret \
+__wt_atomic_sub##name(type *vp, type v) \
+{ \
+ *vp -= v; \
+ return (*vp); \
+} \
+static inline bool \
+__wt_atomic_cas##name(type *vp, type old, type new) \
+{ \
+ if (*vp == old) { \
+ *vp = new; \
+ return (true); \
+ } \
+ return (false); \
+}
+
+WT_ATOMIC_FUNC(8, uint8_t, uint8_t)
+WT_ATOMIC_FUNC(16, uint16_t, uint16_t)
+WT_ATOMIC_FUNC(32, uint32_t, uint32_t)
+WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t)
+WT_ATOMIC_FUNC(i32, int32_t, int32_t)
+WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t)
+WT_ATOMIC_FUNC(64, uint64_t, uint64_t)
+WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t)
+WT_ATOMIC_FUNC(i64, int64_t, int64_t)
+WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t)
+WT_ATOMIC_FUNC(size, size_t, size_t)
+
+/*
+ * __wt_atomic_cas_ptr --
+ * Pointer compare and swap.
+ */
+static inline bool
+__wt_atomic_cas_ptr(void *vp, void *old, void *new)
+{
+ if (*(void **)vp == old) {
+ *(void **)vp = new;
+ return (true);
+ }
+ return (false);
+}
static inline void WT_BARRIER(void) { return; }
static inline void WT_FULL_BARRIER(void) { return; }
diff --git a/src/include/log.h b/src/include/log.h
index fbb0a3e3842..06be95697c7 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -12,7 +12,6 @@
/* Logging subsystem declarations. */
#define WT_LOG_ALIGN 128
-#define WT_LOG_SLOT_BUF_SIZE 256 * 1024
#define WT_INIT_LSN(l) do { \
(l)->file = 1; \
@@ -48,67 +47,136 @@
((size) - offsetof(WT_LOG_RECORD, record))
/*
- * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1
- * and 1 if lsn0 > lsn1.
- */
-#define WT_LOG_CMP(lsn1, lsn2) \
- ((lsn1)->file != (lsn2)->file ? \
- ((lsn1)->file < (lsn2)->file ? -1 : 1) : \
- ((lsn1)->offset != (lsn2)->offset ? \
- ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0))
-
-/*
* Possible values for the consolidation array slot states:
- * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.)
*
- * < WT_LOG_SLOT_DONE - threads are actively writing to the log.
- * WT_LOG_SLOT_DONE - all activity on this slot is complete.
+ * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins.
* WT_LOG_SLOT_FREE - slot is available for allocation.
- * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
* WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
- * WT_LOG_SLOT_READY - slot is ready for threads to join.
- * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
*
* The slot state must be volatile: threads loop checking the state and can't
* cache the first value they see.
+ *
+ * The slot state is divided into two 32-bit halves: one half is the
+ * amount joined and the other is the amount released. Since we use a
+ * few special states, we reserve the top few bits for state, which
+ * makes the maximum size less than 32 bits for both joined and released.
+ */
+
+/*
+ * The high bit is reserved for the special states. If the high bit is
+ * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state.
+ */
+#define WT_LOG_SLOT_FREE -1 /* Not in use */
+#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */
+
+/*
+ * We allocate the full buffer size, but trigger a slot switch when we
+ * cross half the buffer size. If a record is larger than the buffer
+ * maximum then we trigger a slot switch and write that record unbuffered.
+ * We use a larger buffer to provide overflow space so that we can switch
+ * once we cross the threshold.
+ */
+#define WT_LOG_SLOT_BUF_SIZE (256 * 1024) /* Must be power of 2 */
+#define WT_LOG_SLOT_BUF_MAX ((uint32_t)log->slot_buf_size / 2)
+#define WT_LOG_SLOT_UNBUFFERED (WT_LOG_SLOT_BUF_SIZE << 1)
+
+/*
+ * If new slot states are added, adjust WT_LOG_SLOT_BITS and
+ * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32
+ * bits we are using. More slot states here will reduce the maximum
+ * size that a slot can hold unbuffered by half. If a record is
+ * larger than the maximum we can account for in the slot state we fall
+ * back to direct writes.
+ */
+#define WT_LOG_SLOT_BITS 2
+#define WT_LOG_SLOT_MAXBITS (32 - WT_LOG_SLOT_BITS)
+#define WT_LOG_SLOT_CLOSE 0x4000000000000000LL /* Force slot close */
+#define WT_LOG_SLOT_RESERVED 0x8000000000000000LL /* Reserved states */
+
+/*
+ * Check if the unbuffered flag is set in the joined portion of
+ * the slot state.
*/
-#define WT_LOG_SLOT_DONE 0
-#define WT_LOG_SLOT_FREE 1
-#define WT_LOG_SLOT_PENDING 2
-#define WT_LOG_SLOT_WRITTEN 3
-#define WT_LOG_SLOT_READY 4
-typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct {
+#define WT_LOG_SLOT_UNBUFFERED_ISSET(state) \
+ ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32))
+
+#define WT_LOG_SLOT_MASK_OFF 0x3fffffffffffffffLL
+#define WT_LOG_SLOT_MASK_ON ~(WT_LOG_SLOT_MASK_OFF)
+#define WT_LOG_SLOT_JOIN_MASK (WT_LOG_SLOT_MASK_OFF >> 32)
+
+/*
+ * These macros manipulate the slot state and its component parts.
+ */
+#define WT_LOG_SLOT_FLAGS(state) ((state) & WT_LOG_SLOT_MASK_ON)
+#define WT_LOG_SLOT_JOINED(state) (((state) & WT_LOG_SLOT_MASK_OFF) >> 32)
+#define WT_LOG_SLOT_JOINED_BUFFERED(state) \
+ (WT_LOG_SLOT_JOINED(state) & \
+ (WT_LOG_SLOT_UNBUFFERED - 1))
+#define WT_LOG_SLOT_JOIN_REL(j, r, s) (((j) << 32) + (r) + (s))
+#define WT_LOG_SLOT_RELEASED(state) ((int64_t)(int32_t)(state))
+#define WT_LOG_SLOT_RELEASED_BUFFERED(state) \
+ ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) & \
+ (WT_LOG_SLOT_UNBUFFERED - 1)))
+
+/* Slot is in use */
+#define WT_LOG_SLOT_ACTIVE(state) \
+ (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK)
+/* Slot is in use, but closed to new joins */
+#define WT_LOG_SLOT_CLOSED(state) \
+ (WT_LOG_SLOT_ACTIVE(state) && \
+	    (FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) &&	\
+	    !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_RESERVED)))
+/* Slot is in use, all data copied into buffer */
+#define WT_LOG_SLOT_INPROGRESS(state) \
+ (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state))
+#define WT_LOG_SLOT_DONE(state) \
+ (WT_LOG_SLOT_CLOSED(state) && \
+ !WT_LOG_SLOT_INPROGRESS(state))
+/* Slot is in use, more threads may join this slot */
+#define WT_LOG_SLOT_OPEN(state) \
+ (WT_LOG_SLOT_ACTIVE(state) && \
+ !WT_LOG_SLOT_UNBUFFERED_ISSET(state) && \
+ !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \
+ WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX)
+
+struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot {
volatile int64_t slot_state; /* Slot state */
- uint64_t slot_group_size; /* Group size */
+ int64_t slot_unbuffered; /* Unbuffered data in this slot */
int32_t slot_error; /* Error value */
-#define WT_SLOT_INVALID_INDEX 0xffffffff
- uint32_t slot_index; /* Active slot index */
wt_off_t slot_start_offset; /* Starting file offset */
- WT_LSN slot_release_lsn; /* Slot release LSN */
- WT_LSN slot_start_lsn; /* Slot starting LSN */
- WT_LSN slot_end_lsn; /* Slot ending LSN */
+ wt_off_t slot_last_offset; /* Last record offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
WT_FH *slot_fh; /* File handle for this group */
- WT_ITEM slot_buf; /* Buffer for grouped writes */
- int32_t slot_churn; /* Active slots are scarce. */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
-#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */
-#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */
-#define WT_SLOT_SYNC 0x04 /* Needs sync on release */
-#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */
+#define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */
+#define WT_SLOT_SYNC 0x02 /* Needs sync on release */
+#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */
uint32_t flags; /* Flags */
-} WT_LOGSLOT;
+};
-#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED)
+#define WT_SLOT_INIT_FLAGS 0
-typedef struct {
- WT_LOGSLOT *slot;
- wt_off_t offset;
-} WT_MYSLOT;
+#define WT_WITH_SLOT_LOCK(session, log, op) do { \
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \
+ WT_WITH_LOCK(session, \
+ &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \
+} while (0)
+
+struct __wt_myslot {
+ WT_LOGSLOT *slot; /* Slot I'm using */
+ wt_off_t end_offset; /* My end offset in buffer */
+ wt_off_t offset; /* Slot buffer offset */
+#define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */
+#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */
+ uint32_t flags; /* Flags */
+};
- /* Offset of first record */
#define WT_LOG_FIRST_RECORD log->allocsize
-typedef struct {
+struct __wt_log {
uint32_t allocsize; /* Allocation alignment size */
wt_off_t log_written; /* Amount of log written this period */
/*
@@ -119,8 +187,9 @@ typedef struct {
uint32_t tmp_fileid; /* Temporary file number */
uint32_t prep_missed; /* Pre-allocated file misses */
WT_FH *log_fh; /* Logging file handle */
- WT_FH *log_close_fh; /* Logging file handle to close */
WT_FH *log_dir_fh; /* Log directory file handle */
+ WT_FH *log_close_fh; /* Logging file handle to close */
+ WT_LSN log_close_lsn; /* LSN needed to close */
/*
* System LSNs
@@ -141,8 +210,9 @@ typedef struct {
WT_SPINLOCK log_lock; /* Locked: Logging fields */
WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */
+ WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */
- WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
+ WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
/* Notify any waiting threads when sync_lsn is updated. */
WT_CONDVAR *log_sync_cond;
@@ -151,22 +221,25 @@ typedef struct {
/*
* Consolidation array information
- * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL.
* Our testing shows that the more consolidation we generate the
	 * better the performance we see, which equates to an active slot
	 * count of one.
+ *
+ * Note: this can't be an array, we impose cache-line alignment and
+ * gcc doesn't support that for arrays.
*/
-#define WT_SLOT_ACTIVE 1
#define WT_SLOT_POOL 128
- WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */
+ WT_LOGSLOT *active_slot; /* Active slot */
WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
size_t slot_buf_size; /* Buffer size for slots */
+#ifdef HAVE_DIAGNOSTIC
+ uint64_t write_calls; /* Calls to log_write */
+#endif
-#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
uint32_t flags;
-} WT_LOG;
+};
-typedef struct {
+struct __wt_log_record {
uint32_t len; /* 00-03: Record length including hdr */
uint32_t checksum; /* 04-07: Checksum of the record */
@@ -176,7 +249,7 @@ typedef struct {
uint8_t unused[2]; /* 10-11: Padding */
uint32_t mem_len; /* 12-15: Uncompressed len if needed */
uint8_t record[0]; /* Beginning of actual data */
-} WT_LOG_RECORD;
+};
/*
* WT_LOG_DESC --
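
The new slot state packs two counters into a single 64-bit word updated atomically: bytes joined in the upper half, bytes released in the lower, with the top two bits reserved for the CLOSE and RESERVED states. The packing arithmetic can be checked in isolation; the harness below mirrors the JOINED/RELEASED/JOIN_REL macros under simplified names and is not the logging code itself.

#include <assert.h>
#include <stdint.h>

#define SLOT_MASK_OFF		0x3fffffffffffffffLL
#define SLOT_JOINED(s)		((((int64_t)(s)) & SLOT_MASK_OFF) >> 32)
#define SLOT_RELEASED(s)	((int64_t)(int32_t)(s))
#define SLOT_JOIN_REL(j, r, s)	(((j) << 32) + (r) + (s))

int
main(void)
{
	int64_t state = 0;

	/* Two writers join 100 and 50 bytes of log records. */
	state = SLOT_JOIN_REL((int64_t)100, 0, state);
	state = SLOT_JOIN_REL((int64_t)50, 0, state);
	assert(SLOT_JOINED(state) == 150);
	assert(SLOT_RELEASED(state) == 0);	/* Copies still outstanding */

	/* Both writers finish copying and release their bytes. */
	state = SLOT_JOIN_REL((int64_t)0, 150, state);
	assert(SLOT_JOINED(state) == SLOT_RELEASED(state));

	/* joined == released: all data copied, slot can be processed. */
	return (0);
}
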
diff --git a/src/include/log.i b/src/include/log.i
new file mode 100644
index 00000000000..ff309c31265
--- /dev/null
+++ b/src/include/log.i
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+static inline int __wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2);
+
+/*
+ * __wt_log_cmp --
+ * Compare 2 LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2
+ * and 1 if lsn1 > lsn2.
+ */
+static inline int
+__wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2)
+{
+ WT_LSN l1, l2;
+
+ /*
+ * Read LSNs into local variables so that we only read each field
+ * once and all comparisons are on the same values.
+ */
+ l1 = *(volatile WT_LSN *)lsn1;
+ l2 = *(volatile WT_LSN *)lsn2;
+
+ /*
+ * If the file numbers are different we don't need to compare the
+ * offset.
+ */
+ if (l1.file != l2.file)
+ return (l1.file < l2.file ? -1 : 1);
+ /*
+ * If the file numbers are the same, compare the offset.
+ */
+ if (l1.offset != l2.offset)
+ return (l1.offset < l2.offset ? -1 : 1);
+ return (0);
+}
diff --git a/src/include/lsm.h b/src/include/lsm.h
index 08e57794fb8..11cf8204aec 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -74,7 +74,7 @@ struct __wt_cursor_lsm {
* WT_LSM_CHUNK --
* A single chunk (file) in an LSM tree.
*/
-struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_lsm_chunk {
+struct __wt_lsm_chunk {
const char *uri; /* Data source for this chunk */
const char *bloom_uri; /* URI of Bloom filter, if any */
struct timespec create_ts; /* Creation time (for rate limiting) */
@@ -177,16 +177,14 @@ struct __wt_lsm_tree {
const char *collator_name;
int collator_owned;
- int refcnt; /* Number of users of the tree */
- int8_t exclusive; /* Tree is locked exclusively */
+ uint32_t refcnt; /* Number of users of the tree */
+ uint8_t exclusive; /* Tree is locked exclusively */
#define LSM_TREE_MAX_QUEUE 100
- int queue_ref;
+ uint32_t queue_ref;
WT_RWLOCK *rwlock;
TAILQ_ENTRY(__wt_lsm_tree) q;
- WT_DSRC_STATS stats; /* LSM-level statistics */
-
uint64_t dsk_gen;
uint64_t ckpt_throttle; /* Rate limiting due to checkpoints */
@@ -221,9 +219,28 @@ struct __wt_lsm_tree {
WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
size_t old_alloc; /* Space allocated for old chunks */
u_int nold_chunks; /* Number of old chunks */
- int freeing_old_chunks; /* Whether chunks are being freed */
+ uint32_t freeing_old_chunks; /* Whether chunks are being freed */
uint32_t merge_aggressiveness; /* Increase amount of work per merge */
+ /*
+ * We maintain a set of statistics outside of the normal statistics
+ * area, copying them into place when a statistics cursor is created.
+ */
+#define WT_LSM_TREE_STAT_INCR(session, fld) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ ++(fld); \
+} while (0)
+#define WT_LSM_TREE_STAT_INCRV(session, fld, v) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ (fld) += (int64_t)(v); \
+} while (0)
+ int64_t bloom_false_positive;
+ int64_t bloom_hit;
+ int64_t bloom_miss;
+ int64_t lsm_checkpoint_throttle;
+ int64_t lsm_lookup_no_bloom;
+ int64_t lsm_merge_throttle;
+
#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02 /* Timer for merge aggression */
#define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */
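
Keeping the hot LSM counters outside the shared statistics array avoids contention on every Bloom-filter probe; the values are only folded into the regular statistics when a cursor asks for them. A sketch of the guarded-increment pattern (the flag value and names are illustrative):

#include <stdint.h>

#define STAT_FAST	0x04		/* "fast" statistics configured */
static uint32_t stat_flags;

/* Increment a tree-local counter only when fast stats are on. */
#define TREE_STAT_INCR(fld) do {					\
	if ((stat_flags & STAT_FAST) != 0)				\
		++(fld);						\
} while (0)

struct lsm_tree {
	int64_t bloom_hit;
	int64_t bloom_miss;
};

static void
bloom_lookup_done(struct lsm_tree *t, int hit)
{
	if (hit)
		TREE_STAT_INCR(t->bloom_hit);
	else
		TREE_STAT_INCR(t->bloom_miss);
}
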
diff --git a/src/include/meta.h b/src/include/meta.h
index 66547262417..a5a303f1630 100644
--- a/src/include/meta.h
+++ b/src/include/meta.h
@@ -21,7 +21,9 @@
#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
#define WT_METADATA_URI "metadata:" /* Metadata alias */
-#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */
+#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */
+
+#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI */
/*
 * Precomputed hash for the metadata file. Used to optimize comparisons
diff --git a/src/include/misc.h b/src/include/misc.h
index 7fb6ae13d38..1b2cbf11fc2 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -130,6 +130,7 @@
#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask)))
+#define FLD64_ISSET(field, mask) ((field) & ((uint64_t)(mask)))
#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
/*
diff --git a/src/include/misc.i b/src/include/misc.i
index 98facff02b9..6b502c4c1d1 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -7,6 +7,18 @@
*/
/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+static inline int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+{
+ int notused;
+
+ return (__wt_cond_wait_signal(session, cond, usecs, &notused));
+}
+
+/*
* __wt_strdup --
* ANSI strdup function.
*/
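A sketch of a caller of the new wrapper above; it waits with a timeout and ignores whether the wakeup came from a signal or the timeout. The function and condition-variable names are hypothetical; the WT types are those from the surrounding headers:

/*
 * Illustrative only: wait up to 10 milliseconds for work to arrive; any
 * error from the underlying wait is returned to the caller.
 */
static int
example_wait_for_work(WT_SESSION_IMPL *session, WT_CONDVAR *work_cond)
{
	return (__wt_cond_wait(session, work_cond, 10000));
}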
diff --git a/src/include/msvc.h b/src/include/msvc.h
index bc72ddf8193..8f5aa9abde8 100644
--- a/src/include/msvc.h
+++ b/src/include/msvc.h
@@ -31,52 +31,56 @@
#define WT_GCC_FUNC_ATTRIBUTE(x)
#define WT_GCC_FUNC_DECL_ATTRIBUTE(x)
-#define __WT_ATOMIC_ADD(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val))
-#define __WT_ATOMIC_FETCH_ADD(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)))
-#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedCompareExchange ## s \
- ((t*)&(v), (t)(new), (t)(old)) == (t)(old))
-#define __WT_ATOMIC_STORE(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchange ## s((t*)&(v), (t)(val)))
-#define __WT_ATOMIC_SUB(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchangeAdd ## s((t*)&(v), -(t) val) - (val))
+#define WT_ATOMIC_FUNC(name, ret, type, s, t) \
+static inline ret \
+__wt_atomic_add##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v)) + (v)); \
+} \
+static inline ret \
+__wt_atomic_fetch_add##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v))); \
+} \
+static inline ret \
+__wt_atomic_store##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchange ## s((t *)(vp), (t)(v))); \
+} \
+static inline ret \
+__wt_atomic_sub##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \
+} \
+static inline bool \
+__wt_atomic_cas##name(type *vp, type old, type new) \
+{ \
+ return (_InterlockedCompareExchange ## s \
+ ((t *)(vp), (t)(new), (t)(old)) == (t)(old)); \
+}
-#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char)
-#define WT_ATOMIC_FETCH_ADD1(v, val) \
- __WT_ATOMIC_FETCH_ADD(v, val, 1, 8, char)
-#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char)
-#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char)
-#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char)
+WT_ATOMIC_FUNC(8, uint8_t, uint8_t, 8, char)
+WT_ATOMIC_FUNC(16, uint16_t, uint16_t, 16, short)
+WT_ATOMIC_FUNC(32, uint32_t, uint32_t, , long)
+WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t, , long)
+WT_ATOMIC_FUNC(i32, int32_t, int32_t, , long)
+WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t, , long)
+WT_ATOMIC_FUNC(64, uint64_t, uint64_t, 64, __int64)
+WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t, 64, __int64)
+WT_ATOMIC_FUNC(i64, int64_t, int64_t, 64, __int64)
+WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t, 64, __int64)
+WT_ATOMIC_FUNC(size, size_t, size_t, 64, __int64)
-#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short)
-#define WT_ATOMIC_FETCH_ADD2(v, val) \
- __WT_ATOMIC_FETCH_ADD(v, val, 2, 16, short)
-#define WT_ATOMIC_CAS2(v, old, new) \
- __WT_ATOMIC_CAS(v, old, new, 2, 16, short)
-#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short)
-#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short)
-
-#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long)
-#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4, , long)
-#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long)
-#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long)
-#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long)
-
-#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64)
-#define WT_ATOMIC_FETCH_ADD8(v, val) \
- __WT_ATOMIC_FETCH_ADD(v, val, 8, 64, __int64)
-#define WT_ATOMIC_CAS8(v, old, new) \
- __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64)
-#define WT_ATOMIC_STORE8(v, val) \
- __WT_ATOMIC_STORE(v, val, 8, 64, __int64)
-#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64)
+/*
+ * __wt_atomic_cas_ptr --
+ * Pointer compare and swap.
+ */
+static inline bool
+__wt_atomic_cas_ptr(void *vp, void *old, void *new)
+{
+ return (_InterlockedCompareExchange64(
+ vp, (int64_t)new, (int64_t)old) == ((int64_t)old));
+}
static inline void WT_BARRIER(void) { _ReadWriteBarrier(); }
static inline void WT_FULL_BARRIER(void) { _mm_mfence(); }
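The rewrite above replaces the per-size macros with a generator macro that stamps out typed inline functions, so callers get ordinary C type checking rather than a size assertion. A minimal standalone sketch of the same generator pattern, using the GCC/Clang __sync builtins in place of the _Interlocked intrinsics (the ex_* names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Stamp out typed atomic helpers, mirroring the WT_ATOMIC_FUNC pattern. */
#define EX_ATOMIC_FUNC(name, type)					\
static inline type							\
ex_atomic_add##name(type *vp, type v)					\
{									\
	return (__sync_add_and_fetch(vp, v));				\
}									\
static inline bool							\
ex_atomic_cas##name(type *vp, type old, type new)			\
{									\
	return (__sync_bool_compare_and_swap(vp, old, new));		\
}

EX_ATOMIC_FUNC(32, uint32_t)
EX_ATOMIC_FUNC(64, uint64_t)

With this shape, passing a pointer of the wrong width to ex_atomic_add32 is a compile-time diagnostic, where an untyped macro would silently cast.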
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 7a5028d6a28..1f1bb8f4b5c 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -24,24 +24,20 @@ struct __wt_condvar {
/*
* !!!
- * Don't touch this structure without understanding the read/write
- * locking functions.
+ * Don't modify this structure without understanding the read/write locking
+ * functions.
*/
-typedef union { /* Read/write lock */
-#ifdef WORDS_BIGENDIAN
- WiredTiger read/write locks require modification for big-endian systems.
-#else
+typedef union { /* Read/write lock */
uint64_t u;
struct {
- uint32_t us;
+ uint32_t wr; /* Writers and readers */
} i;
struct {
- uint16_t writers;
- uint16_t readers;
- uint16_t users;
- uint16_t pad;
+ uint16_t writers; /* Now serving for writers */
+ uint16_t readers; /* Now serving for readers */
+ uint16_t users; /* Next available ticket number */
+ uint16_t __notused; /* Padding */
} s;
-#endif
} wt_rwlock_t;
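The union above is a ticket lock: users hands out tickets, and writers/readers are the "now serving" counters. A simplified standalone sketch of read-lock acquisition under that discipline (illustrative only: no memory barriers, pause instructions, or back-off):

#include <stdint.h>

typedef union {
	uint64_t u;
	struct {
		volatile uint16_t writers;	/* Now serving for writers */
		volatile uint16_t readers;	/* Now serving for readers */
		volatile uint16_t users;	/* Next available ticket */
		uint16_t __notused;		/* Padding */
	} s;
} ex_rwlock_t;

static void
ex_readlock(ex_rwlock_t *l)
{
	uint16_t ticket;

	/* Atomically take the next ticket number. */
	ticket = __sync_fetch_and_add(&l->s.users, 1);

	/* Spin until the read "now serving" counter reaches our ticket. */
	while (ticket != l->s.readers)
		;

	/*
	 * Only the thread whose ticket is being served reaches this store,
	 * so a plain increment is safe; it admits the next waiting reader
	 * (readers share the lock).
	 */
	++l->s.readers;
}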
/*
@@ -69,20 +65,21 @@ struct __wt_rwlock {
#if SPINLOCK_TYPE == SPINLOCK_GCC
-typedef volatile int WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT)
- WT_SPINLOCK;
+struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock {
+ volatile int lock;
+};
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
SPINLOCK_TYPE == SPINLOCK_MSVC
-typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct {
+struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock {
wt_mutex_t lock;
const char *name; /* Statistics: mutex name */
int8_t initialized; /* Lock initialized, for cleanup */
-} WT_SPINLOCK;
+};
#else
diff --git a/src/include/mutex.i b/src/include/mutex.i
index 8bca50635e6..5ea4583a2ab 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -31,7 +31,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
WT_UNUSED(session);
WT_UNUSED(name);
- *(t) = 0;
+ t->lock = 0;
return (0);
}
@@ -44,7 +44,7 @@ __wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- *(t) = 0;
+ t->lock = 0;
}
/*
@@ -56,7 +56,7 @@ __wt_spin_trylock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- return (__sync_lock_test_and_set(t, 1) == 0 ? 0 : EBUSY);
+ return (__sync_lock_test_and_set(&t->lock, 1) == 0 ? 0 : EBUSY);
}
/*
@@ -70,10 +70,10 @@ __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
WT_UNUSED(session);
- while (__sync_lock_test_and_set(t, 1)) {
- for (i = 0; *t && i < WT_SPIN_COUNT; i++)
+ while (__sync_lock_test_and_set(&t->lock, 1)) {
+ for (i = 0; t->lock && i < WT_SPIN_COUNT; i++)
WT_PAUSE();
- if (*t)
+ if (t->lock)
__wt_yield();
}
}
@@ -87,7 +87,7 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- __sync_lock_release(t);
+ __sync_lock_release(&t->lock);
}
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
diff --git a/src/include/os.h b/src/include/os.h
index ba5d95657d5..518b124f547 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -56,7 +56,7 @@ typedef enum {
case EMFILE: \
case ENFILE: \
case ENOSPC: \
- __wt_sleep(0L, 500000L); \
+ __wt_sleep(0L, 50000L); \
continue; \
default: \
break; \
@@ -77,8 +77,8 @@ typedef enum {
struct __wt_fh {
char *name; /* File name */
uint64_t name_hash; /* Hash of name */
- SLIST_ENTRY(__wt_fh) l; /* List of open handles */
- SLIST_ENTRY(__wt_fh) hashl; /* Hashed list of handles */
+ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
+ TAILQ_ENTRY(__wt_fh) hashq; /* Hashed list of handles */
u_int ref; /* Reference count */
diff --git a/src/include/queue.h b/src/include/queue.h
index 42e736e7b09..1d494875cf6 100644
--- a/src/include/queue.h
+++ b/src/include/queue.h
@@ -38,71 +38,17 @@ extern "C" {
#endif
/*
- * This file defines four types of data structures: singly-linked lists,
- * singly-linked tail queues, lists and tail queues.
+ * WiredTiger only uses the TAILQ macros (we've gotten into trouble in the past
+ * by trying to use simpler queues: a list we didn't think would ever get large
+ * could, under some workloads, become large, and the linear cost of removing
+ * elements with the simpler macros proved to be more trouble than the memory
+ * savings were worth).
*
- * A singly-linked list is headed by a single forward pointer. The elements
- * are singly linked for minimum space and pointer manipulation overhead at
- * the expense of O(n) removal for arbitrary elements. New elements can be
- * added to the list after an existing element or at the head of the list.
- * Elements being removed from the head of the list should use the explicit
- * macro for this purpose for optimum efficiency. A singly-linked list may
- * only be traversed in the forward direction. Singly-linked lists are ideal
- * for applications with large datasets and few or no removals or for
- * implementing a LIFO queue.
+ * Additionally, we've altered the TAILQ_INSERT_XXX macros to include a write
+ * barrier, in order to ensure we never insert a partially built structure onto
+ * a list (this is required because the spinlocks we use don't necessarily imply
+ * a write barrier).
*
- * A singly-linked tail queue is headed by a pair of pointers, one to the
- * head of the list and the other to the tail of the list. The elements are
- * singly linked for minimum space and pointer manipulation overhead at the
- * expense of O(n) removal for arbitrary elements. New elements can be added
- * to the list after an existing element, at the head of the list, or at the
- * end of the list. Elements being removed from the head of the tail queue
- * should use the explicit macro for this purpose for optimum efficiency.
- * A singly-linked tail queue may only be traversed in the forward direction.
- * Singly-linked tail queues are ideal for applications with large datasets
- * and few or no removals or for implementing a FIFO queue.
- *
- * A list is headed by a single forward pointer (or an array of forward
- * pointers for a hash table header). The elements are doubly linked
- * so that an arbitrary element can be removed without a need to
- * traverse the list. New elements can be added to the list before
- * or after an existing element or at the head of the list. A list
- * may only be traversed in the forward direction.
- *
- * A tail queue is headed by a pair of pointers, one to the head of the
- * list and the other to the tail of the list. The elements are doubly
- * linked so that an arbitrary element can be removed without a need to
- * traverse the list. New elements can be added to the list before or
- * after an existing element, at the head of the list, or at the end of
- * the list. A tail queue may be traversed in either direction.
- *
- * For details on the use of these macros, see the queue(3) manual page.
- *
- *
- * SLIST LIST STAILQ TAILQ
- * _HEAD + + + +
- * _HEAD_INITIALIZER + + + +
- * _ENTRY + + + +
- * _INIT + + + +
- * _EMPTY + + + +
- * _FIRST + + + +
- * _NEXT + + + +
- * _PREV - - - +
- * _LAST - - + +
- * _FOREACH + + + +
- * _FOREACH_REVERSE - - - +
- * _INSERT_HEAD + + + +
- * _INSERT_BEFORE - + - +
- * _INSERT_AFTER + + + +
- * _INSERT_TAIL - - + +
- * _CONCAT - - + +
- * _REMOVE_HEAD + - + -
- * _REMOVE + + + +
- *
- */
-
-/*
- * XXX
* We #undef all of the macros because there are incompatible versions of this
* file and these macros on various systems. What makes the problem worse is
* they are included and/or defined by system include files which we may have
@@ -111,50 +57,7 @@ extern "C" {
* several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
* same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
*/
-#undef LIST_EMPTY
-#undef LIST_ENTRY
-#undef LIST_FIRST
-#undef LIST_FOREACH
-#undef LIST_HEAD
-#undef LIST_HEAD_INITIALIZER
-#undef LIST_INIT
-#undef LIST_INSERT_AFTER
-#undef LIST_INSERT_BEFORE
-#undef LIST_INSERT_HEAD
-#undef LIST_NEXT
-#undef LIST_REMOVE
-#undef QMD_TRACE_ELEM
-#undef QMD_TRACE_HEAD
-#undef QUEUE_MACRO_DEBUG
-#undef SLIST_EMPTY
-#undef SLIST_ENTRY
-#undef SLIST_FIRST
-#undef SLIST_FOREACH
-#undef SLIST_FOREACH_PREVPTR
-#undef SLIST_HEAD
-#undef SLIST_HEAD_INITIALIZER
-#undef SLIST_INIT
-#undef SLIST_INSERT_AFTER
-#undef SLIST_INSERT_HEAD
-#undef SLIST_NEXT
-#undef SLIST_REMOVE
-#undef SLIST_REMOVE_HEAD
-#undef STAILQ_CONCAT
-#undef STAILQ_EMPTY
-#undef STAILQ_ENTRY
-#undef STAILQ_FIRST
-#undef STAILQ_FOREACH
-#undef STAILQ_HEAD
-#undef STAILQ_HEAD_INITIALIZER
-#undef STAILQ_INIT
-#undef STAILQ_INSERT_AFTER
-#undef STAILQ_INSERT_HEAD
-#undef STAILQ_INSERT_TAIL
-#undef STAILQ_LAST
-#undef STAILQ_NEXT
-#undef STAILQ_REMOVE
-#undef STAILQ_REMOVE_HEAD
-#undef STAILQ_REMOVE_HEAD_UNTIL
+
#undef TAILQ_CONCAT
#undef TAILQ_EMPTY
#undef TAILQ_ENTRY
@@ -210,230 +113,6 @@ struct qm_trace {
#endif /* QUEUE_MACRO_DEBUG */
/*
- * Singly-linked List declarations.
- */
-#define SLIST_HEAD(name, type) \
-struct name { \
- struct type *slh_first; /* first element */ \
-}
-
-#define SLIST_HEAD_INITIALIZER(head) \
- { NULL }
-
-#define SLIST_ENTRY(type) \
-struct { \
- struct type *sle_next; /* next element */ \
-}
-
-/*
- * Singly-linked List functions.
- */
-#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
-
-#define SLIST_FIRST(head) ((head)->slh_first)
-
-#define SLIST_FOREACH(var, head, field) \
- for ((var) = SLIST_FIRST((head)); \
- (var); \
- (var) = SLIST_NEXT((var), field))
-
-#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
- for ((varp) = &SLIST_FIRST((head)); \
- ((var) = *(varp)) != NULL; \
- (varp) = &SLIST_NEXT((var), field))
-
-#define SLIST_INIT(head) do { \
- SLIST_FIRST((head)) = NULL; \
-} while (0)
-
-#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
- SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
- SLIST_NEXT((slistelm), field) = (elm); \
-} while (0)
-
-#define SLIST_INSERT_HEAD(head, elm, field) do { \
- SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
- SLIST_FIRST((head)) = (elm); \
-} while (0)
-
-#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
-
-#define SLIST_REMOVE(head, elm, type, field) do { \
- if (SLIST_FIRST((head)) == (elm)) { \
- SLIST_REMOVE_HEAD((head), field); \
- } \
- else { \
- struct type *curelm = SLIST_FIRST((head)); \
- while (SLIST_NEXT(curelm, field) != (elm)) \
- curelm = SLIST_NEXT(curelm, field); \
- SLIST_NEXT(curelm, field) = \
- SLIST_NEXT(SLIST_NEXT(curelm, field), field); \
- } \
-} while (0)
-
-#define SLIST_REMOVE_HEAD(head, field) do { \
- SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
-} while (0)
-
-/*
- * Singly-linked Tail queue declarations.
- */
-#define STAILQ_HEAD(name, type) \
-struct name { \
- struct type *stqh_first;/* first element */ \
- struct type **stqh_last;/* addr of last next element */ \
-}
-
-#define STAILQ_HEAD_INITIALIZER(head) \
- { NULL, &(head).stqh_first }
-
-#define STAILQ_ENTRY(type) \
-struct { \
- struct type *stqe_next; /* next element */ \
-}
-
-/*
- * Singly-linked Tail queue functions.
- */
-#define STAILQ_CONCAT(head1, head2) do { \
- if (!STAILQ_EMPTY((head2))) { \
- *(head1)->stqh_last = (head2)->stqh_first; \
- (head1)->stqh_last = (head2)->stqh_last; \
- STAILQ_INIT((head2)); \
- } \
-} while (0)
-
-#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
-
-#define STAILQ_FIRST(head) ((head)->stqh_first)
-
-#define STAILQ_FOREACH(var, head, field) \
- for ((var) = STAILQ_FIRST((head)); \
- (var); \
- (var) = STAILQ_NEXT((var), field))
-
-#define STAILQ_INIT(head) do { \
- STAILQ_FIRST((head)) = NULL; \
- (head)->stqh_last = &STAILQ_FIRST((head)); \
-} while (0)
-
-#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
- if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
- (head)->stqh_last = &STAILQ_NEXT((elm), field); \
- STAILQ_NEXT((tqelm), field) = (elm); \
-} while (0)
-
-#define STAILQ_INSERT_HEAD(head, elm, field) do { \
- if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
- (head)->stqh_last = &STAILQ_NEXT((elm), field); \
- STAILQ_FIRST((head)) = (elm); \
-} while (0)
-
-#define STAILQ_INSERT_TAIL(head, elm, field) do { \
- STAILQ_NEXT((elm), field) = NULL; \
- *(head)->stqh_last = (elm); \
- (head)->stqh_last = &STAILQ_NEXT((elm), field); \
-} while (0)
-
-#define STAILQ_LAST(head, type, field) \
- (STAILQ_EMPTY((head)) ? \
- NULL : \
- ((struct type *) \
- ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
-
-#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
-
-#define STAILQ_REMOVE(head, elm, type, field) do { \
- if (STAILQ_FIRST((head)) == (elm)) { \
- STAILQ_REMOVE_HEAD((head), field); \
- } \
- else { \
- struct type *curelm = STAILQ_FIRST((head)); \
- while (STAILQ_NEXT(curelm, field) != (elm)) \
- curelm = STAILQ_NEXT(curelm, field); \
- if ((STAILQ_NEXT(curelm, field) = \
- STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
- (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
- } \
-} while (0)
-
-#define STAILQ_REMOVE_HEAD(head, field) do { \
- if ((STAILQ_FIRST((head)) = \
- STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
- (head)->stqh_last = &STAILQ_FIRST((head)); \
-} while (0)
-
-#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
- if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
- (head)->stqh_last = &STAILQ_FIRST((head)); \
-} while (0)
-
-/*
- * List declarations.
- */
-#define LIST_HEAD(name, type) \
-struct name { \
- struct type *lh_first; /* first element */ \
-}
-
-#define LIST_HEAD_INITIALIZER(head) \
- { NULL }
-
-#define LIST_ENTRY(type) \
-struct { \
- struct type *le_next; /* next element */ \
- struct type **le_prev; /* address of previous next element */ \
-}
-
-/*
- * List functions.
- */
-
-#define LIST_EMPTY(head) ((head)->lh_first == NULL)
-
-#define LIST_FIRST(head) ((head)->lh_first)
-
-#define LIST_FOREACH(var, head, field) \
- for ((var) = LIST_FIRST((head)); \
- (var); \
- (var) = LIST_NEXT((var), field))
-
-#define LIST_INIT(head) do { \
- LIST_FIRST((head)) = NULL; \
-} while (0)
-
-#define LIST_INSERT_AFTER(listelm, elm, field) do { \
- if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
- LIST_NEXT((listelm), field)->field.le_prev = \
- &LIST_NEXT((elm), field); \
- LIST_NEXT((listelm), field) = (elm); \
- (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
-} while (0)
-
-#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
- (elm)->field.le_prev = (listelm)->field.le_prev; \
- LIST_NEXT((elm), field) = (listelm); \
- *(listelm)->field.le_prev = (elm); \
- (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
-} while (0)
-
-#define LIST_INSERT_HEAD(head, elm, field) do { \
- if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
- LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
- LIST_FIRST((head)) = (elm); \
- (elm)->field.le_prev = &LIST_FIRST((head)); \
-} while (0)
-
-#define LIST_NEXT(elm, field) ((elm)->field.le_next)
-
-#define LIST_REMOVE(elm, field) do { \
- if (LIST_NEXT((elm), field) != NULL) \
- LIST_NEXT((elm), field)->field.le_prev = \
- (elm)->field.le_prev; \
- *(elm)->field.le_prev = LIST_NEXT((elm), field); \
-} while (0)
-
-/*
* Tail queue declarations.
*/
#define TAILQ_HEAD(name, type) \
@@ -488,6 +167,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ WT_WRITE_BARRIER(); \
if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
TAILQ_NEXT((elm), field)->field.tqe_prev = \
&TAILQ_NEXT((elm), field); \
@@ -502,6 +182,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ WT_WRITE_BARRIER(); \
(elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
TAILQ_NEXT((elm), field) = (listelm); \
*(listelm)->field.tqe_prev = (elm); \
@@ -511,6 +192,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ WT_WRITE_BARRIER(); \
if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
TAILQ_FIRST((head))->field.tqe_prev = \
&TAILQ_NEXT((elm), field); \
@@ -523,6 +205,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ WT_WRITE_BARRIER(); \
TAILQ_NEXT((elm), field) = NULL; \
(elm)->field.tqe_prev = (head)->tqh_last; \
*(head)->tqh_last = (elm); \
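A standalone sketch of the failure mode the added WT_WRITE_BARRIER calls prevent, reduced to a singly-linked list for brevity (the ex_* names are illustrative, and __sync_synchronize stands in for the barrier). WiredTiger places the barrier at the top of each TAILQ_INSERT_XXX macro because the element is fully built before the macro runs:

#include <stdint.h>

struct ex_elem {
	uint64_t payload;		/* Built before publication */
	struct ex_elem *next;
};

/*
 * Without the barrier, the store publishing the element (*headp = e) may
 * become visible before the stores that built it; a lock-free reader
 * following the head pointer could then observe an uninitialized payload.
 */
static void
ex_insert_head(struct ex_elem **headp, struct ex_elem *e, uint64_t v)
{
	e->payload = v;			/* Build the structure... */
	e->next = *headp;
	__sync_synchronize();		/* ...flush it before publishing... */
	*headp = e;			/* ...then make it reachable. */
}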
diff --git a/src/include/schema.h b/src/include/schema.h
index 8f4884281cd..0664af5adba 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -62,8 +62,8 @@ struct __wt_table {
WT_INDEX **indices;
size_t idx_alloc;
- SLIST_ENTRY(__wt_table) l;
- SLIST_ENTRY(__wt_table) hashl;
+ TAILQ_ENTRY(__wt_table) q;
+ TAILQ_ENTRY(__wt_table) hashq;
int cg_complete, idx_complete, is_simple;
u_int ncolgroups, nindices, nkey_columns;
diff --git a/src/include/serial.i b/src/include/serial.i
index 9e6b0f7916c..d90b29c2133 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -30,11 +30,11 @@ __page_write_gen_wrapped_check(WT_PAGE *page)
}
/*
- * __insert_serial_func --
- * Worker function to add a WT_INSERT entry to a skiplist.
+ * __insert_simple_func --
+ * Worker function to add a WT_INSERT entry to the middle of a skiplist.
*/
static inline int
-__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
+__insert_simple_func(WT_SESSION_IMPL *session,
WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
{
u_int i;
@@ -42,31 +42,62 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
WT_UNUSED(session);
/*
- * Confirm we are still in the expected position, and no item has been
- * added where our insert belongs. Take extra care at the beginning
- * and end of the list (at each level): retry if we race there.
+ * Update the skiplist elements referencing the new WT_INSERT item.
+	 * If we fail to connect one of the upper levels in the skiplist,
+ * return success: the levels we updated are correct and sufficient.
+ * Even though we don't get the benefit of the memory we allocated,
+ * we can't roll back.
*
- * !!!
- * Note the test for ins_stack[0] == NULL: that's the test for an
- * uninitialized cursor, ins_stack[0] is cleared as part of
- * initializing a cursor for a search.
+ * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here; our callers depend on
+	 * it. Don't pass complex arguments to the macro: some implementations
+ * read the old value multiple times.
*/
for (i = 0; i < skipdepth; i++) {
- if (ins_stack[i] == NULL ||
- *ins_stack[i] != new_ins->next[i])
- return (WT_RESTART);
- if (new_ins->next[i] == NULL &&
- ins_head->tail[i] != NULL &&
- ins_stack[i] != &ins_head->tail[i]->next[i])
- return (WT_RESTART);
+ WT_INSERT *old_ins = *ins_stack[i];
+ if (old_ins != new_ins->next[i] ||
+ !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
+ return (i == 0 ? WT_RESTART : 0);
}
- /* Update the skiplist elements referencing the new WT_INSERT item. */
+ return (0);
+}
+
+/*
+ * __insert_serial_func --
+ * Worker function to add a WT_INSERT entry to a skiplist.
+ */
+static inline int
+__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
+{
+ u_int i;
+
+ /* The cursor should be positioned. */
+ WT_ASSERT(session, ins_stack[0] != NULL);
+
+ /*
+ * Update the skiplist elements referencing the new WT_INSERT item.
+ *
+ * Confirm we are still in the expected position, and no item has been
+	 * added where our insert belongs. If we fail to connect one of the
+ * upper levels in the skiplist, return success: the levels we updated
+ * are correct and sufficient. Even though we don't get the benefit of
+ * the memory we allocated, we can't roll back.
+ *
+ * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here; our callers depend on
+	 * it. Don't pass complex arguments to the macro: some implementations
+ * read the old value multiple times.
+ */
for (i = 0; i < skipdepth; i++) {
+ WT_INSERT *old_ins = *ins_stack[i];
+ if (old_ins != new_ins->next[i] ||
+ !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
+ return (i == 0 ? WT_RESTART : 0);
if (ins_head->tail[i] == NULL ||
ins_stack[i] == &ins_head->tail[i]->next[i])
ins_head->tail[i] = new_ins;
- *ins_stack[i] = new_ins;
}
return (0);
@@ -92,7 +123,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
* If the application didn't specify a record number, allocate a new one
* and set up for an append.
*/
- if ((recno = WT_INSERT_RECNO(new_ins)) == 0) {
+ if ((recno = WT_INSERT_RECNO(new_ins)) == WT_RECNO_OOB) {
recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1;
WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL ||
recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head)));
@@ -128,20 +159,20 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
- /* Clear references to memory we now own. */
- *new_insp = NULL;
-
/* Check for page write generation wrap. */
WT_RET(__page_write_gen_wrapped_check(page));
+ /* Clear references to memory we now own and must free on error. */
+ *new_insp = NULL;
+
/* Acquire the page's spinlock, call the worker function. */
WT_PAGE_LOCK(session, page);
ret = __col_append_serial_func(
session, ins_head, ins_stack, new_ins, recnop, skipdepth);
WT_PAGE_UNLOCK(session, page);
- /* Free unused memory on error. */
if (ret != 0) {
+ /* Free unused memory on error. */
__wt_free(session, new_ins);
return (ret);
}
@@ -171,21 +202,32 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
{
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
-
- /* Clear references to memory we now own. */
- *new_insp = NULL;
+ int simple;
+ u_int i;
/* Check for page write generation wrap. */
WT_RET(__page_write_gen_wrapped_check(page));
- /* Acquire the page's spinlock, call the worker function. */
- WT_PAGE_LOCK(session, page);
- ret = __insert_serial_func(
- session, ins_head, ins_stack, new_ins, skipdepth);
- WT_PAGE_UNLOCK(session, page);
+ /* Clear references to memory we now own and must free on error. */
+ *new_insp = NULL;
+
+ simple = 1;
+ for (i = 0; i < skipdepth; i++)
+ if (new_ins->next[i] == NULL)
+ simple = 0;
+
+ if (simple)
+ ret = __insert_simple_func(
+ session, ins_stack, new_ins, skipdepth);
+ else {
+ WT_PAGE_LOCK(session, page);
+ ret = __insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+ }
- /* Free unused memory on error. */
if (ret != 0) {
+ /* Free unused memory on error. */
__wt_free(session, new_ins);
return (ret);
}
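The lock-free path above comes down to one compare-and-swap per skiplist level. A reduced, single-level sketch of the pattern (illustrative types; __sync_bool_compare_and_swap stands in for __wt_atomic_cas_ptr):

#include <stdbool.h>
#include <stddef.h>

struct ex_ins {
	struct ex_ins *next[1];		/* One level, for brevity */
};

/*
 * ins_stack points at the "next" slot found by the preceding search, and
 * new_ins->next[0] holds the expected successor.  If another thread has
 * changed the slot since the search, the CAS fails and the caller either
 * restarts (level 0) or stops early (upper levels are best-effort).
 */
static bool
ex_link_level(struct ex_ins **ins_stack, struct ex_ins *new_ins)
{
	struct ex_ins *old_ins;

	old_ins = *ins_stack;
	if (old_ins != new_ins->next[0])
		return (false);
	return (__sync_bool_compare_and_swap(ins_stack, old_ins, new_ins));
}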
@@ -215,26 +257,27 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_DECL_RET;
WT_UPDATE *obsolete, *upd = *updp;
- /* Clear references to memory we now own. */
- *updp = NULL;
-
/* Check for page write generation wrap. */
WT_RET(__page_write_gen_wrapped_check(page));
+ /* Clear references to memory we now own and must free on error. */
+ *updp = NULL;
+
/*
+ * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here; our callers depend on
+ * it.
+ *
* Swap the update into place. If that fails, a new update was added
- * after our search, we raced. Check if our update is still permitted,
- * and if it is, do a full-barrier to ensure the update's next pointer
- * is set before we update the linked list and try again.
+	 * after our search and we raced; check if our update is still permitted.
*/
- while (!WT_ATOMIC_CAS8(*srch_upd, upd->next, upd)) {
+ while (!__wt_atomic_cas_ptr(srch_upd, upd->next, upd)) {
if ((ret = __wt_txn_update_check(
session, upd->next = *srch_upd)) != 0) {
/* Free unused memory on error. */
__wt_free(session, upd);
return (ret);
}
- WT_WRITE_BARRIER();
}
/*
@@ -249,25 +292,37 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
__wt_page_modify_set(session, page);
/*
- * If there are subsequent WT_UPDATE structures, we're evicting pages
- * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE
- * structures. Serialization is needed so only one thread does the
- * obsolete check at a time, and to protect updates from disappearing
- * under reconciliation.
+	 * If there are no subsequent WT_UPDATE structures, we are done here.
*/
- if (upd->next != NULL &&
- __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- /* If we can't lock it, don't scan, that's okay. */
- if (ret != 0)
- return (0);
- obsolete = __wt_update_obsolete_check(session, page, upd->next);
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (obsolete != NULL) {
+ if (upd->next == NULL)
+ return (0);
+ /*
+	 * We would like to call __wt_txn_update_oldest only when there are
+	 * further updates to this page; the check against WT_TXN_NONE is used
+	 * as an indicator that further updates exist on this page.
+ */
+ if (page->modify->obsolete_check_txn != WT_TXN_NONE) {
+ if (!__wt_txn_visible_all(session,
+ page->modify->obsolete_check_txn)) {
+ /* Try to move the oldest ID forward and re-check */
+			__wt_txn_update_oldest(session, 0);
+ }
+ if (!__wt_txn_visible_all(session,
+ page->modify->obsolete_check_txn)) {
page->modify->obsolete_check_txn = WT_TXN_NONE;
- __wt_update_obsolete_free(session, page, obsolete);
+ return (0);
}
}
+ F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
+
+ /* If we can't lock it, don't scan, that's okay. */
+ if (ret != 0)
+ return (0);
+ obsolete = __wt_update_obsolete_check(session, page, upd->next);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+	if (obsolete != NULL)
+		__wt_update_obsolete_free(session, page, obsolete);
return (0);
}
diff --git a/src/include/session.h b/src/include/session.h
index f32da177bf9..a691794fd46 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -14,8 +14,8 @@
struct __wt_data_handle_cache {
WT_DATA_HANDLE *dhandle;
- SLIST_ENTRY(__wt_data_handle_cache) l;
- SLIST_ENTRY(__wt_data_handle_cache) hashl;
+ TAILQ_ENTRY(__wt_data_handle_cache) q;
+ TAILQ_ENTRY(__wt_data_handle_cache) hashq;
};
/*
@@ -66,7 +66,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
* across session close - so it is declared further down.
*/
/* Session handle reference list */
- SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
+ TAILQ_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
time_t last_sweep; /* Last sweep for dead handles */
WT_CURSOR *cursor; /* Current cursor */
@@ -76,6 +76,11 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
WT_COMPACT *compact; /* Compact state */
+ /*
+	 * Lookaside table cursor (sweep and eviction worker threads only).
+ */
+ WT_CURSOR *las_cursor; /* Lookaside table cursor */
+
WT_DATA_HANDLE *meta_dhandle; /* Metadata file */
void *meta_track; /* Metadata operation tracking */
void *meta_track_next; /* Current position */
@@ -90,7 +95,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
* table of lists. The hash table list is kept in allocated memory
* that lives across session close - so it is declared further down.
*/
- SLIST_HEAD(__tables, __wt_table) tables;
+ TAILQ_HEAD(__tables, __wt_table) tables;
WT_ITEM **scratch; /* Temporary memory for any function */
u_int scratch_alloc; /* Currently allocated */
@@ -151,9 +156,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
WT_RAND_STATE rnd; /* Random number generation state */
/* Hashed handle reference list array */
- SLIST_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash;
+ TAILQ_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash;
/* Hashed table reference list array */
- SLIST_HEAD(__tables_hash, __wt_table) *tablehash;
+ TAILQ_HEAD(__tables_hash, __wt_table) *tablehash;
/*
* Splits can "free" memory that may still be in use, and we use a
diff --git a/src/include/stat.h b/src/include/stat.h
index 6dc9282a613..cd2c149bc94 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -6,122 +6,217 @@
* See the file LICENSE for redistribution information.
*/
-struct __wt_stats {
- const char *desc; /* text description */
- uint64_t v; /* 64-bit value */
-};
+/*
+ * Statistics counters:
+ *
+ * We use an array of statistics structures; threads write different structures
+ * to avoid writing the same cache line and incurring cache coherency overheads,
+ * which can dramatically slow otherwise fast, read-mostly workloads.
+ *
+ * With an 8B statistics value and 64B cache-line alignment, 8 values share the
+ * same cache line. There are collisions when different threads choose the same
+ * statistics structure and update values that live on the cache line. There is
+ * likely some locality, however: a thread updating the cursor-search statistic
+ * will probably update other cursor statistics too, with a chance of hitting
+ * already-cached values.
+ *
+ * The actual statistic value must be signed, because one thread might increment
+ * the value in its structure, and then another thread might decrement the same
+ * value in another structure (where the value was initially zero), so the value
+ * in the second thread's slot will go negative.
+ *
+ * When reading a statistics value, the array values are summed and returned to
+ * the caller. The summation is performed without locking, so the value read
+ * may be inconsistent (and might be negative, if increments/decrements race
+ * with the reader).
+ *
+ * Choosing how many structures isn't easy: obviously, a smaller number creates
+ * more conflicts while a larger number uses more memory.
+ *
+ * Ideally, if the application is CPU-intensive and using all of the CPUs on
+ * the system, we want to use the same number of slots as there are
+ * CPUs (because their L1 caches are the units of coherency). However, in
+ * practice we cannot easily determine how many CPUs are actually available to
+ * the application.
+ *
+ * Our next best option is to use the number of threads in the application as a
+ * heuristic for the number of CPUs (presumably, the application architect has
+ * figured out how many CPUs are available). However, inside WiredTiger we don't
+ * know when the application creates its threads.
+ *
+ * For now, we use a fixed number of slots. Ideally, we would approximate the
+ * largest number of cores we expect on any machine where WiredTiger might be
+ * run; however, we don't want to waste that much memory on smaller machines.
+ * As of 2015, machines with more than 24 CPUs are relatively rare.
+ *
+ * As with a hash table, use a prime number of slots rather than assuming a
+ * good hash (Reference Sedgewick, Algorithms in C, "Hash Functions").
+ */
+#define WT_COUNTER_SLOTS 23
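A minimal standalone model of the scheme described above: writers touch only their own slot, readers sum across the slots without locking and clamp racy negative results (the ex_* names are illustrative):

#include <stdint.h>

#define EX_COUNTER_SLOTS	23

struct ex_stats {
	int64_t cursor_search;		/* 8 values share a cache line */
	int64_t cursor_insert;
};

/* One structure per slot; different threads write different slots. */
static struct ex_stats ex_slots[EX_COUNTER_SLOTS];

/* Writers update only the slot chosen for the calling thread. */
static inline void
ex_incr_cursor_search(uint32_t slot_id)
{
	++ex_slots[slot_id % EX_COUNTER_SLOTS].cursor_search;
}

/* Readers sum the slots without locking; the result is approximate. */
static int64_t
ex_read_cursor_search(void)
{
	int64_t sum;
	int i;

	for (sum = 0, i = 0; i < EX_COUNTER_SLOTS; i++)
		sum += ex_slots[i].cursor_search;
	return (sum < 0 ? 0 : sum);	/* Clamp racy negative sums */
}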
/*
- * Read/write statistics without any test for statistics configuration.
+ * WT_STATS_SLOT_ID is the thread's slot ID for the array of structures.
+ *
+ * Ideally, we want a slot per CPU, and we want each thread to index the slot
+ * corresponding to the CPU it runs on. Unfortunately, getting the ID of the
+ * current CPU is difficult: some operating systems provide a system call to
+ * acquire a CPU ID, but not all (regardless, making a system call to increment
+ * a statistics value is far too expensive).
+ *
+ * Our second-best option is to use the thread ID. Unfortunately, there is no
+ * portable way to obtain a unique thread ID that's a small-enough number to
+ * be used as an array index (portable thread IDs are usually a pointer or an
+ * opaque chunk, not a simple integer).
+ *
+ * Our solution is to use the session ID; there is normally a session per thread
+ * and the session ID is a small, monotonically increasing number.
*/
-#define WT_STAT(stats, fld) \
- ((stats)->fld.v)
-#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \
- (void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \
-} while (0)
-#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1)
-#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \
- (void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \
-} while (0)
-#define WT_STAT_ATOMIC_INCR(stats, fld) WT_STAT_ATOMIC_INCRV(stats, fld, 1)
-#define WT_STAT_DECRV(stats, fld, value) do { \
- (stats)->fld.v -= (value); \
-} while (0)
-#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1)
-#define WT_STAT_INCRV(stats, fld, value) do { \
- (stats)->fld.v += (value); \
-} while (0)
-#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1)
-#define WT_STAT_SET(stats, fld, value) do { \
- (stats)->fld.v = (uint64_t)(value); \
-} while (0)
+#define WT_STATS_SLOT_ID(session) \
+	(((session)->id) % WT_COUNTER_SLOTS)
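So a per-session increment under this scheme indexes the array of structures by session, for example (an illustrative fragment, matching the WT_STAT_INCRV definition later in this file):

	S2C(session)->stats[WT_STATS_SLOT_ID(session)]->cursor_search++;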
/*
- * Read/write statistics if "fast" statistics are configured.
+ * Statistics structures are arrays of int64_t's. We have functions to
+ * read/write those structures regardless of the specific statistics structure
+ * we're working with, by translating field names to structure offsets.
+ *
+ * Translate a statistic's value name to an offset.
*/
-#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \
- if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_ATOMIC_DECRV(stats, fld, value); \
-} while (0)
-#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \
- WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1)
-#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \
- if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_ATOMIC_INCRV(stats, fld, value); \
+#define WT_STATS_FIELD_TO_SLOT(stats, fld) \
+ (int)(&(stats)[0]->fld - (int64_t *)(stats)[0])
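The translation works because the statistics structures are laid out as consecutive int64_t fields, so subtracting the base pointer yields a field's index. A tiny standalone illustration (hypothetical struct):

#include <stdint.h>
#include <stdio.h>

struct ex_stats {
	int64_t a, b, c;
};

int
main(void)
{
	struct ex_stats s, *sp = &s;

	/* &sp->c - (int64_t *)sp == 2: c's slot in the flattened view. */
	printf("%d\n", (int)(&sp->c - (int64_t *)sp));
	return (0);
}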
+
+/*
+ * Sum the values from all structures in the array.
+ */
+static inline int64_t
+__wt_stats_aggregate(void *stats_arg, int slot)
+{
+ int64_t **stats, aggr_v;
+ int i;
+
+ stats = stats_arg;
+ for (aggr_v = 0, i = 0; i < WT_COUNTER_SLOTS; i++)
+ aggr_v += stats[i][slot];
+
+ /*
+ * This can race. However, any implementation with a single value can
+ * race as well, different threads could set the same counter value
+ * simultaneously. While we are making races more likely, we are not
+ * fundamentally weakening the isolation semantics found in updating a
+ * single value.
+ *
+ * Additionally, the aggregation can go negative (imagine a thread
+ * incrementing a value after aggregation has passed its slot and a
+ * second thread decrementing a value before aggregation has reached
+ * its slot).
+ *
+ * For historic API compatibility, the external type is a uint64_t;
+	 * limit our return to non-negative values; negative numbers would just
+ * look really, really large.
+ */
+ if (aggr_v < 0)
+ aggr_v = 0;
+ return (aggr_v);
+}
+
+/*
+ * Clear the values in all structures in the array.
+ */
+static inline void
+__wt_stats_clear(void *stats_arg, int slot)
+{
+ int64_t **stats;
+ int i;
+
+ stats = stats_arg;
+ for (i = 0; i < WT_COUNTER_SLOTS; i++)
+ stats[i][slot] = 0;
+}
+
+/*
+ * Read/write statistics without any test for statistics configuration. Reading
+ * and writing the field requires different actions: reading sums the values
+ * across the array of structures, writing updates a single structure's value.
+ */
+#define WT_STAT_READ(stats, fld) \
+ __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
+#define WT_STAT_WRITE(session, stats, fld) \
+	((stats)[WT_STATS_SLOT_ID(session)]->fld)
+
+#define WT_STAT_DECRV(session, stats, fld, value) \
+ (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value)
+#define WT_STAT_DECR(session, stats, fld) \
+ WT_STAT_DECRV(session, stats, fld, 1)
+#define WT_STAT_INCRV(session, stats, fld, value) \
+ (stats)[WT_STATS_SLOT_ID(session)]->fld += (int64_t)(value)
+#define WT_STAT_INCR(session, stats, fld) \
+ WT_STAT_INCRV(session, stats, fld, 1)
+#define WT_STAT_SET(session, stats, fld, value) do { \
+ __wt_stats_clear(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)); \
+ (stats)[0]->fld = (int64_t)(value); \
} while (0)
-#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \
- WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1)
+
+/*
+ * Update statistics if "fast" statistics are configured.
+ */
#define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \
if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_DECRV(stats, fld, value); \
+ WT_STAT_DECRV(session, stats, fld, value); \
} while (0)
#define WT_STAT_FAST_DECR(session, stats, fld) \
WT_STAT_FAST_DECRV(session, stats, fld, 1)
#define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \
if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_INCRV(stats, fld, value); \
+ WT_STAT_INCRV(session, stats, fld, value); \
} while (0)
#define WT_STAT_FAST_INCR(session, stats, fld) \
WT_STAT_FAST_INCRV(session, stats, fld, 1)
#define WT_STAT_FAST_SET(session, stats, fld, value) do { \
if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_SET(stats, fld, value); \
+ WT_STAT_SET(session, stats, fld, value); \
} while (0)
/*
- * Read/write connection handle statistics if "fast" statistics are configured.
+ * Update connection handle statistics if "fast" statistics are configured.
*/
-#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \
- WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value)
-#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \
- WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld)
-#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \
- WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value)
-#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \
- WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld)
#define WT_STAT_FAST_CONN_DECR(session, fld) \
- WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld)
+ WT_STAT_FAST_DECR(session, S2C(session)->stats, fld)
#define WT_STAT_FAST_CONN_DECRV(session, fld, value) \
- WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value)
+ WT_STAT_FAST_DECRV(session, S2C(session)->stats, fld, value)
#define WT_STAT_FAST_CONN_INCR(session, fld) \
- WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld)
+ WT_STAT_FAST_INCR(session, S2C(session)->stats, fld)
#define WT_STAT_FAST_CONN_INCRV(session, fld, value) \
- WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value)
+ WT_STAT_FAST_INCRV(session, S2C(session)->stats, fld, value)
#define WT_STAT_FAST_CONN_SET(session, fld, value) \
- WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value)
+ WT_STAT_FAST_SET(session, S2C(session)->stats, fld, value)
/*
- * Read/write data-source handle statistics if the data-source handle is set
- * and "fast" statistics are configured.
+ * Update data-source handle statistics if "fast" statistics are configured
+ * and the data-source handle is set.
*
* XXX
* We shouldn't have to check if the data-source handle is NULL, but it's
- * useful until everything is converted to using data-source handles.
+ * necessary until everything is converted to using data-source handles.
*/
#define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \
if ((session)->dhandle != NULL) \
WT_STAT_FAST_DECRV( \
- session, &(session)->dhandle->stats, fld, value); \
+ session, (session)->dhandle->stats, fld, value); \
} while (0)
#define WT_STAT_FAST_DATA_DECR(session, fld) \
WT_STAT_FAST_DATA_DECRV(session, fld, 1)
#define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \
if ((session)->dhandle != NULL) \
WT_STAT_FAST_INCRV( \
- session, &(session)->dhandle->stats, fld, value); \
+ session, (session)->dhandle->stats, fld, value); \
} while (0)
#define WT_STAT_FAST_DATA_INCR(session, fld) \
WT_STAT_FAST_DATA_INCRV(session, fld, 1)
#define WT_STAT_FAST_DATA_SET(session, fld, value) do { \
if ((session)->dhandle != NULL) \
WT_STAT_FAST_SET( \
- session, &(session)->dhandle->stats, fld, value); \
+ session, (session)->dhandle->stats, fld, value); \
} while (0)
-/* Connection handle statistics value. */
-#define WT_CONN_STAT(session, fld) \
- WT_STAT(&S2C(session)->stats, fld)
-
/*
* DO NOT EDIT: automatically built by dist/stat.py.
*/
@@ -132,148 +227,157 @@ struct __wt_stats {
*/
#define WT_CONNECTION_STATS_BASE 1000
struct __wt_connection_stats {
- WT_STATS async_alloc_race;
- WT_STATS async_alloc_view;
- WT_STATS async_cur_queue;
- WT_STATS async_flush;
- WT_STATS async_full;
- WT_STATS async_max_queue;
- WT_STATS async_nowork;
- WT_STATS async_op_alloc;
- WT_STATS async_op_compact;
- WT_STATS async_op_insert;
- WT_STATS async_op_remove;
- WT_STATS async_op_search;
- WT_STATS async_op_update;
- WT_STATS block_byte_map_read;
- WT_STATS block_byte_read;
- WT_STATS block_byte_write;
- WT_STATS block_map_read;
- WT_STATS block_preload;
- WT_STATS block_read;
- WT_STATS block_write;
- WT_STATS cache_bytes_dirty;
- WT_STATS cache_bytes_internal;
- WT_STATS cache_bytes_inuse;
- WT_STATS cache_bytes_leaf;
- WT_STATS cache_bytes_max;
- WT_STATS cache_bytes_overflow;
- WT_STATS cache_bytes_read;
- WT_STATS cache_bytes_write;
- WT_STATS cache_eviction_app;
- WT_STATS cache_eviction_checkpoint;
- WT_STATS cache_eviction_clean;
- WT_STATS cache_eviction_deepen;
- WT_STATS cache_eviction_dirty;
- WT_STATS cache_eviction_fail;
- WT_STATS cache_eviction_force;
- WT_STATS cache_eviction_force_delete;
- WT_STATS cache_eviction_force_fail;
- WT_STATS cache_eviction_hazard;
- WT_STATS cache_eviction_internal;
- WT_STATS cache_eviction_maximum_page_size;
- WT_STATS cache_eviction_queue_empty;
- WT_STATS cache_eviction_queue_not_empty;
- WT_STATS cache_eviction_server_evicting;
- WT_STATS cache_eviction_server_not_evicting;
- WT_STATS cache_eviction_slow;
- WT_STATS cache_eviction_split;
- WT_STATS cache_eviction_walk;
- WT_STATS cache_eviction_worker_evicting;
- WT_STATS cache_inmem_split;
- WT_STATS cache_overhead;
- WT_STATS cache_pages_dirty;
- WT_STATS cache_pages_inuse;
- WT_STATS cache_read;
- WT_STATS cache_write;
- WT_STATS cond_wait;
- WT_STATS cursor_create;
- WT_STATS cursor_insert;
- WT_STATS cursor_next;
- WT_STATS cursor_prev;
- WT_STATS cursor_remove;
- WT_STATS cursor_reset;
- WT_STATS cursor_search;
- WT_STATS cursor_search_near;
- WT_STATS cursor_update;
- WT_STATS dh_conn_handles;
- WT_STATS dh_conn_ref;
- WT_STATS dh_conn_sweeps;
- WT_STATS dh_conn_tod;
- WT_STATS dh_session_handles;
- WT_STATS dh_session_sweeps;
- WT_STATS file_open;
- WT_STATS log_buffer_size;
- WT_STATS log_bytes_payload;
- WT_STATS log_bytes_written;
- WT_STATS log_close_yields;
- WT_STATS log_compress_len;
- WT_STATS log_compress_mem;
- WT_STATS log_compress_small;
- WT_STATS log_compress_write_fails;
- WT_STATS log_compress_writes;
- WT_STATS log_max_filesize;
- WT_STATS log_prealloc_files;
- WT_STATS log_prealloc_max;
- WT_STATS log_prealloc_used;
- WT_STATS log_release_write_lsn;
- WT_STATS log_scan_records;
- WT_STATS log_scan_rereads;
- WT_STATS log_scans;
- WT_STATS log_slot_closes;
- WT_STATS log_slot_coalesced;
- WT_STATS log_slot_consolidated;
- WT_STATS log_slot_joins;
- WT_STATS log_slot_races;
- WT_STATS log_slot_toobig;
- WT_STATS log_slot_toosmall;
- WT_STATS log_slot_transitions;
- WT_STATS log_sync;
- WT_STATS log_sync_dir;
- WT_STATS log_write_lsn;
- WT_STATS log_writes;
- WT_STATS lsm_checkpoint_throttle;
- WT_STATS lsm_merge_throttle;
- WT_STATS lsm_rows_merged;
- WT_STATS lsm_work_queue_app;
- WT_STATS lsm_work_queue_manager;
- WT_STATS lsm_work_queue_max;
- WT_STATS lsm_work_queue_switch;
- WT_STATS lsm_work_units_created;
- WT_STATS lsm_work_units_discarded;
- WT_STATS lsm_work_units_done;
- WT_STATS memory_allocation;
- WT_STATS memory_free;
- WT_STATS memory_grow;
- WT_STATS page_busy_blocked;
- WT_STATS page_forcible_evict_blocked;
- WT_STATS page_locked_blocked;
- WT_STATS page_read_blocked;
- WT_STATS page_sleep;
- WT_STATS read_io;
- WT_STATS rec_pages;
- WT_STATS rec_pages_eviction;
- WT_STATS rec_split_stashed_bytes;
- WT_STATS rec_split_stashed_objects;
- WT_STATS rwlock_read;
- WT_STATS rwlock_write;
- WT_STATS session_cursor_open;
- WT_STATS session_open;
- WT_STATS txn_begin;
- WT_STATS txn_checkpoint;
- WT_STATS txn_checkpoint_generation;
- WT_STATS txn_checkpoint_running;
- WT_STATS txn_checkpoint_time_max;
- WT_STATS txn_checkpoint_time_min;
- WT_STATS txn_checkpoint_time_recent;
- WT_STATS txn_checkpoint_time_total;
- WT_STATS txn_commit;
- WT_STATS txn_fail_cache;
- WT_STATS txn_pinned_checkpoint_range;
- WT_STATS txn_pinned_range;
- WT_STATS txn_rollback;
- WT_STATS txn_sync;
- WT_STATS write_io;
+ int64_t async_alloc_race;
+ int64_t async_alloc_view;
+ int64_t async_cur_queue;
+ int64_t async_flush;
+ int64_t async_full;
+ int64_t async_max_queue;
+ int64_t async_nowork;
+ int64_t async_op_alloc;
+ int64_t async_op_compact;
+ int64_t async_op_insert;
+ int64_t async_op_remove;
+ int64_t async_op_search;
+ int64_t async_op_update;
+ int64_t block_byte_map_read;
+ int64_t block_byte_read;
+ int64_t block_byte_write;
+ int64_t block_map_read;
+ int64_t block_preload;
+ int64_t block_read;
+ int64_t block_write;
+ int64_t cache_bytes_dirty;
+ int64_t cache_bytes_internal;
+ int64_t cache_bytes_inuse;
+ int64_t cache_bytes_leaf;
+ int64_t cache_bytes_max;
+ int64_t cache_bytes_overflow;
+ int64_t cache_bytes_read;
+ int64_t cache_bytes_write;
+ int64_t cache_eviction_app;
+ int64_t cache_eviction_checkpoint;
+ int64_t cache_eviction_clean;
+ int64_t cache_eviction_deepen;
+ int64_t cache_eviction_dirty;
+ int64_t cache_eviction_fail;
+ int64_t cache_eviction_force;
+ int64_t cache_eviction_force_delete;
+ int64_t cache_eviction_force_fail;
+ int64_t cache_eviction_hazard;
+ int64_t cache_eviction_internal;
+ int64_t cache_eviction_maximum_page_size;
+ int64_t cache_eviction_queue_empty;
+ int64_t cache_eviction_queue_not_empty;
+ int64_t cache_eviction_server_evicting;
+ int64_t cache_eviction_server_not_evicting;
+ int64_t cache_eviction_slow;
+ int64_t cache_eviction_split;
+ int64_t cache_eviction_walk;
+ int64_t cache_eviction_worker_evicting;
+ int64_t cache_inmem_split;
+ int64_t cache_inmem_splittable;
+ int64_t cache_lookaside_insert;
+ int64_t cache_lookaside_remove;
+ int64_t cache_overhead;
+ int64_t cache_pages_dirty;
+ int64_t cache_pages_inuse;
+ int64_t cache_read;
+ int64_t cache_read_lookaside;
+ int64_t cache_write;
+ int64_t cache_write_lookaside;
+ int64_t cache_write_restore;
+ int64_t cond_wait;
+ int64_t cursor_create;
+ int64_t cursor_insert;
+ int64_t cursor_next;
+ int64_t cursor_prev;
+ int64_t cursor_remove;
+ int64_t cursor_reset;
+ int64_t cursor_restart;
+ int64_t cursor_search;
+ int64_t cursor_search_near;
+ int64_t cursor_update;
+ int64_t dh_conn_handle_count;
+ int64_t dh_session_handles;
+ int64_t dh_session_sweeps;
+ int64_t dh_sweep_close;
+ int64_t dh_sweep_ref;
+ int64_t dh_sweep_remove;
+ int64_t dh_sweep_tod;
+ int64_t dh_sweeps;
+ int64_t file_open;
+ int64_t log_buffer_size;
+ int64_t log_bytes_payload;
+ int64_t log_bytes_written;
+ int64_t log_close_yields;
+ int64_t log_compress_len;
+ int64_t log_compress_mem;
+ int64_t log_compress_small;
+ int64_t log_compress_write_fails;
+ int64_t log_compress_writes;
+ int64_t log_max_filesize;
+ int64_t log_prealloc_files;
+ int64_t log_prealloc_max;
+ int64_t log_prealloc_used;
+ int64_t log_release_write_lsn;
+ int64_t log_scan_records;
+ int64_t log_scan_rereads;
+ int64_t log_scans;
+ int64_t log_slot_closes;
+ int64_t log_slot_coalesced;
+ int64_t log_slot_consolidated;
+ int64_t log_slot_joins;
+ int64_t log_slot_races;
+ int64_t log_slot_switch_busy;
+ int64_t log_slot_transitions;
+ int64_t log_slot_unbuffered;
+ int64_t log_sync;
+ int64_t log_sync_dir;
+ int64_t log_write_lsn;
+ int64_t log_writes;
+ int64_t lsm_checkpoint_throttle;
+ int64_t lsm_merge_throttle;
+ int64_t lsm_rows_merged;
+ int64_t lsm_work_queue_app;
+ int64_t lsm_work_queue_manager;
+ int64_t lsm_work_queue_max;
+ int64_t lsm_work_queue_switch;
+ int64_t lsm_work_units_created;
+ int64_t lsm_work_units_discarded;
+ int64_t lsm_work_units_done;
+ int64_t memory_allocation;
+ int64_t memory_free;
+ int64_t memory_grow;
+ int64_t page_busy_blocked;
+ int64_t page_forcible_evict_blocked;
+ int64_t page_locked_blocked;
+ int64_t page_read_blocked;
+ int64_t page_sleep;
+ int64_t read_io;
+ int64_t rec_pages;
+ int64_t rec_pages_eviction;
+ int64_t rec_split_stashed_bytes;
+ int64_t rec_split_stashed_objects;
+ int64_t rwlock_read;
+ int64_t rwlock_write;
+ int64_t session_cursor_open;
+ int64_t session_open;
+ int64_t txn_begin;
+ int64_t txn_checkpoint;
+ int64_t txn_checkpoint_generation;
+ int64_t txn_checkpoint_running;
+ int64_t txn_checkpoint_time_max;
+ int64_t txn_checkpoint_time_min;
+ int64_t txn_checkpoint_time_recent;
+ int64_t txn_checkpoint_time_total;
+ int64_t txn_commit;
+ int64_t txn_fail_cache;
+ int64_t txn_pinned_checkpoint_range;
+ int64_t txn_pinned_range;
+ int64_t txn_rollback;
+ int64_t txn_sync;
+ int64_t write_io;
};
/*
@@ -281,96 +385,102 @@ struct __wt_connection_stats {
*/
#define WT_DSRC_STATS_BASE 2000
struct __wt_dsrc_stats {
- WT_STATS allocation_size;
- WT_STATS block_alloc;
- WT_STATS block_checkpoint_size;
- WT_STATS block_extension;
- WT_STATS block_free;
- WT_STATS block_magic;
- WT_STATS block_major;
- WT_STATS block_minor;
- WT_STATS block_reuse_bytes;
- WT_STATS block_size;
- WT_STATS bloom_count;
- WT_STATS bloom_false_positive;
- WT_STATS bloom_hit;
- WT_STATS bloom_miss;
- WT_STATS bloom_page_evict;
- WT_STATS bloom_page_read;
- WT_STATS bloom_size;
- WT_STATS btree_checkpoint_generation;
- WT_STATS btree_column_deleted;
- WT_STATS btree_column_fix;
- WT_STATS btree_column_internal;
- WT_STATS btree_column_variable;
- WT_STATS btree_compact_rewrite;
- WT_STATS btree_entries;
- WT_STATS btree_fixed_len;
- WT_STATS btree_maximum_depth;
- WT_STATS btree_maxintlkey;
- WT_STATS btree_maxintlpage;
- WT_STATS btree_maxleafkey;
- WT_STATS btree_maxleafpage;
- WT_STATS btree_maxleafvalue;
- WT_STATS btree_overflow;
- WT_STATS btree_row_internal;
- WT_STATS btree_row_leaf;
- WT_STATS cache_bytes_read;
- WT_STATS cache_bytes_write;
- WT_STATS cache_eviction_checkpoint;
- WT_STATS cache_eviction_clean;
- WT_STATS cache_eviction_deepen;
- WT_STATS cache_eviction_dirty;
- WT_STATS cache_eviction_fail;
- WT_STATS cache_eviction_hazard;
- WT_STATS cache_eviction_internal;
- WT_STATS cache_eviction_split;
- WT_STATS cache_inmem_split;
- WT_STATS cache_overflow_value;
- WT_STATS cache_read;
- WT_STATS cache_read_overflow;
- WT_STATS cache_write;
- WT_STATS compress_raw_fail;
- WT_STATS compress_raw_fail_temporary;
- WT_STATS compress_raw_ok;
- WT_STATS compress_read;
- WT_STATS compress_write;
- WT_STATS compress_write_fail;
- WT_STATS compress_write_too_small;
- WT_STATS cursor_create;
- WT_STATS cursor_insert;
- WT_STATS cursor_insert_bulk;
- WT_STATS cursor_insert_bytes;
- WT_STATS cursor_next;
- WT_STATS cursor_prev;
- WT_STATS cursor_remove;
- WT_STATS cursor_remove_bytes;
- WT_STATS cursor_reset;
- WT_STATS cursor_search;
- WT_STATS cursor_search_near;
- WT_STATS cursor_update;
- WT_STATS cursor_update_bytes;
- WT_STATS lsm_checkpoint_throttle;
- WT_STATS lsm_chunk_count;
- WT_STATS lsm_generation_max;
- WT_STATS lsm_lookup_no_bloom;
- WT_STATS lsm_merge_throttle;
- WT_STATS rec_dictionary;
- WT_STATS rec_multiblock_internal;
- WT_STATS rec_multiblock_leaf;
- WT_STATS rec_multiblock_max;
- WT_STATS rec_overflow_key_internal;
- WT_STATS rec_overflow_key_leaf;
- WT_STATS rec_overflow_value;
- WT_STATS rec_page_delete;
- WT_STATS rec_page_match;
- WT_STATS rec_pages;
- WT_STATS rec_pages_eviction;
- WT_STATS rec_prefix_compression;
- WT_STATS rec_suffix_compression;
- WT_STATS session_compact;
- WT_STATS session_cursor_open;
- WT_STATS txn_update_conflict;
+ int64_t allocation_size;
+ int64_t block_alloc;
+ int64_t block_checkpoint_size;
+ int64_t block_extension;
+ int64_t block_free;
+ int64_t block_magic;
+ int64_t block_major;
+ int64_t block_minor;
+ int64_t block_reuse_bytes;
+ int64_t block_size;
+ int64_t bloom_count;
+ int64_t bloom_false_positive;
+ int64_t bloom_hit;
+ int64_t bloom_miss;
+ int64_t bloom_page_evict;
+ int64_t bloom_page_read;
+ int64_t bloom_size;
+ int64_t btree_checkpoint_generation;
+ int64_t btree_column_deleted;
+ int64_t btree_column_fix;
+ int64_t btree_column_internal;
+ int64_t btree_column_rle;
+ int64_t btree_column_variable;
+ int64_t btree_compact_rewrite;
+ int64_t btree_entries;
+ int64_t btree_fixed_len;
+ int64_t btree_maximum_depth;
+ int64_t btree_maxintlkey;
+ int64_t btree_maxintlpage;
+ int64_t btree_maxleafkey;
+ int64_t btree_maxleafpage;
+ int64_t btree_maxleafvalue;
+ int64_t btree_overflow;
+ int64_t btree_row_internal;
+ int64_t btree_row_leaf;
+ int64_t cache_bytes_read;
+ int64_t cache_bytes_write;
+ int64_t cache_eviction_checkpoint;
+ int64_t cache_eviction_clean;
+ int64_t cache_eviction_deepen;
+ int64_t cache_eviction_dirty;
+ int64_t cache_eviction_fail;
+ int64_t cache_eviction_hazard;
+ int64_t cache_eviction_internal;
+ int64_t cache_eviction_split;
+ int64_t cache_inmem_split;
+ int64_t cache_inmem_splittable;
+ int64_t cache_overflow_value;
+ int64_t cache_read;
+ int64_t cache_read_lookaside;
+ int64_t cache_read_overflow;
+ int64_t cache_write;
+ int64_t cache_write_lookaside;
+ int64_t cache_write_restore;
+ int64_t compress_raw_fail;
+ int64_t compress_raw_fail_temporary;
+ int64_t compress_raw_ok;
+ int64_t compress_read;
+ int64_t compress_write;
+ int64_t compress_write_fail;
+ int64_t compress_write_too_small;
+ int64_t cursor_create;
+ int64_t cursor_insert;
+ int64_t cursor_insert_bulk;
+ int64_t cursor_insert_bytes;
+ int64_t cursor_next;
+ int64_t cursor_prev;
+ int64_t cursor_remove;
+ int64_t cursor_remove_bytes;
+ int64_t cursor_reset;
+ int64_t cursor_restart;
+ int64_t cursor_search;
+ int64_t cursor_search_near;
+ int64_t cursor_update;
+ int64_t cursor_update_bytes;
+ int64_t lsm_checkpoint_throttle;
+ int64_t lsm_chunk_count;
+ int64_t lsm_generation_max;
+ int64_t lsm_lookup_no_bloom;
+ int64_t lsm_merge_throttle;
+ int64_t rec_dictionary;
+ int64_t rec_multiblock_internal;
+ int64_t rec_multiblock_leaf;
+ int64_t rec_multiblock_max;
+ int64_t rec_overflow_key_internal;
+ int64_t rec_overflow_key_leaf;
+ int64_t rec_overflow_value;
+ int64_t rec_page_delete;
+ int64_t rec_page_match;
+ int64_t rec_pages;
+ int64_t rec_pages_eviction;
+ int64_t rec_prefix_compression;
+ int64_t rec_suffix_compression;
+ int64_t session_compact;
+ int64_t session_cursor_open;
+ int64_t txn_update_conflict;
};
/* Statistics section: END */
diff --git a/src/include/txn.h b/src/include/txn.h
index 7a67f713244..4a325c70a95 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -31,7 +31,7 @@
struct __wt_named_snapshot {
const char *name;
- STAILQ_ENTRY(__wt_named_snapshot) q;
+ TAILQ_ENTRY(__wt_named_snapshot) q;
uint64_t snap_min, snap_max;
uint64_t *snapshot;
@@ -72,15 +72,14 @@ struct __wt_txn_global {
/* Named snapshot state. */
WT_RWLOCK *nsnap_rwlock;
volatile uint64_t nsnap_oldest_id;
- STAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph;
+ TAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph;
WT_TXN_STATE *states; /* Per-session transaction states */
};
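The STAILQ-to-TAILQ switch gives the named-snapshot list back-links, so dropping an arbitrary snapshot no longer requires walking to its predecessor. A minimal sketch with the BSD queue macros (WiredTiger ships its own queue.h; the struct and helper here are illustrative, not the real code):

#include <stdlib.h>
#include <sys/queue.h>

struct nsnap {
        char *name;
        TAILQ_ENTRY(nsnap) q;           /* forward and backward links */
};
TAILQ_HEAD(nsnap_qh, nsnap);

/* Removal by pointer is O(1) with a doubly-linked TAILQ. */
static void
nsnap_drop(struct nsnap_qh *qh, struct nsnap *s)
{
        TAILQ_REMOVE(qh, s, q);
        free(s->name);
        free(s);
}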
typedef enum __wt_txn_isolation {
- WT_ISO_EVICTION, /* Internal: eviction context */
- WT_ISO_READ_UNCOMMITTED,
WT_ISO_READ_COMMITTED,
+ WT_ISO_READ_UNCOMMITTED,
WT_ISO_SNAPSHOT
} WT_TXN_ISOLATION;
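With the internal-only WT_ISO_EVICTION level removed, the enum now mirrors exactly the three isolation levels selectable through configuration strings. A hedged sketch against the public API (error handling trimmed):

#include <wiredtiger.h>

static int
open_snapshot_session(WT_CONNECTION *conn, WT_SESSION **sessionp)
{
        int ret;

        /* The session default maps to WT_ISO_SNAPSHOT internally. */
        if ((ret = conn->open_session(
            conn, NULL, "isolation=snapshot", sessionp)) != 0)
                return (ret);
        /* Individual transactions can override the session default. */
        return ((*sessionp)->begin_transaction(
            *sessionp, "isolation=read-committed"));
}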
diff --git a/src/include/txn.i b/src/include/txn.i
index a9b54d26e47..2b42990f5e5 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -140,12 +140,22 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
}
/*
+ * __wt_txn_committed --
+ * Return if a transaction has been committed.
+ */
+static inline bool
+__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id)
+{
+ return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running));
+}
+
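Transaction IDs are monotonically increasing 64-bit counters that never wrap (see the comment further down about exhausting 64 bits of IDs), so WT_TXNID_LT is a plain unsigned compare and "committed" reduces to being older than the oldest ID still running. An illustrative stand-in only:

#include <stdbool.h>
#include <stdint.h>

static bool
txnid_committed(uint64_t id, uint64_t last_running)
{
        /* Any ID below the oldest running ID has already resolved. */
        return (id < last_running);
}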
+/*
* __wt_txn_visible_all --
* Check if a given transaction ID is "globally visible". That is,
* whether all sessions in the system will see the transaction ID,
* including the ID that belongs to a running checkpoint.
*/
-static inline int
+static inline bool
__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
{
uint64_t oldest_id;
@@ -159,28 +169,21 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
* __wt_txn_visible --
* Can the current transaction see the given ID?
*/
-static inline int
+static inline bool
__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
{
WT_TXN *txn;
- int found;
+ bool found;
txn = &session->txn;
/* Changes with no associated transaction are always visible. */
if (id == WT_TXN_NONE)
- return (1);
+ return (true);
/* Nobody sees the results of aborted transactions. */
if (id == WT_TXN_ABORTED)
- return (0);
-
- /*
- * Eviction only sees globally visible updates, or if there is a
- * checkpoint transaction running, use its transaction.
- */
- if (txn->isolation == WT_ISO_EVICTION)
- return (__wt_txn_visible_all(session, id));
+ return (false);
/*
* Read-uncommitted transactions see all other changes.
@@ -194,11 +197,11 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
*/
if (txn->isolation == WT_ISO_READ_UNCOMMITTED ||
session->dhandle == session->meta_dhandle)
- return (1);
+ return (true);
/* Transactions see their own changes. */
if (id == txn->id)
- return (1);
+ return (true);
/*
* WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
@@ -210,9 +213,9 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
* snapshot is empty.
*/
if (WT_TXNID_LE(txn->snap_max, id))
- return (0);
+ return (false);
if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
- return (1);
+ return (true);
WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
return (!found);
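The snapshot check above has three tiers: IDs at or beyond snap_max are newer than the snapshot and invisible; IDs below snap_min predate every transaction that was running and are visible; anything in between is visible only if it was not concurrent, that is, not in the snapshot array. A self-contained sketch, with the search mirroring the WT_BINARY_SEARCH pattern (all names are illustrative):

#include <stdbool.h>
#include <stdint.h>

static bool
id_in_snapshot(uint64_t id, const uint64_t *snap, uint32_t cnt)
{
        uint32_t base, indx, limit;

        for (base = 0, limit = cnt; limit != 0; limit >>= 1) {
                indx = base + (limit >> 1);
                if (snap[indx] < id) {
                        base = indx + 1;
                        --limit;
                } else if (snap[indx] == id)
                        return (true);
        }
        return (false);
}

static bool
snapshot_visible(uint64_t id, uint64_t snap_min, uint64_t snap_max,
    const uint64_t *snap, uint32_t cnt)
{
        if (id >= snap_max)                     /* too new to see */
                return (false);
        if (cnt == 0 || id < snap_min)          /* older than everyone */
                return (true);
        return (!id_in_snapshot(id, snap, cnt));
}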
@@ -266,7 +269,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
}
F_SET(txn, WT_TXN_RUNNING);
- return (0);
+ return (false);
}
/*
@@ -300,7 +303,7 @@ __wt_txn_new_id(WT_SESSION_IMPL *session)
* global current ID, so we want post-increment semantics. Our atomic
* add primitive does pre-increment, so adjust the result here.
*/
- return (WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1) - 1);
+ return (__wt_atomic_addv64(&S2C(session)->txn_global.current, 1) - 1);
}
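The comment above is the whole trick: the new atomic wrapper (like the old macro) returns the value after the addition, pre-increment style, and subtracting one restores the post-increment semantics the allocator wants. A sketch with the GCC/Clang builtins, assuming nothing about the real wrapper:

#include <stdint.h>

static uint64_t
new_txn_id(uint64_t *currentp)
{
        /* __atomic_add_fetch returns the incremented value. */
        return (__atomic_add_fetch(currentp, 1, __ATOMIC_SEQ_CST) - 1);
}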
/*
@@ -376,8 +379,9 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
*/
do {
txn_state->id = txn->id = txn_global->current;
- } while (!WT_ATOMIC_CAS8(
- txn_global->current, txn->id, txn->id + 1));
+ } while (!__wt_atomic_casv64(
+ &txn_global->current, txn->id, txn->id + 1) ||
+ WT_TXNID_LT(txn->id, txn_global->last_running));
/*
* If we have used 64-bits of transaction IDs, there is nothing
@@ -476,7 +480,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
* __wt_txn_am_oldest --
* Am I the oldest transaction in the system?
*/
-static inline int
+static inline bool
__wt_txn_am_oldest(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
@@ -491,12 +495,12 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session)
txn_global = &conn->txn_global;
if (txn->id == WT_TXN_NONE)
- return (0);
+ return (false);
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
- return (0);
+ return (false);
- return (1);
+ return (true);
}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index e8f3b9958ce..71ba3f41a44 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1750,6 +1750,9 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that
* is shared between databases or \c "none" when no shared cache is
* configured., a string; default \c none.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of cache this
+ * database can be allocated from the shared cache. Defaults to the
+ * entire shared cache size., an integer; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this
* database is guaranteed to have available from the shared cache. This
* setting is per database. Defaults to the chunk size., an integer;
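A hedged example of the new knob: the connection joins a shared cache pool and is capped at a quota while keeping a guaranteed reserve (the home path and sizes are placeholders):

#include <wiredtiger.h>

static int
open_with_shared_cache(WT_CONNECTION **connp)
{
        /*
         * quota caps what this database may take from the pool and
         * defaults to the whole pool; reserve is what it is
         * guaranteed and defaults to the chunk size.
         */
        return (wiredtiger_open("/path/to/db", NULL,
            "create,"
            "shared_cache=(name=pool,size=500MB,quota=200MB,reserve=50MB)",
            connp));
}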
@@ -2072,8 +2075,10 @@ struct __wt_connection {
* @config{checkpoint_sync, flush files to stable storage when closing or
* writing checkpoints., a boolean flag; default \c true.}
* @config{config_base, write the base configuration file if creating the
- * database\, see @ref config_base for more information., a boolean flag;
- * default \c true.}
+ * database. If \c false in the config passed directly to ::wiredtiger_open\,
+ * will ignore any existing base configuration file in addition to not creating
+ * one. See @ref config_base for more information., a boolean flag; default \c
+ * true.}
* @config{create, create the database if it does not exist., a boolean flag;
* default \c false.}
* @config{direct_io, Use \c O_DIRECT to access files. Options are given as a
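The sharpened config_base wording is worth restating in code: passing \c false directly to ::wiredtiger_open now both suppresses writing the base configuration file and ignores an existing one. A minimal sketch (the home path is a placeholder):

#include <wiredtiger.h>

static int
open_ignoring_basecfg(const char *home, WT_CONNECTION **connp)
{
        /*
         * With config_base=false in the direct config, an existing
         * base configuration file in home is ignored, not merged.
         */
        return (wiredtiger_open(home, NULL,
            "create,config_base=false", connp));
}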
@@ -2214,10 +2219,12 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that is shared
* between databases or \c "none" when no shared cache is configured., a string;
* default \c none.}
- * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache
- * this database is guaranteed to have available from the shared cache. This
- * setting is per database. Defaults to the chunk size., an integer; default \c
- * 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of
+ * cache this database can be allocated from the shared cache. Defaults to the
+ * entire shared cache size., an integer; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database is
+ * guaranteed to have available from the shared cache. This setting is per
+ * database. Defaults to the chunk size., an integer; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for the
* shared cache. Setting this will update the value if one is already set., an
* integer between 1MB and 10TB; default \c 500MB.}
@@ -3640,192 +3647,210 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047
/*! cache: in-memory page splits */
#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048
+/*! cache: in-memory page passed criteria to be split */
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049
+/*! cache: lookaside table insert calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050
+/*! cache: lookaside table remove calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1049
+#define WT_STAT_CONN_CACHE_OVERHEAD 1052
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1050
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1051
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1052
+#define WT_STAT_CONN_CACHE_READ 1055
+/*! cache: pages read into cache requiring lookaside entries */
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1053
+#define WT_STAT_CONN_CACHE_WRITE 1057
+/*! cache: page written requiring lookaside records */
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058
+/*! cache: pages written requiring in-memory restoration */
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1054
+#define WT_STAT_CONN_COND_WAIT 1060
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1055
+#define WT_STAT_CONN_CURSOR_CREATE 1061
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1056
+#define WT_STAT_CONN_CURSOR_INSERT 1062
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1057
+#define WT_STAT_CONN_CURSOR_NEXT 1063
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1058
+#define WT_STAT_CONN_CURSOR_PREV 1064
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1059
+#define WT_STAT_CONN_CURSOR_REMOVE 1065
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1060
+#define WT_STAT_CONN_CURSOR_RESET 1066
+/*! cursor: cursor restarted searches */
+#define WT_STAT_CONN_CURSOR_RESTART 1067
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1061
+#define WT_STAT_CONN_CURSOR_SEARCH 1068
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1062
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1063
-/*! data-handle: connection dhandles swept */
-#define WT_STAT_CONN_DH_CONN_HANDLES 1064
-/*! data-handle: connection candidate referenced */
-#define WT_STAT_CONN_DH_CONN_REF 1065
-/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_CONN_SWEEPS 1066
-/*! data-handle: connection time-of-death sets */
-#define WT_STAT_CONN_DH_CONN_TOD 1067
+#define WT_STAT_CONN_CURSOR_UPDATE 1070
+/*! data-handle: connection data handles currently active */
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1068
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1072
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1069
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073
+/*! data-handle: connection sweep dhandles closed */
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074
+/*! data-handle: connection sweep candidate became referenced */
+#define WT_STAT_CONN_DH_SWEEP_REF 1075
+/*! data-handle: connection sweep dhandles removed from hash list */
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076
+/*! data-handle: connection sweep time-of-death sets */
+#define WT_STAT_CONN_DH_SWEEP_TOD 1077
+/*! data-handle: connection sweeps */
+#define WT_STAT_CONN_DH_SWEEPS 1078
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1070
+#define WT_STAT_CONN_FILE_OPEN 1079
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1071
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1072
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1073
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1074
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1075
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1076
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1077
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1078
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1079
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1080
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1089
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1081
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1090
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1082
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1091
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1083
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1092
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1084
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1093
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1085
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1094
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1086
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1095
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1087
+#define WT_STAT_CONN_LOG_SCANS 1096
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1088
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1097
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1089
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1098
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1099
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1091
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1100
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1092
-/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1093
-/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1094
+#define WT_STAT_CONN_LOG_SLOT_RACES 1101
+/*! log: busy returns attempting to switch slots */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1095
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1103
+/*! log: consolidated slot unbuffered writes */
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1104
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1096
+#define WT_STAT_CONN_LOG_SYNC 1105
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1097
+#define WT_STAT_CONN_LOG_SYNC_DIR 1106
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1098
+#define WT_STAT_CONN_LOG_WRITE_LSN 1107
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1099
+#define WT_STAT_CONN_LOG_WRITES 1108
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1100
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1109
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1101
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1110
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1102
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1111
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1103
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1112
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1104
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1113
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1105
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1114
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1106
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1115
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1107
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1116
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1108
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1117
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1109
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1118
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1110
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1119
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1111
+#define WT_STAT_CONN_MEMORY_FREE 1120
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1112
+#define WT_STAT_CONN_MEMORY_GROW 1121
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1113
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1122
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1114
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1123
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1115
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1124
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1116
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1125
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1117
+#define WT_STAT_CONN_PAGE_SLEEP 1126
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1118
+#define WT_STAT_CONN_READ_IO 1127
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1119
+#define WT_STAT_CONN_REC_PAGES 1128
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1120
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1129
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1121
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1130
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1122
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1131
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1123
+#define WT_STAT_CONN_RWLOCK_READ 1132
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1124
+#define WT_STAT_CONN_RWLOCK_WRITE 1133
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1125
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1134
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1126
+#define WT_STAT_CONN_SESSION_OPEN 1135
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1127
+#define WT_STAT_CONN_TXN_BEGIN 1136
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1128
+#define WT_STAT_CONN_TXN_CHECKPOINT 1137
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1129
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1138
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1130
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1139
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1131
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1140
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1132
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1141
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1133
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1142
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1134
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1143
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1135
+#define WT_STAT_CONN_TXN_COMMIT 1144
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1136
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1145
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1137
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1146
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1138
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1147
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1139
+#define WT_STAT_CONN_TXN_ROLLBACK 1148
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1140
+#define WT_STAT_CONN_TXN_SYNC 1149
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1141
+#define WT_STAT_CONN_WRITE_IO 1150
/*!
* @}
@@ -3875,146 +3900,158 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019
/*! btree: column-store internal pages */
#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020
+/*! btree: column-store variable-size RLE encoded values */
+#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021
/*! btree: column-store variable-size leaf pages */
-#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2021
+#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022
/*! btree: pages rewritten by compaction */
-#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2022
+#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023
/*! btree: number of key/value pairs */
-#define WT_STAT_DSRC_BTREE_ENTRIES 2023
+#define WT_STAT_DSRC_BTREE_ENTRIES 2024
/*! btree: fixed-record size */
-#define WT_STAT_DSRC_BTREE_FIXED_LEN 2024
+#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025
/*! btree: maximum tree depth */
-#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2025
+#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026
/*! btree: maximum internal page key size */
-#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2026
+#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027
/*! btree: maximum internal page size */
-#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2027
+#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028
/*! btree: maximum leaf page key size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2028
+#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029
/*! btree: maximum leaf page size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2029
+#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030
/*! btree: maximum leaf page value size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2030
+#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031
/*! btree: overflow pages */
-#define WT_STAT_DSRC_BTREE_OVERFLOW 2031
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2032
/*! btree: row-store internal pages */
-#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2032
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033
/*! btree: row-store leaf pages */
-#define WT_STAT_DSRC_BTREE_ROW_LEAF 2033
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034
/*! cache: bytes read into cache */
-#define WT_STAT_DSRC_CACHE_BYTES_READ 2034
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2035
/*! cache: bytes written from cache */
-#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2035
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2036
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037
/*! cache: unmodified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2037
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2038
+#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039
/*! cache: modified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2039
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040
/*! cache: data source pages selected for eviction unable to be evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2040
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2041
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042
/*! cache: internal pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2042
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043
/*! cache: pages split during eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2043
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044
/*! cache: in-memory page splits */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2044
+#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045
+/*! cache: in-memory page passed criteria to be split */
+#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046
/*! cache: overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2045
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047
/*! cache: pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 2046
+#define WT_STAT_DSRC_CACHE_READ 2048
+/*! cache: pages read into cache requiring lookaside entries */
+#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049
/*! cache: overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2047
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2048
+#define WT_STAT_DSRC_CACHE_WRITE 2051
+/*! cache: page written requiring lookaside records */
+#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052
+/*! cache: pages written requiring in-memory restoration */
+#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2049
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2050
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2051
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2052
+#define WT_STAT_DSRC_COMPRESS_READ 2057
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2053
+#define WT_STAT_DSRC_COMPRESS_WRITE 2058
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2054
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2055
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2056
+#define WT_STAT_DSRC_CURSOR_CREATE 2061
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2057
+#define WT_STAT_DSRC_CURSOR_INSERT 2062
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2058
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2059
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2060
+#define WT_STAT_DSRC_CURSOR_NEXT 2065
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2061
+#define WT_STAT_DSRC_CURSOR_PREV 2066
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2062
+#define WT_STAT_DSRC_CURSOR_REMOVE 2067
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2063
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2064
+#define WT_STAT_DSRC_CURSOR_RESET 2069
+/*! cursor: restarted searches */
+#define WT_STAT_DSRC_CURSOR_RESTART 2070
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2065
+#define WT_STAT_DSRC_CURSOR_SEARCH 2071
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2066
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2067
+#define WT_STAT_DSRC_CURSOR_UPDATE 2073
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2068
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2069
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075
/*! LSM: chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2070
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076
/*! LSM: highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2071
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077
/*! LSM: queries that could have benefited from a Bloom filter that did
* not exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2072
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2073
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2074
+#define WT_STAT_DSRC_REC_DICTIONARY 2080
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2075
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2076
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2077
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2078
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2079
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2080
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2081
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2087
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2082
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2088
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2083
+#define WT_STAT_DSRC_REC_PAGES 2089
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2084
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2085
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2086
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2087
+#define WT_STAT_DSRC_SESSION_COMPACT 2093
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2088
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2089
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095
/*! @} */
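Any of the renumbered keys can be read back through a statistics cursor, after the pattern of examples/c/ex_stat.c. A sketch reading the new cursor-restart data-source statistic (the table name is a placeholder, and the connection must have been opened with statistics enabled):

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

static int
print_cursor_restarts(WT_SESSION *session)
{
        WT_CURSOR *cursor;
        const char *desc, *pvalue;
        uint64_t value;
        int ret;

        if ((ret = session->open_cursor(session,
            "statistics:table:mytable", NULL, NULL, &cursor)) != 0)
                return (ret);
        cursor->set_key(cursor, WT_STAT_DSRC_CURSOR_RESTART);
        if ((ret = cursor->search(cursor)) == 0 &&
            (ret = cursor->get_value(cursor, &desc, &pvalue, &value)) == 0)
                printf("%s: %" PRIu64 "\n", desc, value);
        (void)cursor->close(cursor);
        return (ret);
}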
/*
* Statistics section: END
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 64e29e104bc..4d46a25b63c 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -41,6 +41,7 @@ extern "C" {
#else
#include <pthread.h>
#endif
+#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>
@@ -55,11 +56,6 @@ extern "C" {
#include <windows.h>
#endif
-/*******************************************
- * WiredTiger externally maintained include files.
- *******************************************/
-#include "queue.h"
-
/*
* DO NOT EDIT: automatically built by dist/s_typedef.
* Forward type declarations for internal types: BEGIN
@@ -182,12 +178,18 @@ struct __wt_insert_head;
typedef struct __wt_insert_head WT_INSERT_HEAD;
struct __wt_keyed_encryptor;
typedef struct __wt_keyed_encryptor WT_KEYED_ENCRYPTOR;
+struct __wt_log;
+ typedef struct __wt_log WT_LOG;
struct __wt_log_desc;
typedef struct __wt_log_desc WT_LOG_DESC;
struct __wt_log_op_desc;
typedef struct __wt_log_op_desc WT_LOG_OP_DESC;
struct __wt_log_rec_desc;
typedef struct __wt_log_rec_desc WT_LOG_REC_DESC;
+struct __wt_log_record;
+ typedef struct __wt_log_record WT_LOG_RECORD;
+struct __wt_logslot;
+ typedef struct __wt_logslot WT_LOGSLOT;
struct __wt_lsm_chunk;
typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
struct __wt_lsm_data_source;
@@ -204,6 +206,8 @@ struct __wt_lsm_worker_cookie;
typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
struct __wt_multi;
typedef struct __wt_multi WT_MULTI;
+struct __wt_myslot;
+ typedef struct __wt_myslot WT_MYSLOT;
struct __wt_named_collator;
typedef struct __wt_named_collator WT_NAMED_COLLATOR;
struct __wt_named_compressor;
@@ -242,16 +246,18 @@ struct __wt_rwlock;
typedef struct __wt_rwlock WT_RWLOCK;
struct __wt_salvage_cookie;
typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_save_upd;
+ typedef struct __wt_save_upd WT_SAVE_UPD;
struct __wt_scratch_track;
typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
struct __wt_session_impl;
typedef struct __wt_session_impl WT_SESSION_IMPL;
struct __wt_size;
typedef struct __wt_size WT_SIZE;
+struct __wt_spinlock;
+ typedef struct __wt_spinlock WT_SPINLOCK;
struct __wt_split_stash;
typedef struct __wt_split_stash WT_SPLIT_STASH;
-struct __wt_stats;
- typedef struct __wt_stats WT_STATS;
struct __wt_table;
typedef struct __wt_table WT_TABLE;
struct __wt_txn;
@@ -262,8 +268,6 @@ struct __wt_txn_op;
typedef struct __wt_txn_op WT_TXN_OP;
struct __wt_txn_state;
typedef struct __wt_txn_state WT_TXN_STATE;
-struct __wt_upd_skipped;
- typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
struct __wt_update;
typedef struct __wt_update WT_UPDATE;
union __wt_rand_state;
@@ -285,6 +289,8 @@ union __wt_rand_state;
#endif
#include "hardware.h"
+#include "queue.h"
+
#ifdef _WIN32
#include "os_windows.h"
#else
@@ -330,6 +336,7 @@ union __wt_rand_state;
#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
+#include "log.i"
#include "mutex.i" /* required by btree.i */
#include "txn.i" /* required by btree.i */
diff --git a/src/log/log.c b/src/log/log.c
index 4242571fe53..4041761d062 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -34,6 +34,24 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
}
/*
+ * __wt_log_ckpt_lsn --
+ * Force out buffered records and return an LSN for checkpoint.
+ */
+int
+__wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ WT_RET(__wt_log_force_write(session, 1));
+ WT_RET(__wt_log_wrlsn(session));
+ *ckp_lsn = log->write_start_lsn;
+ return (0);
+}
+
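__wt_log_ckpt_lsn forces the active slot out, runs the write-LSN processing, and hands back write_start_lsn; the WT_LOG_CMP-to-__wt_log_cmp conversions throughout the rest of this file all rely on the same ordering, file number first and offset second. An illustrative comparator (the field types are assumptions, not the real WT_LSN layout):

#include <stdint.h>

struct lsn {
        uint32_t file;          /* log file number */
        uint64_t offset;        /* byte offset within the file */
};

static int
lsn_cmp(const struct lsn *a, const struct lsn *b)
{
        if (a->file != b->file)
                return (a->file < b->file ? -1 : 1);
        if (a->offset != b->offset)
                return (a->offset < b->offset ? -1 : 1);
        return (0);
}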
+/*
* __wt_log_background --
* Record the given LSN as the background LSN and signal the
* thread as needed.
@@ -53,7 +71,7 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn)
* needed.
*/
__wt_spin_lock(session, &log->log_sync_lock);
- if (WT_LOG_CMP(lsn, &log->bg_sync_lsn) > 0)
+ if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0)
log->bg_sync_lsn = *lsn;
__wt_spin_unlock(session, &log->log_sync_lock);
return (__wt_cond_signal(session, conn->log_file_cond));
@@ -100,7 +118,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
/*
* Sync the log file if needed.
*/
- if (WT_LOG_CMP(&log->sync_lsn, min_lsn) < 0) {
+ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_force_sync: sync to LSN %d/%lu",
min_lsn->file, min_lsn->offset));
@@ -241,6 +259,11 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
log = S2C(session)->log;
*maxid = 0;
+ /*
+ * These may be files needed by backup. Force the current slot
+ * to get written to the file.
+ */
+ WT_RET(__wt_log_force_write(session, 1));
WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
/* Filter out any files that are below the checkpoint LSN. */
@@ -354,70 +377,12 @@ static int
__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
{
WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
- return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
-}
-
-/*
- * __log_acquire --
- * Called with the log slot lock held. Can be called recursively
- * from __wt_log_newfile when we change log files.
- */
-static int
-__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
-{
- WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- int created_log;
conn = S2C(session);
log = conn->log;
- created_log = 1;
- /*
- * Called locked. Add recsize to alloc_lsn. Save our starting LSN
- * where the previous allocation finished for the release LSN.
- * That way when log files switch, we're waiting for the correct LSN
- * from outstanding writes.
- */
- slot->slot_release_lsn = log->alloc_lsn;
- if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
- WT_RET(__wt_log_newfile(session, 0, &created_log));
- if (log->log_close_fh != NULL)
- F_SET(slot, WT_SLOT_CLOSEFH);
- }
-
- /*
- * Checkpoints can be configured based on amount of log written.
- * Add in this log record to the sum and if needed, signal the
- * checkpoint condition. The logging subsystem manages the
- * accumulated field. There is a bit of layering violation
- * here checking the connection ckpt field and using its
- * condition.
- */
- if (WT_CKPT_LOGSIZE(conn)) {
- log->log_written += (wt_off_t)recsize;
- WT_RET(__wt_checkpoint_signal(session, log->log_written));
- }
-
- /*
- * Need to minimally fill in slot info here. Our slot start LSN
- * comes after any potential new log file creations.
- */
- slot->slot_start_lsn = log->alloc_lsn;
- slot->slot_start_offset = log->alloc_lsn.offset;
- /*
- * Pre-allocate on the first real write into the log file, if it
- * was just created (i.e. not pre-allocated).
- */
- if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
- WT_RET(__log_prealloc(session, log->log_fh));
-
- log->alloc_lsn.offset += (wt_off_t)recsize;
- slot->slot_end_lsn = log->alloc_lsn;
- slot->slot_error = 0;
- slot->slot_fh = log->log_fh;
- return (0);
+ return (lsn->offset == WT_LOG_FIRST_RECORD ||
+ lsn->offset + (wt_off_t)recsize < conn->log_file_max);
}
/*
@@ -490,24 +455,32 @@ __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
*/
static int
__log_fill(WT_SESSION_IMPL *session,
- WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+ WT_MYSLOT *myslot, int force, WT_ITEM *record, WT_LSN *lsnp)
{
WT_DECL_RET;
WT_LOG_RECORD *logrec;
+ /*
+ * The WT_LOG_SLOT_BUF_MAX macro uses log.
+ */
logrec = (WT_LOG_RECORD *)record->mem;
/*
- * Call __wt_write. For now the offset is the real byte offset. If the
- * offset becomes a unit of WT_LOG_ALIGN this is where we would multiply
- * by WT_LOG_ALIGN to get the real file byte offset for write().
+ * Call __wt_write or copy into the buffer. For now the offset is the
+ * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this
+ * is where we would multiply by WT_LOG_ALIGN to get the real file byte
+ * offset for write().
*/
- if (direct)
+ if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+ logrec, logrec->len);
+ else
+ /*
+ * If this is a force or unbuffered write, write it now.
+ * A forced write sends in a temporary, local slot.
+ */
WT_ERR(__wt_write(session, myslot->slot->slot_fh,
myslot->offset + myslot->slot->slot_start_offset,
(size_t)logrec->len, (void *)logrec));
- else
- memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
- logrec, logrec->len);
WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
if (lsnp != NULL) {
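In the rewritten __log_fill, an ordinary joiner only copies its record into the slot's shared buffer at the offset it claimed when joining; forced and unbuffered records bypass the buffer and go straight to __wt_write. A toy of the buffered branch (the structure is a stand-in, not the real WT_LOGSLOT):

#include <stdint.h>
#include <string.h>

struct toy_slot {
        uint8_t *buf;           /* shared consolidation buffer */
};

static void
toy_fill(struct toy_slot *slot, uint64_t my_offset,
    const void *rec, uint32_t len)
{
        /* One later write of the whole buffer replaces many small writes. */
        memcpy(slot->buf + my_offset, rec, len);
}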
@@ -563,13 +536,13 @@ __log_file_header(
logrec->checksum = 0;
logrec->checksum = __wt_cksum(logrec, log->allocsize);
WT_CLEAR(tmp);
+ memset(&myslot, 0, sizeof(myslot));
myslot.slot = &tmp;
- myslot.offset = 0;
/*
- * We may recursively call __log_acquire to allocate log space for the
- * log descriptor record. Call __log_fill to write it, but we
- * do not need to call __log_release because we're not waiting for
+ * We may recursively call __wt_log_acquire to allocate log space for
+ * the log descriptor record. Call __log_fill to write it, but we
+ * do not need to call __wt_log_release because we're not waiting for
* any earlier operations to complete.
*/
if (prealloc) {
@@ -577,7 +550,7 @@ __log_file_header(
tmp.slot_fh = fh;
} else {
WT_ASSERT(session, fh == NULL);
- WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__wt_log_acquire(session, logrec->len, &tmp));
}
WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
/*
@@ -697,6 +670,146 @@ err: __wt_scr_free(session, &from_path);
}
/*
+ * __log_newfile --
+ * Create the next log file and write the file header record into it.
+ */
+static int
+__log_newfile(WT_SESSION_IMPL *session, int conn_open, int *created)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LSN end_lsn;
+ int create_log, yield_cnt;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ create_log = 1;
+ yield_cnt = 0;
+ /*
+ * Set aside the log file handle to be closed later. Other threads
+ * may still be using it to write to the log. If the log file size
+ * is small we could fill a log file before the previous one is closed.
+ * Wait for that to close.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ while (log->log_close_fh != NULL) {
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ WT_RET(__wt_log_wrlsn(session));
+ if (++yield_cnt > 10000)
+ return (EBUSY);
+ __wt_yield();
+ }
+ log->log_close_fh = log->log_fh;
+ if (log->log_close_fh != NULL)
+ log->log_close_lsn = log->alloc_lsn;
+ log->fileid++;
+ /*
+ * Make sure everything we set above is visible.
+ */
+ WT_FULL_BARRIER();
+ /*
+ * If we're pre-allocating log files, look for one. If there aren't any
+ * or we're not pre-allocating, then create one.
+ */
+ if (conn->log_prealloc) {
+ ret = __log_alloc_prealloc(session, log->fileid);
+ /*
+ * If ret is 0 it means we found a pre-allocated file.
+ * If ret is non-zero but not WT_NOTFOUND, we return the error.
+ * If ret is WT_NOTFOUND, we leave create_log set and create
+ * the new log file.
+ */
+ if (ret == 0)
+ create_log = 0;
+ /*
+ * If we get any error other than WT_NOTFOUND, return it.
+ */
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ ret = 0;
+ }
+ /*
+ * If we need to create the log file, do so now.
+ */
+ if (create_log) {
+ log->prep_missed++;
+ WT_RET(__wt_log_allocfile(
+ session, log->fileid, WT_LOG_FILENAME, 1));
+ }
+ WT_RET(__log_openfile(session,
+ 0, &log->log_fh, WT_LOG_FILENAME, log->fileid));
+ /*
+ * We need to set up the LSNs. Set the end LSN and alloc LSN to
+ * the end of the header.
+ */
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
+ end_lsn = log->alloc_lsn;
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_open) {
+ WT_RET(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = end_lsn;
+ log->write_lsn = end_lsn;
+ log->write_start_lsn = end_lsn;
+ }
+ if (created != NULL)
+ *created = create_log;
+ return (0);
+}
+
+/*
+ * __wt_log_acquire --
+ * Called serially when switching slots. Can be called recursively
+ * from __log_newfile when we change log files.
+ */
+int
+__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int created_log;
+
+ conn = S2C(session);
+ log = conn->log;
+ created_log = 1;
+ /*
+ * Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ /*
+ * We need to set the release LSN earlier, before a log file change.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__log_newfile(session, 0, &created_log));
+ if (log->log_close_fh != NULL)
+ F_SET(slot, WT_SLOT_CLOSEFH);
+ }
+
+ /*
+ * Pre-allocate on the first real write into the log file, if it
+ * was just created (i.e. not pre-allocated).
+ */
+ if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
+ WT_RET(__log_prealloc(session, log->log_fh));
+ /*
+ * Initialize the slot for activation.
+ */
+ __wt_log_slot_activate(session, slot);
+
+ return (0);
+}
+
+/*
* __log_truncate --
* Truncate the log to the given LSN. If this_log is set, it will only
* truncate the log file indicated in the given LSN. If not set,
@@ -791,7 +904,7 @@ __wt_log_allocfile(
*/
WT_RET(__wt_scr_alloc(session, 0, &from_path));
WT_ERR(__wt_scr_alloc(session, 0, &to_path));
- tmp_id = WT_ATOMIC_ADD4(log->tmp_fileid, 1);
+ tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1);
WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path));
WT_ERR(__log_filename(session, lognum, dest, to_path));
/*
@@ -842,7 +955,7 @@ err: __wt_scr_free(session, &path);
* __wt_log_open --
* Open the appropriate log file for the connection. The purpose is
* to find the last log file that exists, open it and set our initial
- * LSNs to the end of that file. If none exist, call __wt_log_newfile
+ * LSNs to the end of that file. If none exist, call __log_newfile
* to create it.
*/
int
@@ -917,7 +1030,9 @@ __wt_log_open(WT_SESSION_IMPL *session)
* Start logging at the beginning of the next log file, no matter
* where the previous log file ends.
*/
- WT_ERR(__wt_log_newfile(session, 1, NULL));
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __log_newfile(session, 1, NULL));
+ WT_ERR(ret);
/* If we found log files, save the new state. */
if (logcount > 0) {
@@ -1055,48 +1170,67 @@ err:
}
/*
- * __log_release --
+ * __wt_log_release --
* Release a log slot.
*/
-static int
-__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
+int
+__wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LOG *log;
WT_LSN sync_lsn;
- size_t write_size;
- int locked, yield_count;
+ int locked, need_relock, yield_count;
+ int64_t release_buffered, release_bytes;
conn = S2C(session);
log = conn->log;
- locked = yield_count = 0;
- *freep = 1;
+ locked = need_relock = yield_count = 0;
+ if (freep != NULL)
+ *freep = 1;
+ release_buffered =
+ WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ release_bytes = release_buffered + slot->slot_unbuffered;
/* Write the buffered records */
- if (F_ISSET(slot, WT_SLOT_BUFFERED)) {
- write_size = (size_t)
- (slot->slot_end_lsn.offset - slot->slot_start_offset);
- WT_ERR(__wt_write(session, slot->slot_fh,
- slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ /*
+ * Checkpoints can be configured based on amount of log written.
+ * Add in this log record to the sum and if needed, signal the
+ * checkpoint condition. The logging subsystem manages the
+ * accumulated field. There is a bit of layering violation
+ * here checking the connection ckpt field and using its
+ * condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)release_bytes;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
}
+ if (release_buffered != 0)
+ WT_ERR(__wt_write(session,
+ slot->slot_fh, slot->slot_start_offset,
+ (size_t)release_buffered, slot->slot_buf.mem));
+
/*
- * If this is not a buffered write, meaning the slot we have is a
- * dummy constructed slot, not from the slot pool, or we have to wait
- * for a synchronous operation, we do not pass handling of this slot
- * off to the worker thread. The caller is responsible for freeing
- * the slot in that case. Otherwise the worker thread will free it.
+ * If we have to wait for a synchronous operation, we do not pass
+ * handling of this slot off to the worker thread. The caller is
+ * responsible for freeing the slot in that case. Otherwise the
+ * worker thread will free it.
*/
- if (F_ISSET(slot, WT_SLOT_BUFFERED) &&
- !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
- *freep = 0;
+ if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ if (freep != NULL)
+ *freep = 0;
slot->slot_state = WT_LOG_SLOT_WRITTEN;
/*
* After this point the worker thread owns the slot. There
* is nothing more to do but return.
*/
- WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
+ /*
+ * !!! Signalling the wrlsn_cond condition here results in
+ * worse performance because it causes more scheduling churn
+ * and more walking of the slot pool for a very small number
+ * of slots to process. Don't signal here.
+ */
goto done;
}
@@ -1105,15 +1239,31 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* be holes in the log file.
*/
WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn);
- while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ /*
+ * If we're on a locked path and the write LSN is not advancing,
+ * unlock in case an earlier thread is trying to switch its
+ * slot and complete its operation.
+ */
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) {
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ need_relock = 1;
+ }
if (++yield_count < 1000)
__wt_yield();
else
WT_ERR(__wt_cond_wait(
session, log->log_write_cond, 200));
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) {
+ __wt_spin_lock(session, &log->log_slot_lock);
+ need_relock = 0;
+ }
}
+
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
+
+ WT_ASSERT(session, slot != log->active_slot);
WT_ERR(__wt_cond_signal(session, log->log_write_cond));
/*
@@ -1168,7 +1318,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* Sync the log file if needed.
*/
if (F_ISSET(slot, WT_SLOT_SYNC) &&
- WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_release: sync log %s", log->log_fh->name));
WT_STAT_FAST_CONN_INCR(session, log_sync);
@@ -1186,6 +1336,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
}
err: if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
+ if (need_relock)
+ __wt_spin_lock(session, &log->log_slot_lock);
if (ret != 0 && slot->slot_error == 0)
slot->slot_error = ret;
done:
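WT_LOG_SLOT_RELEASED_BUFFERED and WT_LOG_SLOT_DONE hint at the new bookkeeping: a single atomically updated slot_state word tracks both the bytes joined to a slot and the bytes released back, so the thread releasing the last bytes knows the slot is ready to process. An illustrative packing, not the real log.h masks:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical split: joined count in the high half, released low. */
#define SLOT_JOINED(state)      ((uint32_t)((uint64_t)(state) >> 32))
#define SLOT_RELEASED(state)    ((uint32_t)(uint64_t)(state))

static bool
slot_done(int64_t state)
{
        /* The slot is finished once every joined byte was released. */
        return (SLOT_JOINED(state) == SLOT_RELEASED(state));
}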
@@ -1193,93 +1345,6 @@ done:
}
/*
- * __wt_log_newfile --
- * Create the next log file and write the file header record into it.
- */
-int
-__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created)
-{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- WT_LSN end_lsn;
- int create_log;
-
- conn = S2C(session);
- log = conn->log;
-
- create_log = 1;
- /*
- * Set aside the log file handle to be closed later. Other threads
- * may still be using it to write to the log. If the log file size
- * is small we could fill a log file before the previous one is closed.
- * Wait for that to close.
- */
- while (log->log_close_fh != NULL) {
- WT_STAT_FAST_CONN_INCR(session, log_close_yields);
- WT_RET(__wt_log_wrlsn(session, NULL, NULL));
- __wt_yield();
- }
- log->log_close_fh = log->log_fh;
- log->fileid++;
-
- /*
- * If we're pre-allocating log files, look for one. If there aren't any
- * or we're not pre-allocating, then create one.
- */
- ret = 0;
- if (conn->log_prealloc) {
- ret = __log_alloc_prealloc(session, log->fileid);
- /*
- * If ret is 0 it means we found a pre-allocated file.
- * If ret is non-zero but not WT_NOTFOUND, we return the error.
- * If ret is WT_NOTFOUND, we leave create_log set and create
- * the new log file.
- */
- if (ret == 0)
- create_log = 0;
- /*
- * If we get any error other than WT_NOTFOUND, return it.
- */
- if (ret != 0 && ret != WT_NOTFOUND)
- return (ret);
- ret = 0;
- }
- /*
- * If we need to create the log file, do so now.
- */
- if (create_log) {
- log->prep_missed++;
- if ((ret = __wt_log_allocfile(
- session, log->fileid, WT_LOG_FILENAME, 0)) != 0)
- return (ret);
- }
- WT_RET(__log_openfile(session,
- 0, &log->log_fh, WT_LOG_FILENAME, log->fileid));
- /*
- * We need to setup the LSNs. Set the end LSN and alloc LSN to
- * the end of the header.
- */
- log->alloc_lsn.file = log->fileid;
- log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
- end_lsn = log->alloc_lsn;
-
- /*
- * If we're called from connection creation code, we need to update
- * the LSNs since we're the only write in progress.
- */
- if (conn_create) {
- WT_RET(__wt_fsync(session, log->log_fh));
- log->sync_lsn = end_lsn;
- log->write_lsn = end_lsn;
- log->write_start_lsn = end_lsn;
- }
- if (created != NULL)
- *created = create_log;
- return (0);
-}
-
-/*
* __wt_log_scan --
* Scan the logs, calling a function on each record found.
*/
@@ -1535,7 +1600,7 @@ advance:
/* Truncate if we're in recovery. */
if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0)
WT_ERR(__log_truncate(session,
&rd_lsn, WT_LOG_FILENAME, 0));
@@ -1559,43 +1624,20 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans);
}
/*
- * __log_direct_write --
- * Write a log record without using the consolidation arrays.
+ * __wt_log_force_write --
+ * Force a switch and release and write of the current slot.
+ * Wrapper function that takes the lock.
*/
-static int
-__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
- uint32_t flags)
+int
+__wt_log_force_write(WT_SESSION_IMPL *session, int retry)
{
- WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT tmp;
WT_MYSLOT myslot;
- int dummy, locked;
log = S2C(session)->log;
- myslot.slot = &tmp;
- myslot.offset = 0;
- dummy = 0;
- WT_CLEAR(tmp);
-
- /* Fast path the contended case. */
- if (__wt_spin_trylock(session, &log->log_slot_lock) != 0)
- return (EAGAIN);
- locked = 1;
-
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
- F_SET(&tmp, WT_SLOT_SYNC_DIR);
- if (LF_ISSET(WT_LOG_FSYNC))
- F_SET(&tmp, WT_SLOT_SYNC);
- WT_ERR(__log_acquire(session, record->size, &tmp));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
- WT_ERR(__log_release(session, &tmp, &dummy));
-
-err: if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
- return (ret);
+ memset(&myslot, 0, sizeof(myslot));
+ myslot.slot = log->active_slot;
+ return (__wt_log_slot_switch(session, &myslot, retry, 1));
}
/*
@@ -1741,14 +1783,16 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_LOG_RECORD *logrec;
WT_LSN lsn;
WT_MYSLOT myslot;
- uint32_t rdup_len;
- int free_slot, locked;
+ int64_t release_size;
+ uint32_t force, rdup_len;
+ int free_slot;
conn = S2C(session);
log = conn->log;
- free_slot = locked = 0;
+ free_slot = 0;
WT_INIT_LSN(&lsn);
myslot.slot = NULL;
+ memset(&myslot, 0, sizeof(myslot));
/*
* Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
* header at the beginning for us to fill in.
@@ -1778,87 +1822,67 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_STAT_FAST_CONN_INCR(session, log_writes);
- if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
- ret = __log_direct_write(session, record, &lsn, flags);
- if (ret == 0 && lsnp != NULL)
- *lsnp = lsn;
- /*
- * All needed syncing will be handled directly except
- * a background sync. Handle that here.
- */
- if (ret == 0) {
- if (LF_ISSET(WT_LOG_BACKGROUND))
- goto bg;
- else
- return (0);
- }
- if (ret != EAGAIN)
- WT_ERR(ret);
- /*
- * An EAGAIN return means we failed to get the try lock -
- * fall through to the consolidation code in that case.
- */
- }
-
+ __wt_log_slot_join(session, rdup_len, flags, &myslot);
+ /*
+ * If the addition of this record crosses the buffer boundary,
+ * switch in a new slot.
+ */
+ force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC);
+ ret = 0;
+ if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX ||
+ F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force)
+ ret = __wt_log_slot_switch(session, &myslot, 1, 0);
+ if (ret == 0)
+ ret = __log_fill(session, &myslot, 0, record, &lsn);
+ release_size = __wt_log_slot_release(
+ session, &myslot, (int64_t)rdup_len);
/*
- * As soon as we see contention for the log slot, disable direct
- * log writes. We get better performance by forcing writes through
- * the consolidation code. This is because individual writes flood
- * the I/O system faster than they contend on the log slot lock.
+	 * If we get an error, we still need to do proper accounting in
+ * the slot fields.
+ * XXX On error we may still need to call release and free.
*/
- F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
- if ((ret = __wt_log_slot_join(
- session, rdup_len, flags, &myslot)) == ENOMEM) {
+ if (ret != 0)
+ myslot.slot->slot_error = ret;
+ WT_ASSERT(session, ret == 0);
+ if (WT_LOG_SLOT_DONE(release_size)) {
+ WT_ERR(__wt_log_release(session, myslot.slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, myslot.slot);
+ } else if (force) {
/*
- * If we couldn't find a consolidated slot for this record
- * write the record directly.
+ * If we are going to wait for this slot to get written,
+ * signal the wrlsn thread.
+ *
+	 * XXX I've seen times when the condition variable is NULL.
*/
- while ((ret = __log_direct_write(
- session, record, lsnp, flags)) == EAGAIN)
- ;
- WT_ERR(ret);
- return (0);
+ if (conn->log_cond != NULL) {
+ WT_ERR(__wt_cond_signal(session, conn->log_cond));
+ __wt_yield();
+ } else
+ WT_ERR(__wt_log_force_write(session, 1));
}
- WT_ERR(ret);
- if (myslot.offset == 0) {
- __wt_spin_lock(session, &log->log_slot_lock);
- locked = 1;
- WT_ERR(__wt_log_slot_close(session, myslot.slot));
- WT_ERR(__log_acquire(
- session, myslot.slot->slot_group_size, myslot.slot));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__wt_log_slot_notify(session, myslot.slot));
- } else
- WT_ERR(__wt_log_slot_wait(session, myslot.slot));
- WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
- if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
- WT_ERR(__log_release(session, myslot.slot, &free_slot));
- if (free_slot)
- WT_ERR(__wt_log_slot_free(session, myslot.slot));
+ if (LF_ISSET(WT_LOG_FLUSH)) {
+ /* Wait for our writes to reach the OS */
+ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_write_cond, 10000);
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
- while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
(void)__wt_cond_wait(
session, log->log_sync_cond, 10000);
- } else if (LF_ISSET(WT_LOG_FLUSH)) {
- /* Wait for our writes to reach the OS */
- while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
- (void)__wt_cond_wait(
- session, log->log_write_cond, 10000);
}
/*
* Advance the background sync LSN if needed.
*/
-bg: if (LF_ISSET(WT_LOG_BACKGROUND) &&
- WT_LOG_CMP(&session->bg_sync_lsn, &lsn) <= 0)
+ if (LF_ISSET(WT_LOG_BACKGROUND) &&
+ __wt_log_cmp(&session->bg_sync_lsn, &lsn) <= 0)
WT_ERR(__wt_log_background(session, &lsn));
-err: if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
+err:
if (ret == 0 && lsnp != NULL)
*lsnp = lsn;
/*
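
The rewritten write path above — join the active slot, switch if the buffer is
exceeded, fill, then release — is the heart of the change. The consolidation
idea can be shown in a self-contained toy (a sketch with invented names, not
WiredTiger code): threads reserve disjoint buffer ranges with one atomic add,
copy with no lock held, and the last thread to release flushes. It works here
only because every payload has the same, known length; the real code closes
the slot to fix the total.

	/* Build: cc -std=c11 -pthread demo.c */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define NTHREADS	4
	#define PAYLOAD		"record "

	static char buf[4096];
	static _Atomic uint32_t joined;		/* bytes reserved */
	static _Atomic uint32_t released;	/* bytes copied */

	static void *
	writer(void *arg)
	{
		uint32_t len, off;

		(void)arg;
		len = (uint32_t)strlen(PAYLOAD);

		/* Join: atomically reserve our range of the buffer. */
		off = atomic_fetch_add(&joined, len);

		/* Fill: threads copy into disjoint ranges, no lock needed. */
		memcpy(buf + off, PAYLOAD, len);

		/* Release: the last thread to finish flushes the buffer. */
		if (atomic_fetch_add(&released, len) + len == NTHREADS * len)
			printf("flush %u bytes: %.*s\n",
			    (unsigned)(NTHREADS * len),
			    (int)(NTHREADS * len), buf);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid[NTHREADS];
		int i;

		for (i = 0; i < NTHREADS; i++)
			(void)pthread_create(&tid[i], NULL, writer, NULL);
		for (i = 0; i < NTHREADS; i++)
			(void)pthread_join(tid[i], NULL);
		return (0);
	}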
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 0b580af4526..216a594ce3d 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -9,325 +9,486 @@
#include "wt_internal.h"
/*
- * This file implements the consolidated array algorithm as described in
- * the paper:
- * Scalability of write-ahead logging on multicore and multisocket hardware
- * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis
- * and Anastasia Ailamaki.
- *
- * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can
- * be found at:
- * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf
+ * __wt_log_slot_activate --
+ * Initialize a slot to become active.
*/
-
-/*
- * __wt_log_slot_init --
- * Initialize the slot array.
- */
-int
-__wt_log_slot_init(WT_SESSION_IMPL *session)
+void
+__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *slot;
- int32_t i;
conn = S2C(session);
log = conn->log;
- for (i = 0; i < WT_SLOT_POOL; i++) {
- log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
- log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX;
- }
- /*
- * Set up the available slots from the pool the first time.
- */
- for (i = 0; i < WT_SLOT_ACTIVE; i++) {
- slot = &log->slot_pool[i];
- slot->slot_index = (uint32_t)i;
- slot->slot_state = WT_LOG_SLOT_READY;
- log->slot_array[i] = slot;
- }
-
- /*
- * Allocate memory for buffers now that the arrays are setup. Split
- * this out to make error handling simpler.
- *
- * Cap the slot buffer to the log file size.
- */
- log->slot_buf_size =
- WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE);
- for (i = 0; i < WT_SLOT_POOL; i++) {
- WT_ERR(__wt_buf_init(session,
- &log->slot_pool[i].slot_buf, log->slot_buf_size));
- F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
- }
- WT_STAT_FAST_CONN_INCRV(session,
- log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
- if (0) {
-err: while (--i >= 0)
- __wt_buf_free(session, &log->slot_pool[i].slot_buf);
- }
- return (ret);
+ slot->slot_state = 0;
+ slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ slot->slot_last_offset = log->alloc_lsn.offset;
+ slot->slot_fh = log->log_fh;
+ slot->slot_error = 0;
+ slot->slot_unbuffered = 0;
}
/*
- * __wt_log_slot_destroy --
- * Clean up the slot array on shutdown.
+ * __wt_log_slot_close --
+ * Close out the slot the caller is using. The slot may already be
+ * closed or freed by another thread.
*/
int
-__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+__wt_log_slot_close(
+ WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- int i;
+ int64_t end_offset, new_state, old_state;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
log = conn->log;
-
- for (i = 0; i < WT_SLOT_POOL; i++)
- __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ if (releasep != NULL)
+ *releasep = 0;
+ if (slot == NULL)
+ return (WT_NOTFOUND);
+retry:
+ old_state = slot->slot_state;
+ /*
+ * If this close is coming from a forced close and a thread is in
+ * the middle of using the slot, return EBUSY. The caller can
+ * decide if retrying is necessary or not.
+ */
+ if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
+ return (EBUSY);
+ /*
+	 * If someone else is switching out this slot we lost the race;
+	 * there is nothing to do but return. Return WT_NOTFOUND anytime
+	 * the given slot was
+ * processed by another closing thread. Only return 0 when we
+ * actually closed the slot.
+ */
+ if (WT_LOG_SLOT_CLOSED(old_state))
+ return (WT_NOTFOUND);
+ /*
+ * If someone completely processed this slot, we're done.
+ */
+ if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED))
+ return (WT_NOTFOUND);
+ new_state = (old_state | WT_LOG_SLOT_CLOSE);
+ /*
+ * Close this slot. If we lose the race retry.
+ */
+ if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
+ goto retry;
+ /*
+ * We own the slot now. No one else can join.
+ * Set the end LSN.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
+ if (WT_LOG_SLOT_DONE(new_state) && releasep != NULL)
+ *releasep = 1;
+ slot->slot_end_lsn = slot->slot_start_lsn;
+ end_offset =
+ WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
+ slot->slot_end_lsn.offset += (wt_off_t)end_offset;
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_slot_consolidated, end_offset);
+ /*
+ * XXX Would like to change so one piece of code advances the LSN.
+ */
+ log->alloc_lsn = slot->slot_end_lsn;
+ WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file);
return (0);
}
/*
- * __wt_log_slot_join --
- * Join a consolidated logging slot. Callers should be prepared to deal
- * with a ENOMEM return - which indicates no slots could accommodate
- * the log record.
+ * __log_slot_switch_internal --
+ * Switch out the current slot and set up a new one.
*/
-int
-__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
- uint32_t flags, WT_MYSLOT *myslotp)
+static int
+__log_slot_switch_internal(
+ WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int forced)
{
- WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
WT_LOGSLOT *slot;
- int64_t new_state, old_state;
- uint32_t allocated_slot, slot_attempts;
+ int free_slot, release;
- conn = S2C(session);
- log = conn->log;
- slot_attempts = 0;
+ log = S2C(session)->log;
+ release = 0;
+ slot = myslot->slot;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
- if (mysize >= (uint64_t)log->slot_buf_size) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
- return (ENOMEM);
- }
-find_slot:
-#if WT_SLOT_ACTIVE == 1
- allocated_slot = 0;
-#else
- allocated_slot = __wt_random(&session->rnd) % WT_SLOT_ACTIVE;
-#endif
- /*
- * Get the selected slot. Use a barrier to prevent the compiler from
- * caching this read.
- */
- WT_BARRIER();
- slot = log->slot_array[allocated_slot];
-join_slot:
- /*
- * Read the current slot state. Use a barrier to prevent the compiler
- * from caching this read.
- */
- WT_BARRIER();
- old_state = slot->slot_state;
- /*
- * WT_LOG_SLOT_READY and higher means the slot is available for
- * joining. Any other state means it is in use and transitioning
- * from the active array.
- */
- if (old_state < WT_LOG_SLOT_READY) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_transitions);
- goto find_slot;
- }
/*
- * Add in our size to the state and then atomically swap that
- * into place if it is still the same value.
+ * If someone else raced us to closing this specific slot, we're
+ * done here.
*/
- new_state = old_state + (int64_t)mysize;
- if (new_state < old_state) {
- /* Our size doesn't fit here. */
- WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
- goto find_slot;
- }
+ if (slot != log->active_slot)
+ return (0);
+
/*
- * If the slot buffer isn't big enough to hold this update, try
- * to find another slot.
+ * If close returns WT_NOTFOUND, it means that someone else is
+	 * processing the slot change. However, we could be retrying
+	 * after an earlier busy return while creating a new slot. If
+	 * so, we are that someone else and we need to try setting up a
+	 * new slot again.
*/
- if (new_state > (int64_t)slot->slot_buf.memsize) {
- if (++slot_attempts > 5) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
- return (ENOMEM);
+ if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) {
+ ret = __wt_log_slot_close(
+ session, slot, &release, forced);
+ if (ret == WT_NOTFOUND)
+ return (0);
+ WT_RET(ret);
+ if (release) {
+ WT_RET(__wt_log_release(session, slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, slot);
}
- goto find_slot;
}
/*
- * We lost a race to add our size into this slot. Check the state
- * and try again.
+	 * Record that we have closed this slot: we may be called here
+	 * multiple times if creating a new slot has to be retried.
*/
- if (!WT_ATOMIC_CAS8(slot->slot_state, old_state, new_state)) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_races);
- goto join_slot;
- }
- WT_ASSERT(session, myslotp != NULL);
+ F_SET(myslot, WT_MYSLOT_CLOSE);
+ WT_RET(__wt_log_slot_new(session));
+ F_CLR(myslot, WT_MYSLOT_CLOSE);
+ return (0);
+}
+
+/*
+ * __wt_log_slot_switch --
+ * Switch out the current slot and set up a new one.
+ */
+int
+__wt_log_slot_switch(
+ WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int retry, int forced)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ log = S2C(session)->log;
/*
- * We joined this slot. Fill in our information to return to
- * the caller.
+ * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the
+ * compiler does not like it combined directly with the while loop
+ * here.
*/
- WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
- F_SET(slot, WT_SLOT_SYNC_DIR);
- if (LF_ISSET(WT_LOG_FSYNC))
- F_SET(slot, WT_SLOT_SYNC);
- myslotp->slot = slot;
- myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY;
- return (0);
+ do {
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __log_slot_switch_internal(
+ session, myslot, forced));
+ if (ret == EBUSY) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy);
+ __wt_yield();
+ }
+ } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY));
+ return (ret);
}
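
The wrapper above encodes a small, reusable pattern: take the lock, attempt
the operation, and on EBUSY drop the lock, yield, and retry. Stripped of the
WiredTiger specifics it looks like this (try_op and op_lock are stand-ins for
the example, not real interfaces):

	#include <errno.h>
	#include <pthread.h>
	#include <sched.h>

	static pthread_mutex_t op_lock = PTHREAD_MUTEX_INITIALIZER;

	extern int try_op(void);	/* returns 0 on success or EBUSY */

	/* Take the lock, try the operation, retry on EBUSY if asked to. */
	static int
	op_with_retry(int retry)
	{
		int ret;

		do {
			pthread_mutex_lock(&op_lock);
			ret = try_op();
			pthread_mutex_unlock(&op_lock);
			if (ret == EBUSY)
				sched_yield();	/* let the busy thread make progress */
		} while (retry && ret == EBUSY);
		return (ret);
	}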
/*
- * __log_slot_find_free --
- * Find and return a free log slot.
+ * __wt_log_slot_new --
+ * Find a free slot and switch it as the new active slot.
+ * Must be called holding the slot lock.
*/
-static int
-__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot)
+int
+__wt_log_slot_new(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- uint32_t pool_i;
+ WT_LOGSLOT *slot;
+ int32_t i;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
log = conn->log;
- WT_ASSERT(session, slot != NULL);
/*
- * Encourage processing and moving the write LSN forward.
- * That process has to walk the slots anyway, so do that
- * work and let it give us the index of a free slot along
- * the way.
+	 * Although this function runs single threaded (the slot lock is
+	 * held), multiple threads could be trying to set a new active
+	 * slot sequentially. If we find an
+ * active slot that is valid, return.
*/
- WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
- while (pool_i == WT_SLOT_POOL) {
+ if ((slot = log->active_slot) != NULL &&
+ WT_LOG_SLOT_OPEN(slot->slot_state))
+ return (0);
+
+ /*
+ * Keep trying until we can find a free slot.
+ */
+ for (;;) {
+ /*
+			 * For now just restart the search at 0. We could
+			 * resume from log->pool_index if restarting proves
+			 * inefficient.
+ */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (slot->slot_state == WT_LOG_SLOT_FREE) {
+ /*
+ * Make sure that the next buffer size can
+ * fit in the file. Proactively switch if
+ * it cannot. This reduces, but does not
+ * eliminate, log files that exceed the
+ * maximum file size.
+ *
+ * We want to minimize the risk of an
+ * error due to no space.
+ */
+ WT_RET(__wt_log_acquire(session,
+ log->slot_buf_size, slot));
+ /*
+ * We have a new, free slot to use.
+ * Set it as the active slot.
+ */
+ WT_STAT_FAST_CONN_INCR(session,
+ log_slot_transitions);
+ log->active_slot = slot;
+ return (0);
+ }
+ }
+ /*
+ * If we didn't find any free slots signal the worker thread.
+ */
+ (void)__wt_cond_signal(session, conn->log_wrlsn_cond);
__wt_yield();
- WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
}
- *slot = &log->slot_pool[pool_i];
- WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE);
- return (0);
+ /* NOTREACHED */
}
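
The forever-loop above scans a fixed pool for a free slot and, when none is
available, wakes the helper thread and yields before rescanning. A standalone
sketch of that shape follows; note the real function runs under the slot lock
and can test slot_state directly, while this lock-free version needs a CAS
(pool, wake_helper and the states are invented for the example):

	#include <sched.h>
	#include <stdatomic.h>

	#define POOL_SIZE	128
	#define STATE_FREE	0
	#define STATE_ACTIVE	1

	static _Atomic int pool[POOL_SIZE];

	extern void wake_helper(void);	/* stand-in for the wrlsn signal */

	/* Claim a free pool entry, waking the helper when none are free. */
	static int
	pool_claim(void)
	{
		int expected, i;

		for (;;) {
			for (i = 0; i < POOL_SIZE; i++) {
				expected = STATE_FREE;
				if (atomic_compare_exchange_strong(
				    &pool[i], &expected, STATE_ACTIVE))
					return (i);
			}
			wake_helper();	/* the helper frees entries */
			sched_yield();
		}
		/* NOTREACHED */
	}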
/*
- * __wt_log_slot_close --
- * Close a slot and do not allow any other threads to join this slot.
- * Remove this from the active slot array and move a new slot from
- * the pool into its place. Set up the size of this group;
- * Must be called with the logging spinlock held.
+ * __wt_log_slot_init --
+ * Initialize the slot array.
*/
int
-__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__wt_log_slot_init(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *newslot;
- int64_t old_state;
+ WT_LOGSLOT *slot;
+ int32_t i;
conn = S2C(session);
log = conn->log;
- /*
- * Find an unused slot in the pool.
- */
- WT_RET(__log_slot_find_free(session, &newslot));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool);
+ for (i = 0; i < WT_SLOT_POOL; i++)
+ log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
/*
- * Swap out the slot we're going to use and put a free one in the
- * slot array in its place so that threads can use it right away.
+ * Allocate memory for buffers now that the arrays are setup. Split
+ * this out to make error handling simpler.
*/
- WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
- newslot->slot_state = WT_LOG_SLOT_READY;
- newslot->slot_index = slot->slot_index;
- log->slot_array[newslot->slot_index] = newslot;
- old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
- slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
/*
- * Note that this statistic may be much bigger than in reality,
- * especially when compared with the total bytes written in
- * __log_fill. The reason is that this size reflects any
- * rounding up that is needed and the total bytes in __log_fill
- * is the amount of user bytes.
+	 * Cap the slot buffer to a fraction of the maximum log file size
+	 * if needed; we want each slot buffer to stay small relative to
+	 * the log file.
+ *
+ * !!! If the buffer size is too close to the log file size, we will
+ * switch log files very aggressively. Scale back the buffer for
+ * small log file sizes.
*/
+ log->slot_buf_size = (uint32_t)WT_MIN(
+ (size_t)conn->log_file_max/10, WT_LOG_SLOT_BUF_SIZE);
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, log->slot_buf_size));
+ F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
+ }
WT_STAT_FAST_CONN_INCRV(session,
- log_slot_consolidated, (uint64_t)slot->slot_group_size);
- return (0);
+ log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
+ /*
+ * Set up the available slot from the pool the first time.
+ */
+ slot = &log->slot_pool[0];
+ /*
+ * We cannot initialize the release LSN in the activate function
+ * because that is called after a log file switch.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ __wt_log_slot_activate(session, slot);
+ log->active_slot = slot;
+
+ if (0) {
+err: while (--i >= 0)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
+ return (ret);
}
/*
- * __wt_log_slot_notify --
- * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE.
+ * __wt_log_slot_destroy --
+ * Clean up the slot array on shutdown.
*/
int
-__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
{
- WT_UNUSED(session);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t rel;
+ int i;
- slot->slot_state =
- (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size;
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+	 * Write out any remaining buffered data, then free each buffer.
+ */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (!FLD64_ISSET(
+ (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) {
+ rel = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ if (rel != 0)
+ WT_RET(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, (size_t)rel,
+ slot->slot_buf.mem));
+ }
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
return (0);
}
/*
- * __wt_log_slot_wait --
- * Wait for slot leader to allocate log area and tell us our log offset.
+ * __wt_log_slot_join --
+ * Join a consolidated logging slot. Must be called with
+ * the read lock held.
*/
-int
-__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+void
+__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
+ uint32_t flags, WT_MYSLOT *myslot)
{
- int yield_count;
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t flag_state, new_state, old_state, released;
+ int32_t join_offset, new_join;
+#ifdef HAVE_DIAGNOSTIC
+ int unbuf_force;
+#endif
- yield_count = 0;
- WT_UNUSED(session);
+ conn = S2C(session);
+ log = conn->log;
- while (slot->slot_state > WT_LOG_SLOT_DONE)
- if (++yield_count < 1000)
- __wt_yield();
- else
- __wt_sleep(0, 200);
- return (0);
+ /*
+	 * Make sure the length cannot overflow. The caller should not
+	 * even call this function if the record doesn't fit; it should
+	 * use direct writes instead.
+ */
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+
+ /*
+ * There should almost always be a slot open.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ unbuf_force = ((++log->write_calls % 1000) == 0);
+#endif
+ for (;;) {
+ WT_BARRIER();
+ slot = log->active_slot;
+ old_state = slot->slot_state;
+ /*
+ * Try to join our size into the existing size and
+ * atomically write it back into the state.
+ */
+ flag_state = WT_LOG_SLOT_FLAGS(old_state);
+ released = WT_LOG_SLOT_RELEASED(old_state);
+ join_offset = WT_LOG_SLOT_JOINED(old_state);
+#ifdef HAVE_DIAGNOSTIC
+ if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) {
+#else
+ if (mysize > WT_LOG_SLOT_BUF_MAX) {
+#endif
+ new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
+ F_SET(myslot, WT_MYSLOT_UNBUFFERED);
+ myslot->slot = slot;
+ } else
+ new_join = join_offset + (int32_t)mysize;
+ new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
+ (int64_t)new_join, (int64_t)released, (int64_t)flag_state);
+
+ /*
+ * Check if the slot is open for joining and we are able to
+ * swap in our size into the state.
+ */
+ if (WT_LOG_SLOT_OPEN(old_state) &&
+ __wt_atomic_casiv64(
+ &slot->slot_state, old_state, new_state))
+ break;
+ /*
+ * The slot is no longer open or we lost the race to
+ * update it. Yield and try again.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_races);
+ __wt_yield();
+ }
+ /*
+ * We joined this slot. Fill in our information to return to
+ * the caller.
+ */
+ if (mysize != 0)
+ WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
+ F_SET(slot, WT_SLOT_SYNC_DIR);
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(slot, WT_SLOT_SYNC);
+ if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
+ WT_ASSERT(session, slot->slot_unbuffered == 0);
+ WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered);
+ slot->slot_unbuffered = (int64_t)mysize;
+ }
+ myslot->slot = slot;
+ myslot->offset = join_offset;
+ myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}
/*
* __wt_log_slot_release --
* Each thread in a consolidated group releases its portion to
- * signal it has completed writing its piece of the log.
+ * signal it has completed copying its piece of the log into
+ * the memory buffer.
*/
int64_t
-__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
+__wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
{
- int64_t newsize;
+ WT_LOGSLOT *slot;
+ wt_off_t cur_offset, my_start;
+ int64_t my_size, rel_size;
+ WT_UNUSED(session);
+ slot = myslot->slot;
+ my_start = slot->slot_start_offset + myslot->offset;
+ while ((cur_offset = slot->slot_last_offset) < my_start) {
+ /*
+ * Set our offset if we are larger.
+ */
+ if (__wt_atomic_casiv64(
+ &slot->slot_last_offset, cur_offset, my_start))
+ break;
+ /*
+ * If we raced another thread updating this, try again.
+ */
+ WT_BARRIER();
+ }
/*
- * Add my size into the state. When it reaches WT_LOG_SLOT_DONE
- * all participatory threads have completed copying their piece.
+ * Add my size into the state and return the new size.
*/
- newsize = WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size);
- return (newsize);
+ rel_size = size;
+ if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ rel_size = WT_LOG_SLOT_UNBUFFERED;
+ my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
+ return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
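
Join and release both update the single slot_state word, which is what lets
__wt_log_slot_join run without the slot lock. A simplified model of that
packing — joined count in the high 32 bits, released count in the low 32 — is
sketched below; the real WT_LOG_SLOT_JOIN_REL encoding also reserves bits for
flags such as WT_LOG_SLOT_CLOSE, so treat the layout here as illustrative
only:

	#include <stdatomic.h>
	#include <stdint.h>

	/* Illustrative layout: joined count high 32 bits, released low 32. */
	#define SLOT_JOINED(state)	((int32_t)((uint64_t)(state) >> 32))
	#define SLOT_RELEASED(state)	((int32_t)((uint64_t)(state) & 0xffffffffu))
	#define SLOT_JOIN(size)		((int64_t)(size) << 32)

	static _Atomic int64_t slot_state;

	/* Reserve size bytes; the old joined count is our buffer offset. */
	static int32_t
	slot_join(int32_t size)
	{
		int64_t new_state, old_state;

		old_state = atomic_load(&slot_state);
		do {
			new_state = old_state + SLOT_JOIN(size);
		} while (!atomic_compare_exchange_weak(
		    &slot_state, &old_state, new_state));
		return (SLOT_JOINED(old_state));
	}

	/*
	 * Signal that our copy is complete; when the released count catches
	 * up with the joined count (after the slot is closed), the slot's
	 * buffer can be written out.
	 */
	static int64_t
	slot_release(int32_t size)
	{
		return (atomic_fetch_add(&slot_state, (int64_t)size) + size);
	}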
/*
* __wt_log_slot_free --
* Free a slot back into the pool.
*/
-int
+void
__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
- WT_UNUSED(session);
/*
* Make sure flags don't get retained between uses.
 * We have to reset them here because multiple threads may
* change the flags when joining the slot.
*/
+ WT_UNUSED(session);
slot->flags = WT_SLOT_INIT_FLAGS;
+ slot->slot_error = 0;
slot->slot_state = WT_LOG_SLOT_FREE;
- return (0);
}
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 84b8d5c9532..6068bb6c559 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -134,7 +134,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
if (have_primary) {
WT_ENTER_PAGE_INDEX(session);
WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
- ovfl = __wt_btree_lsm_size(session, hard_limit ?
+ ovfl = __wt_btree_lsm_over_size(session, hard_limit ?
2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
WT_LEAVE_PAGE_INDEX(session);
@@ -1066,12 +1066,12 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
ret = __wt_bloom_hash_get(bloom, &bhash);
if (ret == WT_NOTFOUND) {
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, bloom_miss);
+ WT_LSM_TREE_STAT_INCR(
+ session, clsm->lsm_tree->bloom_miss);
continue;
} else if (ret == 0)
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, bloom_hit);
+ WT_LSM_TREE_STAT_INCR(
+ session, clsm->lsm_tree->bloom_hit);
WT_ERR(ret);
}
c->set_key(c, &cursor->key);
@@ -1086,11 +1086,11 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
F_CLR(c, WT_CURSTD_KEY_SET);
/* Update stats: the active chunk can't have a bloom filter. */
if (bloom != NULL)
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, bloom_false_positive);
+ WT_LSM_TREE_STAT_INCR(session,
+ clsm->lsm_tree->bloom_false_positive);
else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, lsm_lookup_no_bloom);
+ WT_LSM_TREE_STAT_INCR(session,
+ clsm->lsm_tree->lsm_lookup_no_bloom);
}
WT_ERR(WT_NOTFOUND);
@@ -1331,12 +1331,12 @@ __clsm_put(WT_SESSION_IMPL *session,
++clsm->update_count >= 100) &&
lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
clsm->update_count = 0;
- WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
- lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
+ WT_LSM_TREE_STAT_INCRV(session,
+ lsm_tree->lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
WT_STAT_FAST_CONN_INCRV(session,
lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
- WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
- lsm_merge_throttle, lsm_tree->merge_throttle);
+ WT_LSM_TREE_STAT_INCRV(session,
+ lsm_tree->lsm_merge_throttle, lsm_tree->merge_throttle);
WT_STAT_FAST_CONN_INCRV(session,
lsm_merge_throttle, lsm_tree->merge_throttle);
__wt_sleep(0,
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index cb078d991d8..6c59232b619 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -258,7 +258,7 @@ __wt_lsm_manager_free_work_unit(
if (entry != NULL) {
WT_ASSERT(session, entry->lsm_tree->queue_ref > 0);
- (void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_sub32(&entry->lsm_tree->queue_ref, 1);
__wt_free(session, entry);
}
}
@@ -273,7 +273,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LSM_MANAGER *manager;
- WT_LSM_WORK_UNIT *current, *next;
+ WT_LSM_WORK_UNIT *current;
WT_SESSION *wt_session;
uint32_t i;
uint64_t removed;
@@ -297,23 +297,17 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
manager->lsm_worker_cookies[0].tid = 0;
/* Release memory from any operations left on the queue. */
- for (current = TAILQ_FIRST(&manager->switchqh);
- current != NULL; current = next) {
- next = TAILQ_NEXT(current, q);
+ while ((current = TAILQ_FIRST(&manager->switchqh)) != NULL) {
TAILQ_REMOVE(&manager->switchqh, current, q);
++removed;
__wt_lsm_manager_free_work_unit(session, current);
}
- for (current = TAILQ_FIRST(&manager->appqh);
- current != NULL; current = next) {
- next = TAILQ_NEXT(current, q);
+ while ((current = TAILQ_FIRST(&manager->appqh)) != NULL) {
TAILQ_REMOVE(&manager->appqh, current, q);
++removed;
__wt_lsm_manager_free_work_unit(session, current);
}
- for (current = TAILQ_FIRST(&manager->managerqh);
- current != NULL; current = next) {
- next = TAILQ_NEXT(current, q);
+ while ((current = TAILQ_FIRST(&manager->managerqh)) != NULL) {
TAILQ_REMOVE(&manager->managerqh, current, q);
++removed;
__wt_lsm_manager_free_work_unit(session, current);
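
The loops above are simplified from a for-loop that pre-fetched the next
element into the while-TAILQ_FIRST idiom, which is safe precisely because each
iteration removes the head. A minimal standalone version (struct and queue
names invented for the example):

	#include <stdlib.h>
	#include <sys/queue.h>

	struct unit {
		TAILQ_ENTRY(unit) q;
	};
	TAILQ_HEAD(unitq, unit);

	/* Drain a queue by always removing the head until it is empty. */
	static void
	drain(struct unitq *qh)
	{
		struct unit *current;

		while ((current = TAILQ_FIRST(qh)) != NULL) {
			TAILQ_REMOVE(qh, current, q);
			free(current);
		}
	}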
@@ -645,9 +639,9 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
* on close, the flag is cleared and then the queue reference count
* is checked.
*/
- (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1);
if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
- (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
return (0);
}
@@ -674,6 +668,6 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
return (0);
err:
if (!pushed)
- (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
return (ret);
}
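
The WT_ATOMIC_ADD4/WT_ATOMIC_SUB4 macros give way to typed functions such as
__wt_atomic_add32 that take a pointer to the value. The get-then-check pattern
used by __wt_lsm_manager_push_entry — take a reference, re-check the ACTIVE
flag, and drop the reference if the tree is shutting down — can be modelled
with C11 atomics (a sketch, not the WiredTiger implementation):

	#include <stdatomic.h>
	#include <stdint.h>

	struct tree {
		_Atomic uint32_t queue_ref;	/* queued work referencing us */
		int active;			/* cleared on shutdown */
	};

	/*
	 * Take a queue reference, then re-check the active flag: if the
	 * tree is shutting down, give the reference back and tell the
	 * caller not to queue the work.
	 */
	static int
	tree_queue_get(struct tree *t)
	{
		atomic_fetch_add(&t->queue_ref, 1);
		if (!t->active) {
			atomic_fetch_sub(&t->queue_ref, 1);
			return (0);
		}
		return (1);
	}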
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index d7e684b8f51..01a61359949 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -398,7 +398,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
locked = 0;
/* Allocate an ID for the merge. */
- dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ dest_id = __wt_atomic_add32(&lsm_tree->last, 1);
/*
* We only want to do the chunk loop if we're running with verbose,
@@ -493,7 +493,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
* merge_syncing field so that compact knows it is still in
* progress.
*/
- (void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
+ (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
in_sync = 1;
/*
* We've successfully created the new chunk. Now install it. We need
@@ -512,7 +512,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
* Don't block if the cache is full: our next unit of work may be to
* discard some trees to free space.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_NO_EVICTION);
if (create_bloom) {
if (ret == 0)
@@ -544,7 +544,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
WT_TRET(dest->close(dest));
dest = NULL;
++lsm_tree->merge_progressing;
- (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
in_sync = 0;
WT_ERR_NOTFOUND_OK(ret);
@@ -600,7 +600,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
err: if (locked)
WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
if (in_sync)
- (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
if (src != NULL)
WT_TRET(src->close(src));
if (dest != NULL)
@@ -632,6 +632,6 @@ err: if (locked)
"Merge failed with %s",
__wt_strerror(session, ret, NULL, 0)));
}
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
return (ret);
}
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index 126a59af0d1..2817ec9eeb7 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -22,6 +22,7 @@ __curstat_lsm_init(
WT_DSRC_STATS *new, *stats;
WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
+ int64_t bloom_count;
u_int i;
int locked;
char config[64];
@@ -49,25 +50,22 @@ __curstat_lsm_init(
cfg[1] = disk_cfg[1] = config;
}
- /*
- * Set the cursor to reference the data source statistics; we don't
- * initialize it, instead we copy (rather than aggregate), the first
- * chunk's statistics, which has the same effect.
- */
- stats = &cst->u.dsrc_stats;
-
/* Hold the LSM lock so that we can safely walk through the chunks. */
WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
locked = 1;
- /* Initialize the statistics. */
- __wt_stat_init_dsrc_stats(stats);
+ /*
+ * Set the cursor to reference the data source statistics into which
+ * we're going to aggregate statistics from the underlying objects.
+ */
+ stats = &cst->u.dsrc_stats;
+ __wt_stat_dsrc_init_single(stats);
/*
* For each chunk, aggregate its statistics, as well as any associated
* bloom filter statistics, into the total statistics.
*/
- for (i = 0; i < lsm_tree->nchunks; i++) {
+ for (bloom_count = 0, i = 0; i < lsm_tree->nchunks; i++) {
chunk = lsm_tree->chunk[i];
/*
@@ -93,17 +91,17 @@ __curstat_lsm_init(
* top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- WT_STAT_SET(new, lsm_generation_max, chunk->generation);
+ new->lsm_generation_max = chunk->generation;
/* Aggregate statistics from each new chunk. */
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
continue;
/* Maintain a count of bloom filters. */
- WT_STAT_INCR(&lsm_tree->stats, bloom_count);
+ ++bloom_count;
/* Get the bloom filter's underlying object. */
WT_ERR(__wt_buf_fmt(
@@ -117,24 +115,39 @@ __curstat_lsm_init(
* into the top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- WT_STAT_SET(new,
- bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8);
- WT_STAT_SET(new, bloom_page_evict,
- WT_STAT(new, cache_eviction_clean) +
- WT_STAT(new, cache_eviction_dirty));
- WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read));
-
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ new->bloom_size =
+ (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8);
+ new->bloom_page_evict =
+ new->cache_eviction_clean + new->cache_eviction_dirty;
+ new->bloom_page_read = new->cache_read;
+
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
/* Set statistics that aren't aggregated directly into the cursor */
- WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
+ stats->bloom_count = bloom_count;
+ stats->lsm_chunk_count = lsm_tree->nchunks;
- /* Aggregate, and optionally clear, LSM-level specific information. */
- __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats);
+ /* Include, and optionally clear, LSM-level specific information. */
+ stats->bloom_miss = lsm_tree->bloom_miss;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->bloom_miss = 0;
+ stats->bloom_hit = lsm_tree->bloom_hit;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->bloom_hit = 0;
+ stats->bloom_false_positive = lsm_tree->bloom_false_positive;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->bloom_false_positive = 0;
+ stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->lsm_lookup_no_bloom = 0;
+ stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->lsm_checkpoint_throttle = 0;
+ stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle;
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
- __wt_stat_refresh_dsrc_stats(&lsm_tree->stats);
+ lsm_tree->lsm_merge_throttle = 0;
__wt_curstat_dsrc_final(cst);
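
Each copy-then-conditionally-zero pair above is an instance of a clear-on-read
pattern; if the repetition grew further it could be folded into a helper macro
along these lines (a sketch — WiredTiger spells each statistic out
explicitly):

	/* Copy a counter into the snapshot, optionally zeroing the source. */
	#define STAT_COPY_MAYBE_CLEAR(dst, src, field, clear) do {	\
		(dst)->field = (src)->field;				\
		if (clear)						\
			(src)->field = 0;				\
	} while (0)

	/*
	 * Usage, mirroring the code above:
	 * STAT_COPY_MAYBE_CLEAR(stats, lsm_tree, bloom_hit,
	 *     F_ISSET(cst, WT_CONN_STAT_CLEAR));
	 */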
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 6c6b185f821..46db76e099c 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -141,7 +141,7 @@ __wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
* is no need to decrement the reference count since discard
* is unconditional.
*/
- (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
WT_TRET(__lsm_tree_close(session, lsm_tree));
WT_TRET(__lsm_tree_discard(session, lsm_tree, 1));
}
@@ -486,15 +486,17 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
* Make sure we win the race to switch on the
* exclusive flag.
*/
- if (!WT_ATOMIC_CAS1(lsm_tree->exclusive, 0, 1))
+ if (!__wt_atomic_cas8(
+ &lsm_tree->exclusive, 0, 1))
return (EBUSY);
/* Make sure there are no readers */
- if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
+ if (!__wt_atomic_cas32(
+ &lsm_tree->refcnt, 0, 1)) {
lsm_tree->exclusive = 0;
return (EBUSY);
}
} else {
- (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
/*
* We got a reference, check if an exclusive
@@ -503,8 +505,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
if (lsm_tree->exclusive) {
WT_ASSERT(session,
lsm_tree->refcnt > 0);
- (void)WT_ATOMIC_SUB4(
- lsm_tree->refcnt, 1);
+ (void)__wt_atomic_sub32(
+ &lsm_tree->refcnt, 1);
return (EBUSY);
}
}
@@ -565,7 +567,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/* Start the LSM manager thread if it isn't running. */
- if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
+ if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1))
WT_RET(__wt_lsm_manager_start(session));
/* Make sure no one beat us to it. */
@@ -596,7 +598,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
* with getting handles exclusive.
*/
lsm_tree->refcnt = 1;
- lsm_tree->exclusive = (int8_t)exclusive;
+ lsm_tree->exclusive = exclusive ? 1 : 0;
lsm_tree->queue_ref = 0;
/* Set a flush timestamp as a baseline. */
@@ -644,7 +646,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ASSERT(session, lsm_tree->refcnt > 0);
if (lsm_tree->exclusive)
lsm_tree->exclusive = 0;
- (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
}
/* How aggressively to ramp up or down throttle due to level 0 merging */
@@ -839,7 +841,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
/* Update the throttle time. */
__wt_lsm_tree_throttle(session, lsm_tree, 0);
- new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ new_id = __wt_atomic_add32(&lsm_tree->last, 1);
WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
nchunks + 1, &lsm_tree->chunk));
@@ -1097,7 +1099,7 @@ __wt_lsm_tree_truncate(
/* Create the new chunk. */
WT_ERR(__wt_calloc_one(session, &chunk));
- chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ chunk->id = __wt_atomic_add32(&lsm_tree->last, 1);
WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
/* Mark all chunks old. */
@@ -1142,7 +1144,7 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
return (0);
}
@@ -1155,7 +1157,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_DECL_RET;
- F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
@@ -1175,7 +1177,7 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
return (0);
}
@@ -1188,7 +1190,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_DECL_RET;
- F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0)
WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
@@ -1207,7 +1209,8 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
WT_LSM_TREE *lsm_tree;
time_t begin, end;
uint64_t progress;
- int i, compacting, flushing, locked, ref;
+ uint32_t i;
+ int compacting, flushing, locked, ref;
compacting = flushing = locked = ref = 0;
chunk = NULL;
@@ -1282,7 +1285,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
* If we have a chunk, we want to look for it to be on-disk.
* So we need to add a reference to keep it available.
*/
- (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ (void)__wt_atomic_add32(&chunk->refcnt, 1);
ref = 1;
}
@@ -1330,7 +1333,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
"Start compacting progress %" PRIu64,
name, chunk->id,
lsm_tree->merge_progressing));
- (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
flushing = ref = 0;
compacting = 1;
F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
@@ -1384,7 +1387,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
err:
/* Ensure anything we set is cleared. */
if (ref)
- (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
if (compacting) {
F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
lsm_tree->merge_aggressiveness = 0;
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index c3bee162ea1..8eba0127b8b 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -53,7 +53,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session,
* it's safe.
*/
for (i = 0; i < nchunks; i++)
- (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1);
+ (void)__wt_atomic_add32(&cookie->chunk_array[i]->refcnt, 1);
err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
@@ -122,7 +122,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
force ? " w/ force" : "",
i, lsm_tree->nchunks, chunk->uri));
- (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ (void)__wt_atomic_add32(&chunk->refcnt, 1);
}
err: WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));
@@ -145,7 +145,7 @@ __lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie)
if (cookie->chunk_array[i] == NULL)
continue;
WT_ASSERT(session, cookie->chunk_array[i]->refcnt > 0);
- (void)WT_ATOMIC_SUB4(cookie->chunk_array[i]->refcnt, 1);
+ (void)__wt_atomic_sub32(&cookie->chunk_array[i]->refcnt, 1);
}
/* Ensure subsequent calls don't double decrement. */
cookie->nchunks = 0;
@@ -223,7 +223,7 @@ __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* See if we win the race to switch on the "busy" flag and
* recheck that the chunk still needs a Bloom filter.
*/
- if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) {
+ if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) {
if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
ret = __lsm_bloom_create(
session, lsm_tree, chunk, (u_int)i);
@@ -301,17 +301,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* Flush the file before checkpointing: this is the expensive part in
* terms of I/O.
*
- * Use the special eviction isolation level to avoid interfering with
- * an application checkpoint: we have already checked that all of the
- * updates in this chunk are globally visible.
- *
- * !!! We can wait here for checkpoints and fsyncs to complete, which
- * can be a long time.
+ * !!!
+ * We can wait here for checkpoints and fsyncs to complete, which can
+ * take a long time.
*/
if ((ret = __wt_session_get_btree(
session, chunk->uri, NULL, NULL, 0)) == 0) {
+ /*
+ * Set read-uncommitted: we have already checked that all of the
+		 * updates in this chunk are globally visible; use the cheapest
+ * possible check in reconciliation.
+ */
saved_isolation = session->txn.isolation;
- session->txn.isolation = WT_ISO_EVICTION;
+ session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
session->txn.isolation = saved_isolation;
WT_TRET(__wt_session_release_btree(session));
@@ -412,7 +414,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
* ourselves to get stuck creating bloom filters, the entire tree
* can stall since there may be no worker threads available to flush.
*/
- F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
WT_ERR(src->get_key(src, &key));
WT_ERR(__wt_bloom_insert(bloom, &key));
@@ -446,7 +448,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
err: if (bloom != NULL)
WT_TRET(__wt_bloom_close(bloom));
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
return (ret);
}
@@ -528,7 +530,7 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Make sure only a single thread is freeing the old chunk array
* at any time.
*/
- if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1))
+ if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1))
return (0);
/*
* Take a copy of the current state of the LSM tree and look for chunks
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index 8ed4a117641..3add3155e17 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -65,7 +65,7 @@ __lsm_worker_general_op(
ret = __wt_lsm_checkpoint_chunk(
session, entry->lsm_tree, chunk);
WT_ASSERT(session, chunk->refcnt > 0);
- (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
WT_ERR(ret);
}
} else if (entry->type == WT_LSM_WORK_DROP)
diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c
index 6d08ce3aa6a..315621f2ae9 100644
--- a/src/meta/meta_apply.c
+++ b/src/meta/meta_apply.c
@@ -32,7 +32,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session,
WT_ERR(cursor->get_key(cursor, &uri));
if (!WT_PREFIX_MATCH(uri, "file:"))
break;
- else if (strcmp(uri, WT_METAFILE_URI) == 0)
+ if (strcmp(uri, WT_METAFILE_URI) == 0)
continue;
/*
diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c
index 227d0fa9a6c..8255f004dab 100644
--- a/src/meta/meta_table.c
+++ b/src/meta/meta_table.c
@@ -12,22 +12,22 @@
* __metadata_turtle --
* Return if a key's value should be taken from the turtle file.
*/
-static int
+static bool
__metadata_turtle(const char *key)
{
switch (key[0]) {
case 'f':
if (strcmp(key, WT_METAFILE_URI) == 0)
- return (1);
+ return (true);
break;
case 'W':
if (strcmp(key, "WiredTiger version") == 0)
- return (1);
+ return (true);
if (strcmp(key, "WiredTiger version string") == 0)
- return (1);
+ return (true);
break;
}
- return (0);
+ return (false);
}
/*
@@ -37,6 +37,8 @@ __metadata_turtle(const char *key)
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
+ WT_BTREE *btree;
+
if (session->meta_dhandle != NULL)
return (0);
@@ -45,7 +47,24 @@ __wt_metadata_open(WT_SESSION_IMPL *session)
session->meta_dhandle = session->dhandle;
WT_ASSERT(session, session->meta_dhandle != NULL);
- /* The meta_dhandle doesn't need to stay locked -- release it. */
+ /*
+ * Set special flags for the metadata file: eviction (the metadata file
+ * is in-memory and never evicted), logging (the metadata file is always
+ * logged if possible).
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_IN_MEMORY))
+ F_SET(btree, WT_BTREE_IN_MEMORY);
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ if (F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_CLR(btree, WT_BTREE_NO_LOGGING);
+
+ /* The metadata handle doesn't need to stay locked -- release it. */
return (__wt_session_release_btree(session));
}
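
Testing each flag before setting it, as above, means re-opens that find the
bits already in the desired state never store to the shared flags word at all,
so they cannot race with concurrent readers of that word. The guard reduces to
this (a sketch with invented names):

	#include <stdint.h>

	/* Only dirty the shared flags word when the bit actually changes. */
	static inline void
	flag_set_if_clear(uint32_t *flagsp, uint32_t flag)
	{
		if ((*flagsp & flag) == 0)
			*flagsp |= flag;
	}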
@@ -59,9 +78,9 @@ __wt_metadata_cursor(
{
WT_DATA_HANDLE *saved_dhandle;
WT_DECL_RET;
+ int is_dead;
const char *cfg[] =
{ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL };
- int is_dead;
saved_dhandle = session->dhandle;
WT_ERR(__wt_metadata_open(session));
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
index 4d04f9ac579..eb2482723ec 100644
--- a/src/os_posix/os_alloc.c
+++ b/src/os_posix/os_alloc.c
@@ -58,7 +58,9 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
WT_STAT_FAST_CONN_INCR(session, memory_allocation);
if ((p = calloc(number, size)) == NULL)
- WT_RET_MSG(session, __wt_errno(), "memory allocation");
+ WT_RET_MSG(session, __wt_errno(),
+ "memory allocation of %" WT_SIZET_FMT " bytes failed",
+ size * number);
*(void **)retp = p;
return (0);
@@ -100,7 +102,9 @@ __wt_realloc(WT_SESSION_IMPL *session,
}
if ((p = realloc(p, bytes_to_allocate)) == NULL)
- WT_RET_MSG(session, __wt_errno(), "memory allocation");
+ WT_RET_MSG(session, __wt_errno(),
+ "memory allocation of %" WT_SIZET_FMT " bytes failed",
+ bytes_to_allocate);
/*
* Clear the allocated memory -- an application might: allocate memory,
@@ -171,7 +175,9 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session,
if ((ret = posix_memalign(&newp,
S2C(session)->buffer_alignment,
bytes_to_allocate)) != 0)
- WT_RET_MSG(session, ret, "memory allocation");
+ WT_RET_MSG(session, ret,
+ "memory allocation of %" WT_SIZET_FMT
+ " bytes failed", bytes_to_allocate);
if (p != NULL)
memcpy(newp, p, bytes_allocated);
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index dfd72dd0cd2..7946b4ab0cc 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -41,11 +41,13 @@ err: __wt_free(session, cond);
}
/*
- * __wt_cond_wait --
- * Wait on a mutex, optionally timing out.
+ * __wt_cond_wait_signal --
+ *	Wait on a mutex, optionally timing out. If we are signalled
+ *	before the timeout expires, let the caller know.
*/
int
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait_signal(
+ WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled)
{
struct timespec ts;
WT_DECL_RET;
@@ -54,7 +56,8 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
locked = 0;
/* Fast path if already signalled. */
- if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ *signalled = 1;
+ if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
return (0);
/*
@@ -88,10 +91,12 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
#ifdef ETIME
ret == ETIME ||
#endif
- ret == ETIMEDOUT)
+ ret == ETIMEDOUT) {
+ *signalled = 0;
ret = 0;
+ }
- (void)WT_ATOMIC_SUB4(cond->waiters, 1);
+ (void)__wt_atomic_subi32(&cond->waiters, 1);
err: if (locked)
WT_TRET(pthread_mutex_unlock(&cond->mtx));
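
The new interface reports how the wait ended: *signalled starts at 1 and is
cleared only on timeout. In plain pthreads the same contract looks like this
(a sketch, not WiredTiger's code; the caller is assumed to hold mtx, as
pthread_cond_timedwait requires):

	#include <errno.h>
	#include <pthread.h>
	#include <stdint.h>
	#include <time.h>

	/*
	 * Wait up to usecs on cond; *signalled is cleared only if the
	 * wait timed out. The caller must hold mtx.
	 */
	static int
	cond_wait_signal(pthread_cond_t *cond, pthread_mutex_t *mtx,
	    uint64_t usecs, int *signalled)
	{
		struct timespec ts;
		int ret;

		*signalled = 1;
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += (time_t)(usecs / 1000000);
		ts.tv_nsec += (long)(usecs % 1000000) * 1000;
		if (ts.tv_nsec >= 1000000000) {	/* normalize the timespec */
			ts.tv_nsec -= 1000000000;
			++ts.tv_sec;
		}
		ret = pthread_cond_timedwait(cond, mtx, &ts);
		if (ret == ETIMEDOUT) {
			*signalled = 0;
			ret = 0;	/* a timeout is not an error here */
		}
		return (ret);
	}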
@@ -124,7 +129,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
if (cond->waiters == -1)
return (0);
- if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ if (cond->waiters > 0 || !__wt_atomic_casi32(&cond->waiters, 0, -1)) {
WT_ERR(pthread_mutex_lock(&cond->mtx));
locked = 1;
WT_ERR(pthread_cond_broadcast(&cond->cond));
diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c
index cdd4f8a24e1..d47ab197643 100644
--- a/src/os_posix/os_mtx_rw.c
+++ b/src/os_posix/os_mtx_rw.c
@@ -38,6 +38,78 @@
* Joseph Seigh. Note that a similar (but not identical) algorithm was published
* by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
* Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ *
+ * The following is an explanation of this code. First, the underlying lock
+ * structure.
+ *
+ * struct {
+ * uint16_t writers; Now serving for writers
+ * uint16_t readers; Now serving for readers
+ * uint16_t users; Next available ticket number
+ * uint16_t __notused; Padding
+ * }
+ *
+ * First, imagine a store's 'take a number' ticket algorithm. A customer takes
+ * a unique ticket number and customers are served in ticket order. In the data
+ * structure, 'writers' is the next writer to be served, 'readers' is the next
+ * reader to be served, and 'users' is the next available ticket number.
+ *
+ * Next, consider exclusive (write) locks. The 'now serving' number for writers
+ * is 'writers'. To lock, 'take a number' and wait until that number is being
+ * served; more specifically, atomically copy and increment the current value of
+ * 'users', and then wait until 'writers' equals that copied number.
+ *
+ * Shared (read) locks are similar. Like writers, readers atomically get the
+ * next number available. However, instead of waiting for 'writers' to equal
+ * their number, they wait for 'readers' to equal their number.
+ *
+ * This has the effect of queuing lock requests in the order they arrive
+ * (incidentally avoiding starvation).
+ *
+ * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
+ * In the case of a reader, the 'readers' increment happens when the reader
+ * acquires the lock (to allow read-lock sharing), and the 'writers' increment
+ * happens when the reader releases the lock. In the case of a writer, both
+ * 'readers' and 'writers' are incremented when the writer releases the lock.
+ *
+ * For example, consider the following read (R) and write (W) lock requests:
+ *
+ * writers readers users
+ * 0 0 0
+ * R: ticket 0, readers match OK 0 1 1
+ * R: ticket 1, readers match OK 0 2 2
+ * R: ticket 2, readers match OK 0 3 3
+ * W: ticket 3, writers no match block 0 3 4
+ * R: ticket 2, unlock 1 3 4
+ * R: ticket 0, unlock 2 3 4
+ * R: ticket 1, unlock 3 3 4
+ * W: ticket 3, writers match OK 3 3 4
+ *
+ * Note the writer blocks until 'writers' equals its ticket number and it does
+ * not matter if readers unlock in order or not.
+ *
+ * Readers or writers entering the system after the write lock is queued block,
+ * and the next ticket holder (reader or writer) will unblock when the writer
+ * unlocks. An example, continuing from the last line of the above example:
+ *
+ * writers readers users
+ * W: ticket 3, writers match OK 3 3 4
+ * R: ticket 4, readers no match block 3 3 5
+ * R: ticket 5, readers no match block 3 3 6
+ * W: ticket 6, writers no match block 3 3 7
+ * W: ticket 3, unlock 4 4 7
+ * R: ticket 4, readers match OK 4 5 7
+ * R: ticket 5, readers match OK 4 6 7
+ *
+ * The 'users' field is a 2-byte value so the available ticket number wraps at
+ * 64K requests. If a thread's lock request is not granted until the 'users'
+ * field cycles and the same ticket is taken by another thread, we could grant
+ * a lock to two separate threads at the same time, and bad things happen: two
+ * writer threads or a reader thread and a writer thread would run in parallel,
+ * and lock waiters could be skipped if the unlocks race. This is unlikely: it
+ * only happens if a lock request is blocked by 64K other requests. The fix
+ * would be to grow the lock structure fields, but the largest atomic
+ * instruction we have is 8 bytes, so the structure has no room to grow.
*/
#include "wt_internal.h"
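
The scheme described above fits in a few dozen lines. The following
self-contained sketch uses C11 atomics and sched_yield where the real code
paces with WT_PAUSE and short sleeps; it is a model of the algorithm, not
WiredTiger's implementation (which packs the three counters into one 8-byte
word so the try-lock variants can CAS the whole lock at once):

	#include <sched.h>
	#include <stdatomic.h>
	#include <stdint.h>

	/* A self-contained model of the ticket lock; fields as above. */
	typedef struct {
		_Atomic uint16_t writers;	/* now serving for writers */
		_Atomic uint16_t readers;	/* now serving for readers */
		_Atomic uint16_t users;		/* next available ticket */
	} ticket_rwlock_t;

	static void
	rw_readlock(ticket_rwlock_t *l)
	{
		uint16_t ticket;

		/* Take a ticket, wait for the readers counter to reach it. */
		ticket = atomic_fetch_add(&l->users, 1);
		while (ticket != atomic_load(&l->readers))
			sched_yield();

		/* Let the next reader in at once: read locks are shared. */
		(void)atomic_fetch_add(&l->readers, 1);
	}

	static void
	rw_readunlock(ticket_rwlock_t *l)
	{
		/* Readers can unlock concurrently, so this must be atomic. */
		(void)atomic_fetch_add(&l->writers, 1);
	}

	static void
	rw_writelock(ticket_rwlock_t *l)
	{
		uint16_t ticket;

		ticket = atomic_fetch_add(&l->users, 1);
		while (ticket != atomic_load(&l->writers))
			sched_yield();
	}

	static void
	rw_writeunlock(ticket_rwlock_t *l)
	{
		/* Serve the next ticket holder, whether reader or writer. */
		(void)atomic_fetch_add(&l->readers, 1);
		(void)atomic_fetch_add(&l->writers, 1);
	}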
@@ -69,20 +141,31 @@ __wt_rwlock_alloc(
int
__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, users, writers;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- pad = l->s.pad;
- users = l->s.users;
- writers = l->s.writers;
- old = (pad << 48) + (users << 32) + (users << 16) + writers;
- new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ new = old = *l;
+
+ /*
+ * This read lock can only be granted if the lock was last granted to
+ * a reader and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this read lock).
+ */
+ if (old.s.readers != old.s.users)
+ return (EBUSY);
+
+ /*
+ * The replacement lock value is a result of allocating a new ticket and
+ * incrementing the reader value to match it.
+ */
+ new.s.readers = new.s.users = old.s.users + 1;
+ return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
@@ -93,8 +176,7 @@ int
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
int pause_cnt;
WT_RET(__wt_verbose(
@@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- for (pause_cnt = 0; val != l->s.readers;) {
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
+ */
+ ticket = __wt_atomic_fetch_add16(&l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.readers;) {
/*
* We failed to get the lock; pause before retrying and if we've
* paused enough, sleep so we don't burn CPU to no purpose. This
* situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources. Regardless,
- * don't sleep long, all we need is to schedule the other reader
- * threads to complete a few more instructions and increment the
- * reader count.
+ * system and we're thrashing on shared resources.
+ *
+	 * Don't sleep long when waiting on a read lock; hopefully we're
+	 * waiting on another reader to increment the reader count.
*/
if (++pause_cnt < 1000)
WT_PAUSE();
@@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
__wt_sleep(0, 10);
}
+ /*
+ * We're the only writer of the readers field, so the update does not
+ * need to be atomic.
+ */
++l->s.readers;
return (0);
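The blocking path reduced to a standalone sketch, again with C11 atomics; usleep() stands in for __wt_sleep() and the empty spin for WT_PAUSE(). The structure and names are illustrative.

#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>

struct tl {
	_Atomic uint16_t writers;	/* next writer ticket to grant */
	_Atomic uint16_t readers;	/* next reader ticket to grant */
	_Atomic uint16_t users;		/* next free ticket */
};

static void
readlock(struct tl *l)
{
	uint16_t ticket;
	int pause_cnt;

	/* Take a ticket; the 16-bit value wraps at 64K requests. */
	ticket = atomic_fetch_add(&l->users, 1);

	/* Spin, then sleep, until our ticket is the one being granted. */
	for (pause_cnt = 0; ticket != atomic_load(&l->readers);) {
		if (++pause_cnt < 1000)
			continue;	/* stands in for WT_PAUSE() */
		usleep(10);		/* stands in for __wt_sleep(0, 10) */
	}

	/*
	 * Let the next queued reader proceed as well. The diff uses a
	 * plain increment because only the granted thread writes the
	 * field; an atomic add keeps this sketch simple.
	 */
	atomic_fetch_add(&l->readers, 1);
}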
@@ -138,7 +229,12 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
l = &rwlock->rwlock;
- WT_ATOMIC_ADD2(l->s.writers, 1);
+
+ /*
+ * Increment the writers value (other readers are doing the same, make
+ * sure we don't race).
+ */
+ (void)__wt_atomic_add16(&l->s.writers, 1);
return (0);
}
@@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, readers, users;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
l = &rwlock->rwlock;
- pad = l->s.pad;
- readers = l->s.readers;
- users = l->s.users;
- old = (pad << 48) + (users << 32) + (readers << 16) + users;
- new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ old = new = *l;
+
+ /*
+ * This write lock can only be granted if the lock was last granted to
+ * a writer and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this write lock).
+ */
+ if (old.s.writers != old.s.users)
+ return (EBUSY);
+
+ /* The replacement lock value is a result of allocating a new ticket. */
+ ++new.s.users;
+ return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
@@ -174,23 +278,33 @@ int
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
+ int pause_cnt;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+ l = &rwlock->rwlock;
+
/*
- * Possibly wrap: if we have more than 64K lockers waiting, the count
- * of writers will wrap and two lockers will simultaneously be granted
- * the write lock.
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
*/
- l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- while (val != l->s.writers)
- WT_PAUSE();
+ ticket = __wt_atomic_fetch_add16(&l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.writers;) {
+ /*
+ * We failed to get the lock; pause before retrying and if we've
+ * paused enough, sleep so we don't burn CPU to no purpose. This
+ * situation happens if there are more threads than cores in the
+ * system and we're thrashing on shared resources.
+ */
+ if (++pause_cnt < 1000)
+ WT_PAUSE();
+ else
+ __wt_sleep(0, 10);
+ }
return (0);
}
@@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
copy = *l;
+ /*
+ * We're the only writer of the writers/readers fields, so the update
+ * does not need to be atomic; we have to update both values at the
+ * same time though, otherwise we'd potentially race with the thread
+ * next granted the lock.
+ *
+ * Use a memory barrier to ensure the compiler doesn't mess with these
+ * instructions and rework the code in a way that avoids the update as
+ * a unit.
+ */
WT_BARRIER();
++copy.s.writers;
++copy.s.readers;
- l->i.us = copy.i.us;
+ l->i.wr = copy.i.wr;
+
return (0);
}
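The paired writers/readers update as a standalone sketch, assuming the two grant fields share one 32-bit word (little-endian, writers in the low half) as in the lock layout above; an atomic store stands in for the barrier plus plain store.

#include <stdatomic.h>
#include <stdint.h>

/* The grant half of the lock word: writers and readers as one unit. */
typedef union {
	uint32_t wr;
	struct {
		uint16_t writers, readers;
	} s;
} grant_t;

static void
writeunlock(_Atomic uint32_t *wr)
{
	grant_t copy;

	copy.wr = atomic_load(wr);

	/* Grant the next ticket to both the next writer and reader... */
	++copy.s.writers;
	++copy.s.readers;

	/* ...and publish both fields in a single 32-bit store. */
	atomic_store(wr, copy.wr);
}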
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index 7a4f5fdb38d..ef4662aa369 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -53,7 +53,7 @@ __wt_open(WT_SESSION_IMPL *session,
hash = __wt_hash_city64(name, strlen(name));
bucket = hash % WT_HASH_ARRAY_SIZE;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) {
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) {
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -167,7 +167,7 @@ setupfh:
*/
matched = 0;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) {
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) {
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -177,7 +177,7 @@ setupfh:
}
if (!matched) {
WT_CONN_FILE_INSERT(conn, fh, bucket);
- (void)WT_ATOMIC_ADD4(conn->open_file_count, 1);
+ (void)__wt_atomic_add32(&conn->open_file_count, 1);
*fhp = fh;
}
__wt_spin_unlock(session, &conn->fh_lock);
@@ -213,6 +213,8 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
fh = *fhp;
*fhp = NULL;
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: close", fh->name));
+
__wt_spin_lock(session, &conn->fh_lock);
if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
__wt_spin_unlock(session, &conn->fh_lock);
@@ -222,7 +224,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
/* Remove from the list. */
bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_FILE_REMOVE(conn, fh, bucket);
- (void)WT_ATOMIC_SUB4(conn->open_file_count, 1);
+ (void)__wt_atomic_sub32(&conn->open_file_count, 1);
__wt_spin_unlock(session, &conn->fh_lock);
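For reference, the lookup-and-reference pattern these hunks implement, reduced to a standalone sketch over <sys/queue.h>; the fh structure and bucket count are illustrative.

#include <stdint.h>
#include <string.h>
#include <sys/queue.h>

#define N_BUCKETS 512			/* illustrative bucket count */

struct fh {
	const char *name;		/* file name, the hash key */
	unsigned ref;			/* reference count */
	TAILQ_ENTRY(fh) hashq;		/* bucket linkage, as in the diff */
};
TAILQ_HEAD(fh_bucket, fh);

static struct fh_bucket buckets[N_BUCKETS];

static void
fh_hash_init(void)
{
	unsigned i;

	for (i = 0; i < N_BUCKETS; ++i)
		TAILQ_INIT(&buckets[i]);
}

/* Find an open handle by name and take a reference, or return NULL. */
static struct fh *
fh_lookup(uint64_t hash, const char *name)
{
	struct fh *fh;

	TAILQ_FOREACH(fh, &buckets[hash % N_BUCKETS], hashq)
		if (strcmp(name, fh->name) == 0) {
			++fh->ref;
			return (fh);
		}
	return (NULL);
}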
diff --git a/src/os_posix/os_path.c b/src/os_posix/os_path.c
index 07b14b55b44..af28e1b3b56 100644
--- a/src/os_posix/os_path.c
+++ b/src/os_posix/os_path.c
@@ -12,10 +12,10 @@
* __wt_absolute_path --
* Return if a filename is an absolute path.
*/
-int
+bool
__wt_absolute_path(const char *path)
{
- return (path[0] == '/' ? 1 : 0);
+ return (path[0] == '/');
}
/*
diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c
index 3fc692d8755..96bbba9bab2 100644
--- a/src/os_posix/os_remove.c
+++ b/src/os_posix/os_remove.c
@@ -29,7 +29,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name)
* level should have closed it before removing.
*/
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(fh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
if (strcmp(name, fh->name) == 0)
break;
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c
index e4f24cdb44e..c7222aac6c4 100644
--- a/src/os_posix/os_thread.c
+++ b/src/os_posix/os_thread.c
@@ -19,7 +19,8 @@ __wt_thread_create(WT_SESSION_IMPL *session,
WT_DECL_RET;
/* Spawn a new thread of control. */
- if ((ret = pthread_create(tidret, NULL, func, arg)) == 0)
+ WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret);
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "pthread_create");
}
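WT_SYSCALL_RETRY's definition is not part of this diff; below is a plausible sketch of the pattern it names, retrying a call a bounded number of times on transient failures. The retried error values and iteration count are assumptions.

#include <errno.h>
#include <pthread.h>

#define SYSCALL_RETRY(call, ret) do {					\
	int __retry;							\
	for (__retry = 0; __retry < 10; ++__retry) {			\
		(ret) = (call);						\
		/* Retry only plausibly-transient failures. */		\
		if ((ret) != EAGAIN && (ret) != EBUSY &&		\
		    (ret) != EINTR)					\
			break;						\
	}								\
} while (0)

static int
thread_create(pthread_t *tidret, void *(*func)(void *), void *arg)
{
	int ret;

	/* pthread_create returns an error number, not -1 and errno. */
	SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret);
	return (ret);
}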
@@ -33,7 +34,8 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
{
WT_DECL_RET;
- if ((ret = pthread_join(tid, NULL)) == 0)
+ WT_SYSCALL_RETRY(pthread_join(tid, NULL), ret);
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "pthread_join");
diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c
index 097c73b5731..a9d3d521052 100644
--- a/src/os_win/os_errno.c
+++ b/src/os_win/os_errno.c
@@ -22,7 +22,7 @@ __wt_map_error_to_windows_error(int error) {
Also validate we do not get any COM errors
(which are negative integers)
*/
- WT_ASSERT(NULL, error > 0 && error > -(windows_error_offset));
+ WT_ASSERT(NULL, error < 0);
return (error + -(windows_error_offset));
}
@@ -96,7 +96,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen)
snprintf(errbuf, errlen, "%s", buf) > 0)
return (errbuf);
if (lasterror != 0 && session != NULL &&
- __wt_buf_set(session, &session->err, buf, strlen(buf)) == 0)
+ __wt_buf_fmt(session, &session->err, "%s", buf) == 0)
return (session->err.data);
}
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 51f6d6533c8..14ca5d61282 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -37,13 +37,15 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
}
/*
- * __wt_cond_wait --
- * Wait on a mutex, optionally timing out.
+ * __wt_cond_wait_signal --
+ * Wait on a mutex, optionally timing out. If we get it
+ * before the time out period expires, let the caller know.
*/
int
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait_signal(
+ WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled)
{
- DWORD milliseconds;
+ DWORD err, milliseconds;
WT_DECL_RET;
uint64_t milliseconds64;
int locked;
@@ -51,7 +53,8 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
locked = 0;
/* Fast path if already signalled. */
- if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ *signalled = 1;
+ if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
return (0);
/*
@@ -91,17 +94,25 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
ret = SleepConditionVariableCS(
&cond->cond, &cond->mtx, INFINITE);
+ /*
+ * SleepConditionVariableCS returns non-zero on success, 0 on timeout
+ * or failure. Check for timeout, else convert to a WiredTiger error
+ * value and fail.
+ */
if (ret == 0) {
- if (GetLastError() == ERROR_TIMEOUT) {
- ret = 1;
- }
- }
+ if ((err = GetLastError()) == ERROR_TIMEOUT)
+ *signalled = 0;
+ else
+ ret = __wt_errno();
+ } else
+ ret = 0;
- (void)WT_ATOMIC_SUB4(cond->waiters, 1);
+ (void)__wt_atomic_subi32(&cond->waiters, 1);
if (locked)
LeaveCriticalSection(&cond->mtx);
- if (ret != 0)
+
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "SleepConditionVariableCS");
}
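A POSIX analogue of the contract the Windows hunk establishes: wait with a timeout and report through *signalled whether we were woken or simply gave up. Names are illustrative; pthread_cond_timedwait returns ETIMEDOUT on timeout, which is not treated as an error.

#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <time.h>

static int
cond_wait_signal(pthread_mutex_t *mtx,
    pthread_cond_t *cond, uint64_t usecs, int *signalled)
{
	struct timespec abstime;
	int ret;

	*signalled = 1;

	pthread_mutex_lock(mtx);
	clock_gettime(CLOCK_REALTIME, &abstime);
	abstime.tv_sec += (time_t)(usecs / 1000000);
	abstime.tv_nsec += (long)(usecs % 1000000) * 1000;
	if (abstime.tv_nsec >= 1000000000) {
		++abstime.tv_sec;
		abstime.tv_nsec -= 1000000000;
	}
	ret = pthread_cond_timedwait(cond, mtx, &abstime);
	pthread_mutex_unlock(mtx);

	if (ret == ETIMEDOUT) {		/* a timeout is not an error */
		*signalled = 0;
		ret = 0;
	}
	return (ret);
}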
@@ -130,7 +141,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
if (cond->waiters == -1)
return (0);
- if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ if (cond->waiters > 0 || !__wt_atomic_casi32(&cond->waiters, 0, -1)) {
EnterCriticalSection(&cond->mtx);
locked = 1;
WakeAllConditionVariable(&cond->cond);
diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c
index a77bef63b9d..3bd24369242 100644
--- a/src/os_win/os_open.c
+++ b/src/os_win/os_open.c
@@ -39,7 +39,7 @@ __wt_open(WT_SESSION_IMPL *session,
/* Increment the reference count if we already have the file open. */
matched = 0;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq)
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -160,7 +160,7 @@ setupfh:
*/
matched = 0;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq)
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -169,7 +169,7 @@ setupfh:
}
if (!matched) {
WT_CONN_FILE_INSERT(conn, fh, bucket);
- (void)WT_ATOMIC_ADD4(conn->open_file_count, 1);
+ (void)__wt_atomic_add32(&conn->open_file_count, 1);
*fhp = fh;
}
@@ -217,7 +217,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
/* Remove from the list. */
bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_FILE_REMOVE(conn, fh, bucket);
- (void)WT_ATOMIC_SUB4(conn->open_file_count, 1);
+ (void)__wt_atomic_sub32(&conn->open_file_count, 1);
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/os_win/os_path.c b/src/os_win/os_path.c
index 89f05e238c4..9d001e50571 100644
--- a/src/os_win/os_path.c
+++ b/src/os_win/os_path.c
@@ -12,7 +12,7 @@
* __wt_absolute_path --
* Return if a filename is an absolute path.
*/
-int
+bool
__wt_absolute_path(const char *path)
{
/*
@@ -21,7 +21,7 @@ __wt_absolute_path(const char *path)
*/
if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
path += 2;
- return (path[0] == '/' || path[0] == '\\' ? 1 : 0);
+ return (path[0] == '/' || path[0] == '\\');
}
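A quick standalone check of the drive-letter handling above; the assertions are illustrative.

#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>

static bool
absolute_path_win(const char *path)
{
	/* Skip a drive-letter prefix such as "C:". */
	if (strlen(path) >= 3 &&
	    isalpha((unsigned char)path[0]) && path[1] == ':')
		path += 2;
	return (path[0] == '/' || path[0] == '\\');
}

int
main(void)
{
	assert(absolute_path_win("C:\\db\\WiredTiger.wt"));
	assert(absolute_path_win("/var/db"));
	assert(!absolute_path_win("C:relative"));
	assert(!absolute_path_win("file.wt"));
	return (0);
}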
/*
diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c
index 0c6396c775f..55b50030064 100644
--- a/src/os_win/os_remove.c
+++ b/src/os_win/os_remove.c
@@ -29,7 +29,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name)
* level should have closed it before removing.
*/
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(fh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
if (strcmp(name, fh->name) == 0)
break;
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 37acb28a00b..10daa8b717c 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -27,18 +27,30 @@ typedef struct {
WT_ITEM dsk; /* Temporary disk-image buffer */
- /* Track whether all changes to the page are written. */
+ /*
+ * Track start/stop write generation to decide if all changes to the
+ * page are written.
+ */
+ uint32_t orig_write_gen;
+
+ /*
+ * Track start/stop checkpoint generations to decide if lookaside table
+ * records are correct.
+ */
+ uint64_t orig_btree_checkpoint_gen;
+ uint64_t orig_txn_checkpoint_gen;
+
+ /*
+ * Track maximum transaction ID seen and first unwritten transaction ID.
+ */
uint64_t max_txn;
uint64_t first_dirty_txn;
- uint32_t orig_write_gen;
/*
- * If page updates are skipped because they are as yet unresolved, or
- * the page has updates we cannot discard, the page is left "dirty":
- * the page cannot be discarded and a subsequent reconciliation will
- * be necessary to discard the page.
+ * When we can't mark the page clean (for example, checkpoint found some
+ * uncommitted updates), there's a leave-dirty flag.
*/
- int leave_dirty;
+ int leave_dirty;
/*
* Raw compression (don't get me started, as if normal reconciliation
@@ -153,18 +165,12 @@ typedef struct {
void *dsk; /* Split's disk image */
/*
- * When busy pages get large, we need to be able to evict them
- * even when they contain unresolved updates, or updates which
- * cannot be evicted because of running transactions. In such
- * cases, break the page into multiple blocks, write the blocks
- * that can be evicted, saving lists of updates for blocks that
- * cannot be evicted, then re-instantiate the blocks that cannot
- * be evicted as new, in-memory pages, restoring the updates on
- * those pages.
+ * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
+ * WT_EVICT_LOOKASIDE configurations.
*/
- WT_UPD_SKIPPED *skip; /* Skipped updates */
- uint32_t skip_next;
- size_t skip_allocated;
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
/*
* The key for a row-store page; no column-store key is needed
@@ -220,12 +226,14 @@ typedef struct {
size_t space_avail; /* Remaining space in this chunk */
/*
- * While reviewing updates for each page, we store skipped updates here,
- * and then move them to per-block areas as the blocks are defined.
+ * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
+ * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each
+ * page, we save WT_UPDATE lists here, and then move them to per-block
+ * areas as the blocks are defined.
*/
- WT_UPD_SKIPPED *skip; /* Skipped updates */
- uint32_t skip_next;
- size_t skip_allocated;
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
/*
* We don't need to keep the 0th key around on internal pages, the
@@ -277,7 +285,10 @@ typedef struct {
WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
- int tested_ref_state; /* Debugging information */
+ int cache_write_lookaside; /* Used the lookaside table */
+ int cache_write_restore; /* Used update/restoration */
+
+ uint32_t tested_ref_state; /* Debugging information */
} WT_RECONCILE;
static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
@@ -318,8 +329,11 @@ static int __rec_split_row_promote(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
static int __rec_split_write(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_update_las(
+ WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *);
static int __rec_write_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_status(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
@@ -338,31 +352,19 @@ int
__wt_reconcile(WT_SESSION_IMPL *session,
WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
- int page_lock, scan_lock, split_lock;
- conn = S2C(session);
page = ref->page;
mod = page->modify;
- page_lock = scan_lock = split_lock = 0;
-
- /* We're shouldn't get called with a clean page, that's an error. */
- if (!__wt_page_is_modified(page))
- WT_RET_MSG(session, WT_ERROR,
- "Attempt to reconcile a clean page.");
WT_RET(__wt_verbose(session,
WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
- WT_STAT_FAST_CONN_INCR(session, rec_pages);
- WT_STAT_FAST_DATA_INCR(session, rec_pages);
- if (LF_ISSET(WT_EVICTING)) {
- WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
- WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
- }
+
+ /* We shouldn't get called with a clean page, that's an error. */
+ WT_ASSERT(session, __wt_page_is_modified(page));
#ifdef HAVE_DIAGNOSTIC
{
@@ -386,39 +388,15 @@ __wt_reconcile(WT_SESSION_IMPL *session,
r = session->reconcile;
/*
- * The compaction process looks at the page's modification information;
- * if compaction is running, acquire the page's lock.
- */
- if (conn->compact_in_memory_pass) {
- WT_PAGE_LOCK(session, page);
- page_lock = 1;
- }
-
- /*
- * Reconciliation reads the lists of updates, so obsolete updates cannot
- * be discarded while reconciliation is in progress.
- */
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- scan_lock = 1;
-
- /*
- * Mark internal pages as splitting to ensure we don't deadlock when
- * performing an in-memory split during a checkpoint.
+ * Reconciliation locks the page for three reasons:
+ * Reconciliation reads the lists of page updates, obsolete updates
+ * cannot be discarded while reconciliation is in progress;
+ * The compaction process reads page modification information, which
+ * reconciliation modifies;
+ * In-memory splits: reconciliation of an internal page cannot handle
+ * a child page splitting during the reconciliation.
*/
- if (WT_PAGE_IS_INTERNAL(page)) {
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- split_lock = 1;
- }
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
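F_CAS_ATOMIC_WAIT's definition is not in this hunk; plausibly it loops on an atomic compare-and-swap of the flag and yields between attempts, like the removed open-coded loops above. A sketch under that assumption:

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

/* Spin until we atomically transition the flag from clear to set. */
static void
flag_cas_wait(_Atomic uint32_t *flags, uint32_t flag)
{
	uint32_t old;

	for (;;) {
		old = atomic_load(flags);
		if ((old & flag) == 0 &&
		    atomic_compare_exchange_weak(flags, &old, old | flag))
			return;
		sched_yield();		/* analogous to __wt_yield() */
	}
}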
/* Reconcile the page. */
switch (page->type) {
@@ -445,19 +423,34 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_ILLEGAL_VALUE_SET(session);
}
+ /* Get the final status for the reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_status(session, r, page);
+
/* Wrap up the page reconciliation. */
if (ret == 0)
ret = __rec_write_wrapup(session, r, page);
else
WT_TRET(__rec_write_wrapup_err(session, r, page));
- /* Release the locks we're holding. */
- if (split_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
- if (scan_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (page_lock)
- WT_PAGE_UNLOCK(session, page);
+ /* Release the reconciliation lock. */
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+
+ /* Update statistics. */
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+ if (r->cache_write_lookaside) {
+ WT_STAT_FAST_CONN_INCR(session, cache_write_lookaside);
+ WT_STAT_FAST_DATA_INCR(session, cache_write_lookaside);
+ }
+ if (r->cache_write_restore) {
+ WT_STAT_FAST_CONN_INCR(session, cache_write_restore);
+ WT_STAT_FAST_DATA_INCR(session, cache_write_restore);
+ }
/*
* Clean up the boundary structures: some workloads result in millions
@@ -489,6 +482,125 @@ __wt_reconcile(WT_SESSION_IMPL *session,
}
/*
+ * __rec_las_checkpoint_test --
+ * Return if the lookaside table is going to collide with a checkpoint.
+ */
+static inline bool
+__rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_BTREE *btree;
+
+ conn = S2C(session);
+ btree = S2BT(session);
+
+ /*
+ * Running checkpoints can collide with the lookaside table because
+ * reconciliation using the lookaside table writes the key's last
+ * committed value, which might not be the value checkpoint would write.
+ * If reconciliation was configured for lookaside table eviction, this
+ * file participates in checkpoints, and any of the tree or system
+ * transactional generation numbers don't match, there's a possible
+ * collision.
+ *
+ * It's a complicated test, but the alternative is to have checkpoint
+ * drain lookaside table reconciliations, and this isn't a problem for
+ * most workloads.
+ */
+ if (!F_ISSET(r, WT_EVICT_LOOKASIDE))
+ return (false);
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ return (false);
+ if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen &&
+ r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen &&
+ r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen)
+ return (false);
+ return (true);
+}
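The generation test in isolation: reconciliation sampled both generations when it started, and any movement since (or a mismatch between them at the start) means a checkpoint may be active. Structure and names below are illustrative.

#include <stdbool.h>
#include <stdint.h>

struct ckpt_gens {
	uint64_t btree_gen;		/* tree's checkpoint generation */
	uint64_t txn_gen;		/* system checkpoint generation */
};

static bool
las_checkpoint_collision(
    const struct ckpt_gens *orig, const struct ckpt_gens *now)
{
	return (orig->btree_gen != now->btree_gen ||
	    orig->txn_gen != now->txn_gen ||
	    orig->btree_gen != orig->txn_gen);
}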
+
+/*
+ * __rec_write_status --
+ * Return the final status for reconciliation.
+ */
+static int
+__rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_PAGE_MODIFY *mod;
+
+ btree = S2BT(session);
+ mod = page->modify;
+
+ /* Check for a lookaside table and checkpoint collision. */
+ if (__rec_las_checkpoint_test(session, r))
+ return (EBUSY);
+
+ /*
+ * Set the page's status based on whether or not we cleaned the page.
+ */
+ if (r->leave_dirty) {
+ /*
+ * Update the page's first unwritten transaction ID.
+ */
+ mod->first_dirty_txn = r->first_dirty_txn;
+
+ /*
+ * The page remains dirty.
+ *
+ * Any checkpoint call cleared the tree's modified flag before
+ * writing pages, so we must explicitly reset it. We insert a
+ * barrier after the change for clarity (the requirement is that
+ * the flag be set before a subsequent checkpoint reads it, and
+ * as the current checkpoint is waiting on this reconciliation
+ * to complete, there's no risk of that happening).
+ */
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+
+ /*
+ * Eviction should only be here if following the save/restore
+ * eviction path.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(r, WT_EVICTING) ||
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
+ } else {
+ /*
+ * Track the page's maximum transaction ID (used to decide if
+ * we're likely to be able to evict this page in the future).
+ */
+ mod->rec_max_txn = r->max_txn;
+
+ /*
+ * Track the tree's maximum transaction ID (used to decide if
+ * it's safe to discard the tree). Reconciliation for eviction
+ * is multi-threaded, only update the tree's maximum transaction
+ * ID when doing a checkpoint. That's sufficient, we only care
+ * about the maximum transaction ID of current updates in the
+ * tree, and checkpoint visits every dirty page in the tree.
+ */
+ if (!F_ISSET(r, WT_EVICTING) &&
+ WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
+ btree->rec_max_txn = r->max_txn;
+
+ /*
+ * The page only might be clean; if the write generation is
+ * unchanged since reconciliation started, it's clean.
+ *
+ * If the write generation changed, the page has been written
+ * since reconciliation started and remains dirty (that can't
+ * happen when evicting, the page is exclusively locked).
+ */
+ if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ else
+ WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ }
+
+ return (0);
+}
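The mark-clean decision in isolation: the page is clean only if its write generation is unchanged since reconciliation sampled it, and resetting the generation in the same compare-and-swap means any racing updater keeps the page dirty. A sketch with C11 atomics; the structure is illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct page_modify {			/* illustrative subset */
	_Atomic uint32_t write_gen;	/* bumped by every page update */
};

static bool
mark_clean(struct page_modify *mod, uint32_t orig_write_gen)
{
	uint32_t expected = orig_write_gen;

	/* Clean only if nobody updated the page while we wrote it. */
	return (atomic_compare_exchange_strong(
	    &mod->write_gen, &expected, 0));
}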
+
+/*
* __rec_root_write --
* Handle the write of a root page.
*/
@@ -577,7 +689,7 @@ err: __wt_page_out(session, &next);
* __rec_raw_compression_config --
* Configure raw compression.
*/
-static inline int
+static inline bool
__rec_raw_compression_config(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
{
@@ -588,11 +700,11 @@ __rec_raw_compression_config(
/* Check if raw compression configured. */
if (btree->compressor == NULL ||
btree->compressor->compress_raw == NULL)
- return (0);
+ return (false);
/* Only for row-store and variable-length column-store objects. */
if (page->type == WT_PAGE_COL_FIX)
- return (0);
+ return (false);
/*
* Raw compression cannot support dictionary compression. (Technically,
@@ -602,11 +714,11 @@ __rec_raw_compression_config(
* that seems an unlikely use case.)
*/
if (btree->dictionary != 0)
- return (0);
+ return (false);
/* Raw compression cannot support prefix compression. */
if (btree->prefix_compression != 0)
- return (0);
+ return (false);
/*
* Raw compression is also turned off during salvage: we can't allow
@@ -614,9 +726,9 @@ __rec_raw_compression_config(
* can't manipulate the page size.
*/
if (salvage != NULL)
- return (0);
+ return (false);
- return (1);
+ return (true);
}
/*
@@ -628,10 +740,12 @@ __rec_write_init(WT_SESSION_IMPL *session,
WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
{
WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
WT_PAGE *page;
WT_RECONCILE *r;
btree = S2BT(session);
+ conn = S2C(session);
page = ref->page;
if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
@@ -648,9 +762,59 @@ __rec_write_init(WT_SESSION_IMPL *session,
F_SET(&r->dsk, WT_ITEM_ALIGNED);
}
+ /* Reconciliation is not re-entrant, make sure that doesn't happen. */
+ WT_ASSERT(session, r->ref == NULL);
+
/* Remember the configuration. */
r->ref = ref;
r->page = page;
+
+ /*
+ * Save the page's write generation before reading the page.
+ * Save the transaction generations before reading the page.
+ * These are all ordered reads, but we only need one.
+ */
+ r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
+ r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen;
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Lookaside table eviction is configured when eviction gets aggressive,
+ * adjust the flags for cases we don't support.
+ */
+ if (LF_ISSET(WT_EVICT_LOOKASIDE)) {
+ /*
+ * Saving lookaside table updates into the lookaside table won't
+ * work.
+ */
+ if (F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ LF_CLR(WT_EVICT_LOOKASIDE);
+
+ /*
+ * We don't yet support fixed-length column-store combined with
+ * the lookaside table. It's not hard to do, but the underlying
+ * function that reviews which updates can be written to the
+ * evicted page and which updates need to be written to the
+ * lookaside table needs access to the original value from the
+ * page being evicted, and there's no code path for that in the
+ * case of fixed-length column-store objects. (Row-store and
+ * variable-width column-store objects provide a reference to
+ * the unpacked on-page cell for this purpose, but there isn't
+ * an on-page cell for fixed-length column-store objects.) For
+ * now, turn it off.
+ */
+ if (page->type == WT_PAGE_COL_FIX)
+ LF_CLR(WT_EVICT_LOOKASIDE);
+
+ /*
+ * Check for a lookaside table and checkpoint collision, and if
+ * we find one, turn off the lookaside file (we've gone to all
+ * the effort of getting exclusive access to the page, might as
+ * well try and evict it).
+ */
+ if (__rec_las_checkpoint_test(session, r))
+ LF_CLR(WT_EVICT_LOOKASIDE);
+ }
r->flags = flags;
/* Track if the page can be marked clean. */
@@ -668,8 +832,8 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->all_empty_value = 1;
r->any_empty_value = 0;
- /* The list of cached, skipped updates. */
- r->skip_next = 0;
+ /* The list of saved updates. */
+ r->supd_next = 0;
/*
* Dictionary compression only writes repeated values once. We grow
@@ -714,14 +878,11 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->salvage = salvage;
- /* Save the page's write generation before reading the page. */
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
-
/*
* Running transactions may update the page after we write it, so
* this is the highest ID we can be confident we will see.
*/
- r->first_dirty_txn = S2C(session)->txn_global.last_running;
+ r->first_dirty_txn = conn->txn_global.last_running;
return (0);
}
@@ -748,7 +909,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__rec_bnd_cleanup(session, r, 1);
- __wt_free(session, r->skip);
+ __wt_free(session, r->supd);
__wt_buf_free(session, &r->k.buf);
__wt_buf_free(session, &r->v.buf);
@@ -784,6 +945,9 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
if (r->bnd == NULL)
return;
+ /* Reconciliation is not re-entrant, make sure that doesn't happen. */
+ r->ref = NULL;
+
/*
* Free the boundary structures' memory. In the case of normal cleanup,
* discard any memory we won't reuse in the next reconciliation; in the
@@ -799,7 +963,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
+ __wt_free(session, bnd->supd);
__wt_buf_free(session, &bnd->key);
}
__wt_free(session, r->bnd);
@@ -820,66 +984,84 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
+ __wt_free(session, bnd->supd);
}
}
}
/*
- * __rec_skip_update_save --
- * Save a skipped WT_UPDATE list for later restoration.
+ * __rec_block_free --
+ * Helper function to free a block.
*/
static int
-__rec_skip_update_save(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+__rec_block_free(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ return (bm->free(bm, session, addr, addr_size));
+}
+
+/*
+ * __rec_update_save --
+ * Save a WT_UPDATE list for later restoration.
+ */
+static int
+__rec_update_save(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, uint64_t txnid)
{
WT_RET(__wt_realloc_def(
- session, &r->skip_allocated, r->skip_next + 1, &r->skip));
- r->skip[r->skip_next].ins = ins;
- r->skip[r->skip_next].rip = rip;
- ++r->skip_next;
+ session, &r->supd_allocated, r->supd_next + 1, &r->supd));
+ r->supd[r->supd_next].ins = ins;
+ r->supd[r->supd_next].rip = rip;
+ r->supd[r->supd_next].onpage_txn = txnid;
+ ++r->supd_next;
return (0);
}
/*
- * __rec_skip_update_move --
- * Move a skipped WT_UPDATE list from the per-page cache to a specific
+ * __rec_update_move --
+ * Move a saved WT_UPDATE list from the per-page cache to a specific
* block's list.
*/
static int
-__rec_skip_update_move(
- WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+__rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd)
{
WT_RET(__wt_realloc_def(
- session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
- bnd->skip[bnd->skip_next] = *skip;
- ++bnd->skip_next;
+ session, &bnd->supd_allocated, bnd->supd_next + 1, &bnd->supd));
+ bnd->supd[bnd->supd_next] = *supd;
+ ++bnd->supd_next;
- skip->ins = NULL;
- skip->rip = NULL;
+ supd->ins = NULL;
+ supd->rip = NULL;
return (0);
}
/*
* __rec_txn_read --
- * Return the first visible update in a list (or NULL if none are visible),
- * set a flag if any updates were skipped, track the maximum transaction ID on
- * the page.
+ * Return the update in a list that should be written (or NULL if none can
+ * be written).
*/
-static inline int
+static int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
+ WT_BTREE *btree;
WT_DECL_RET;
- WT_ITEM ovfl;
+ WT_DECL_ITEM(tmp);
WT_PAGE *page;
- WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ WT_UPDATE *append, *upd, *upd_list;
size_t notused;
uint64_t max_txn, min_txn, txnid;
- int skipped;
+ int append_origv, skipped;
*updp = NULL;
+ btree = S2BT(session);
page = r->page;
/*
@@ -893,13 +1075,16 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
} else
upd_list = ins->upd;
- skipped = 0;
- for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
- upd != NULL; upd = upd->next) {
+ for (skipped = 0,
+ max_txn = WT_TXN_NONE, min_txn = UINT64_MAX,
+ upd = upd_list; upd != NULL; upd = upd->next) {
if ((txnid = upd->txnid) == WT_TXN_ABORTED)
continue;
- /* Track the largest/smallest transaction IDs on the list. */
+ /*
+ * Track the largest/smallest transaction IDs on the list and
+ * the smallest not-globally-visible transaction on the page.
+ */
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
if (WT_TXNID_LT(txnid, min_txn))
@@ -909,132 +1094,231 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
r->first_dirty_txn = txnid;
/*
- * Record whether any updates were skipped on the way to finding
- * the first visible update.
- *
- * If updates were skipped before the one being written, future
- * reads without intervening modifications to the page could
- * see a different value; if no updates were skipped, the page
- * can safely be marked clean and does not need to be
- * reconciled until modified again.
+ * Find the first update we can use.
*/
- if (*updp == NULL) {
- if (__wt_txn_visible(session, txnid))
- *updp = upd;
- else
+ if (F_ISSET(r, WT_EVICTING)) {
+ /*
+ * Eviction can write any committed update.
+ *
+ * When reconciling for eviction, track whether any
+ * uncommitted updates are found.
+ */
+ if (__wt_txn_committed(session, txnid)) {
+ if (*updp == NULL)
+ *updp = upd;
+ } else
skipped = 1;
+ } else {
+ /*
+ * Checkpoint can only write updates visible as of its
+ * snapshot.
+ *
+ * When reconciling for a checkpoint, track whether any
+ * updates were skipped on the way to finding the first
+ * visible update.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
}
}
/*
+ * If all of the updates were aborted, quit. This test is not strictly
+ * necessary because the above loop exits with skipped not set and the
+ * maximum transaction left at its initial value of WT_TXN_NONE, so
+ * the test below will be true and return, but it's cheap and a
+ * little more explicit, and makes Coverity happy.
+ */
+ if (max_txn == WT_TXN_NONE)
+ return (0);
+
+ /*
* Track the maximum transaction ID in the page. We store this in the
- * page at the end of reconciliation if no updates are skipped, it's
- * used to avoid evicting clean pages from memory with changes required
- * to satisfy a snapshot read.
+ * tree at the end of reconciliation in the service of checkpoints; it
+ * is used to avoid discarding trees from memory when they have changes
+ * required to satisfy a snapshot read.
*/
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
/*
- * If no updates were skipped and all updates are globally visible, the
- * page can be marked clean and we're done, regardless of whether we're
- * evicting or checkpointing.
+ * If there are no skipped updates and all updates are globally visible,
+ * the page can be marked clean and we're done, regardless of whether
+ * we're evicting or checkpointing.
*
* We have to check both: the oldest transaction ID may have moved while
- * we were scanning the update list, so it is possible to skip an update
- * but then find that by the end of the scan, all updates are stable.
+ * we were scanning the update list, so it is possible to find a skipped
+ * update, but then find all updates are stable at the end of the scan.
+ *
+ * Skip the visibility check for the lookaside table as a special-case,
+ * we know there are no older readers of that table.
*/
- if (!skipped && __wt_txn_visible_all(session, max_txn))
+ if (!skipped &&
+ (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
+ __wt_txn_visible_all(session, max_txn)))
return (0);
/*
- * If some updates are not globally visible, or were skipped, the page
- * cannot be marked clean.
+ * In some cases, there had better not be skipped updates or updates not
+ * yet globally visible.
*/
- r->leave_dirty = 1;
-
- /* If we're not evicting, we're done, we know what we'll write. */
- if (!F_ISSET(r, WT_EVICTING))
- return (0);
-
- /* In some cases, there had better not be any updates we can't write. */
- if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ if (F_ISSET(r, WT_VISIBILITY_ERR))
WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
+ "reconciliation error, uncommitted update or update not "
+ "globally visible");
/*
- * If evicting and we aren't able to save/restore the not-yet-visible
- * updates, the page can't be evicted.
+ * If not trying to evict the page, we know what we'll write and we're
+ * done. Because some updates were skipped or are not globally visible,
+ * the page can't be marked clean.
*/
- if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
- return (EBUSY);
+ if (!F_ISSET(r, WT_EVICTING)) {
+ r->leave_dirty = 1;
+ return (0);
+ }
/*
- * Evicting a page with not-yet-visible updates: save and restore the
- * list of updates on a newly instantiated page.
- *
- * The order of the updates on the list matters so we can't move only
- * the unresolved updates, we have to move the entire update list.
+ * Evicting with either uncommitted changes or not-yet-globally-visible
+ * changes. There are two ways to continue, the save/restore eviction
+ * path or the lookaside table eviction path. Both cannot be configured
+ * because the paths track different information. The save/restore path
+ * can handle both uncommitted and not-yet-globally-visible changes, by
+ * evicting most of the page and then creating a new, smaller page into
+ * which we re-instantiate those changes. The lookaside table path can
+ * only handle not-yet-globally-visible changes by writing those changes
+ * into the lookaside table and restoring them on demand if and when the
+ * page is read back into memory.
*
- * Clear the returned update so our caller ignores the key/value pair
- * in the case of an insert/append entry (everything we need is in the
- * update list), and otherwise writes the original on-page key/value
- * pair to which the update list applies.
+ * Both paths are configured outside of reconciliation: the save/restore
+ * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is
+ * the WT_EVICT_LOOKASIDE flag.
*/
- *updp = NULL;
+ if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE))
+ return (EBUSY);
+ if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ return (EBUSY);
+
+ append_origv = 0;
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) {
+ /*
+ * The save/restore eviction path.
+ *
+ * Clear the returned update so our caller ignores the key/value
+ * pair in the case of an insert/append list entry (everything
+ * we need is in the update list), and otherwise writes the
+ * original on-page key/value pair to which the update list
+ * applies.
+ */
+ *updp = NULL;
+
+ /* The page can't be marked clean. */
+ r->leave_dirty = 1;
+
+ /*
+ * A special-case for overflow values, where we can't write the
+ * original on-page value item to disk because it's been updated
+ * or removed.
+ *
+ * What happens is that an overflow value is updated or removed
+ * and its backing blocks freed. If any reader in the system
+ * might still want the value, a copy was cached in the page
+ * reconciliation tracking memory, and the page cell set to
+ * WT_CELL_VALUE_OVFL_RM. Eviction then chose the page and
+ * we're splitting it up in order to push parts of it out of
+ * memory.
+ *
+ * We could write the original on-page value item to disk... if
+ * we had a copy. The cache may not have a copy (a globally
+ * visible update would have kept a value from being cached), or
+ * an update that subsequently became globally visible could
+ * cause a cached value to be discarded. Either way, once there
+ * is a globally visible update, we may not have the original
+ * value.
+ *
+ * Fortunately, if there's a globally visible update we don't
+ * care about the original version, so we simply ignore it, no
+ * transaction can ever try and read it. If there isn't a
+ * globally visible update, there had better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but
+ * (1) we are planning on re-instantiating this page in memory,
+ * it isn't going to disk, and (2) the value item is eventually
+ * going to be discarded, that seems like a waste of a write.
+ * Instead, find the cached value and append it to the update
+ * list we're saving for later restoration.
+ */
+ if (vpack != NULL &&
+ vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn))
+ append_origv = 1;
+ } else {
+ /*
+ * The lookaside table eviction path.
+ *
+ * If at least one update is globally visible, copy the update
+ * list and ignore the current on-page value. If no update is
+ * globally visible, readers require the page's original value.
+ */
+ if (!__wt_txn_visible_all(session, min_txn))
+ append_origv = 1;
+ }
/*
- * Handle the case were we don't want to write an original on-page value
- * item to disk because it's been updated or removed.
- *
- * Here's the deal: an overflow value was updated or removed and its
- * backing blocks freed. If any transaction in the system might still
- * read the value, a copy was cached in page reconciliation tracking
- * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
- * then chose the page and we're splitting it up in order to push parts
- * of it out of memory.
- *
- * We could write the original on-page value item to disk... if we had
- * a copy. The cache may not have a copy (a globally visible update
- * would have kept a value from ever being cached), or an update that
- * subsequent became globally visible could cause a cached value to be
- * discarded. Either way, once there's a globally visible update, we
- * may not have the value.
- *
- * Fortunately, if there's a globally visible update we don't care about
- * the original version, so we simply ignore it, no transaction can ever
- * try and read it. If there isn't a globally visible update, there had
- * better be a cached value.
- *
- * In the latter case, we could write the value out to disk, but (1) we
- * are planning on re-instantiating this page in memory, it isn't going
- * to disk, and (2) the value item is eventually going to be discarded,
- * that seems like a waste of a write. Instead, find the cached value
- * and append it to the update list we're saving for later restoration.
- */
- if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
- !__wt_txn_visible_all(session, min_txn)) {
- if ((ret = __wt_ovfl_txnc_search(
- page, vpack->data, vpack->size, &ovfl)) != 0)
- WT_PANIC_RET(session, ret,
- "cached overflow item discarded early");
+ * We need the original on-page value for some reason: get a copy and
+ * append it to the end of the update list with a transaction ID that
+ * guarantees its visibility.
+ */
+ if (append_origv) {
+ /*
+ * If we don't have a value cell, it's an insert/append list
+ * key/value pair which simply doesn't exist for some reader;
+ * place a deleted record at the end of the update list.
+ */
+ if (vpack == NULL || vpack->type == WT_CELL_DEL)
+ WT_RET(__wt_update_alloc(
+ session, NULL, &append, &notused));
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ if ((ret = __wt_page_cell_data_ref(
+ session, page, vpack, tmp)) == 0)
+ ret = __wt_update_alloc(
+ session, tmp, &append, &notused);
+ __wt_scr_free(session, &tmp);
+ WT_RET(ret);
+ }
/*
- * Create an update structure with an impossibly low transaction
- * ID and append it to the update list we're about to save.
- * Restoring that update list when this page is re-instantiated
- * creates an update for the key/value pair visible to every
- * running transaction in the system, ensuring the on-page value
- * will be ignored.
+ * Give the entry an impossibly low transaction ID to ensure its
+ * global visibility, and append it to the update list.
+ *
+ * Note the change to the actual reader-accessible update list:
+ * from now on, the original on-page value appears at the end
+ * of the update list, even if this reconciliation subsequently
+ * fails.
*/
- WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
- upd_ovfl->txnid = WT_TXN_NONE;
+ append->txnid = WT_TXN_NONE;
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
- upd->next = upd_ovfl;
+ upd->next = append;
}
- return (__rec_skip_update_save(session, r, ins, rip));
+ /*
+ * The order of the updates on the list matters: we can't move only the
+ * unresolved updates, we have to move the entire update list.
+ *
+ * If we skipped updates, the transaction value is never used. If we
+ * didn't skip updates, the list of updates is eventually written to
+ * the lookaside table, and associated with each update record is the
+ * transaction ID of the update we wrote in the reconciled page; once
+ * that transaction ID is globally visible, we know we no longer need
+ * the lookaside table records, allowing them to be discarded.
+ */
+ return (__rec_update_save(session,
+ r, ins, rip, (*updp == NULL) ? WT_TXN_NONE : (*updp)->txnid));
}
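The append-original-value step in isolation: a copy of the on-page value goes on the end of the update list with an impossibly old transaction ID, so every reader sees it instead of falling through to the about-to-be-rewritten on-page cell. Structure and names are illustrative, not WiredTiger's.

#include <stddef.h>
#include <stdint.h>

#define TXN_NONE 0			/* older than any running txn */

struct update {
	uint64_t txnid;			/* transaction that made the change */
	struct update *next;		/* newest-to-oldest list linkage */
	/* value bytes would follow in the real structure */
};

static void
append_orig_value(struct update *upd_list, struct update *append)
{
	struct update *upd;

	/* Impossibly old ID: globally visible to every transaction. */
	append->txnid = TXN_NONE;
	append->next = NULL;

	/* The original value is the oldest entry, append at the tail. */
	for (upd = upd_list; upd->next != NULL; upd = upd->next)
		;
	upd->next = append;
}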
/*
@@ -1104,8 +1388,8 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* to see if the delete is visible to us. Lock down the
* structure.
*/
- if (!WT_ATOMIC_CAS4(
- ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ if (!__wt_atomic_casv32(
+ &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
break;
ret = __rec_child_deleted(session, r, ref, statep);
WT_PUBLISH(ref->state, WT_REF_DELETED);
@@ -1155,10 +1439,10 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* If called during checkpoint, acquire a hazard pointer
* so the child isn't evicted, it's an in-memory case.
*
- * This call cannot return split/restart, dirty page
- * eviction is shutout during checkpoint, all splits in
- * process will have completed before we walk any pages
- * for checkpoint.
+ * This call cannot return split/restart, eviction of
+ * pages that split into their parent is shutout during
+ * checkpoint, all splits in process will have completed
+ * before we walk any pages for checkpoint.
*/
ret = __wt_page_in(session, ref,
WT_READ_CACHE | WT_READ_NO_EVICT |
@@ -1215,7 +1499,7 @@ in_memory:
* reason to write the cell.
*/
mod = ref->page->modify;
- if (mod != NULL && mod->flags != 0)
+ if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK))
*statep = WT_CHILD_MODIFIED;
else if (ref->addr == NULL) {
*statep = WT_CHILD_IGNORE;
@@ -1234,37 +1518,32 @@ static int
__rec_child_deleted(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
{
- WT_BM *bm;
WT_PAGE_DELETED *page_del;
size_t addr_size;
const uint8_t *addr;
- bm = S2BT(session)->bm;
page_del = ref->page_del;
/*
* Internal pages with child leaf pages in the WT_REF_DELETED state are
* a special case during reconciliation. First, if the deletion was a
* result of a session truncate call, the deletion may not be visible to
- * us. In that case, we proceed as with any change that's not visible
- * during reconciliation by setting the skipped flag and ignoring the
- * change for the purposes of writing the internal page.
+ * us. In that case, we proceed as with any change not visible during
+ * reconciliation by ignoring the change for the purposes of writing the
+ * internal page.
*
* In this case, there must be an associated page-deleted structure, and
* it holds the transaction ID we care about.
+ *
+ * In some cases, there had better not be any updates we can't see.
*/
- if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
- /*
- * In some cases, there had better not be any updates we can't
- * write.
- */
- if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
- WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
- }
+ if (F_ISSET(r, WT_VISIBILITY_ERR) &&
+ page_del != NULL && !__wt_txn_visible(session, page_del->txnid))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
/*
- * The deletion is visible to us, deal with any underlying disk blocks.
+ * Deal with any underlying disk blocks.
*
* First, check to see if there is an address associated with this leaf:
* if there isn't, we're done, the underlying page is already gone. If
@@ -1291,7 +1570,7 @@ __rec_child_deleted(
(page_del == NULL ||
__wt_txn_visible_all(session, page_del->txnid))) {
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
- WT_RET(bm->free(bm, session, addr, addr_size));
+ WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
@@ -1562,7 +1841,7 @@ static void
__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
{
bnd->offset = 0;
- bnd->recno = 0;
+ bnd->recno = WT_RECNO_OOB;
bnd->entries = 0;
__wt_free(session, bnd->addr.addr);
@@ -1571,9 +1850,9 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
bnd->cksum = 0;
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
- bnd->skip_next = 0;
- bnd->skip_allocated = 0;
+ __wt_free(session, bnd->supd);
+ bnd->supd_next = 0;
+ bnd->supd_allocated = 0;
/*
* Don't touch the key, we re-use that memory in each new
@@ -1775,9 +2054,13 @@ __rec_split_init(WT_SESSION_IMPL *session,
* __rec_is_checkpoint --
* Return if we're writing a checkpoint.
*/
-static int
-__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+static bool
+__rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_BOUNDARY *bnd)
{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
/*
* Check to see if we're going to create a checkpoint.
*
@@ -1792,13 +2075,14 @@ __rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
* we don't do checkpoint writes here; clear the boundary information as
* a reminder and create the checkpoint during wrapup.
*/
- if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) &&
+ bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
bnd->addr.addr = NULL;
bnd->addr.size = 0;
bnd->addr.type = 0;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -1841,7 +2125,7 @@ __rec_split_row_promote(
WT_DECL_ITEM(update);
WT_DECL_RET;
WT_ITEM *max;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
size_t cnt, len, size;
uint32_t i;
const uint8_t *pa, *pb;
@@ -1892,36 +2176,37 @@ __rec_split_row_promote(
* the last key and smaller than the current key.
*/
max = r->last;
- for (i = r->skip_next; i > 0; --i) {
- skip = &r->skip[i - 1];
- if (skip->ins == NULL)
- WT_ERR(__wt_row_leaf_key(
- session, r->page, skip->rip, update, 0));
- else {
- update->data = WT_INSERT_KEY(skip->ins);
- update->size = WT_INSERT_KEY_SIZE(skip->ins);
- }
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ for (i = r->supd_next; i > 0; --i) {
+ supd = &r->supd[i - 1];
+ if (supd->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, supd->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(supd->ins);
+ update->size = WT_INSERT_KEY_SIZE(supd->ins);
+ }
- /* Compare against the current key, it must be less. */
- WT_ERR(__wt_compare(
- session, btree->collator, update, r->cur, &cmp));
- if (cmp >= 0)
- continue;
+ /* Compare against the current key, it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
- /* Compare against the last key, it must be greater. */
- WT_ERR(__wt_compare(
- session, btree->collator, update, r->last, &cmp));
- if (cmp >= 0)
- max = update;
+ /* Compare against the last key, it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
- /*
- * The skipped updates are in key-sort order so the entry we're
- * looking for is either the last one or the next-to-last one
- * in the list. Once we've compared an entry against the last
- * key on the page, we're done.
- */
- break;
- }
+ /*
+ * The saved updates are in key-sort order so the entry
+ * we're looking for is either the last or the next-to-
+ * last one in the list. Once we've compared an entry
+ * against the last key on the page, we're done.
+ */
+ break;
+ }
/*
* The largest key on the last block must sort before the current key,
@@ -2228,7 +2513,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
* We track the record number at each column-store split point, set an
* initial value.
*/
- recno = 0;
+ recno = WT_RECNO_OOB;
if (dsk->type == WT_PAGE_COL_VAR)
recno = last->recno;
@@ -2326,10 +2611,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
WT_RET(compressor->pre_size(compressor, wt_session,
(uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
(size_t)r->raw_offsets[slots], &result_len));
- extra_skip = 0;
- if (btree->kencryptor != NULL)
- extra_skip = btree->kencryptor->size_const +
- WT_ENCRYPT_LEN_SIZE;
+ extra_skip = btree->kencryptor == NULL ? 0 :
+ btree->kencryptor->size_const + WT_ENCRYPT_LEN_SIZE;
corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
@@ -2477,7 +2760,7 @@ no_slots:
break;
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- next->recno = 0;
+ next->recno = WT_RECNO_OOB;
if (!last_block) {
/*
* Confirm there was uncompressed data remaining
@@ -2530,7 +2813,8 @@ no_slots:
*
* If it's not a checkpoint, write the block.
*/
- if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (r->bnd_next == 1 &&
+ last_block && __rec_is_checkpoint(session, r, last)) {
if (write_ref == dst)
WT_RET(__wt_buf_set(
session, &r->dsk, dst->mem, dst->size));
@@ -2647,13 +2931,29 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * We only arrive here with no entries to write if the page was entirely
- * empty, and if the page is empty, we merge it into its parent during
- * the parent's reconciliation. A page with skipped updates isn't truly
- * empty, continue on.
+ * We may arrive here with no entries to write if the page was entirely
+ * empty or if nothing on the page was visible to us.
*/
- if (r->entries == 0 && r->skip_next == 0)
- return (0);
+ if (r->entries == 0) {
+ /*
+ * Pages with skipped or not-yet-globally visible updates aren't
+ * really empty; otherwise, the page is truly empty and we will
+ * merge it into its parent during the parent's reconciliation.
+ */
+ if (r->supd_next == 0)
+ return (0);
+
+ /*
+ * If using the save/restore eviction path, continue with the
+ * write, the page will be restored after we finish.
+ *
+ * If using the lookaside table eviction path, we can't continue
+ * (we need a page to be written, otherwise we won't ever find
+ * the updates for future reads).
+ */
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE))
+ return (EBUSY);
+ }
/* Set the boundary reference and increment the count. */
bnd = &r->bnd[r->bnd_next++];
@@ -2666,9 +2966,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
/* If this is a checkpoint, we're done, otherwise write the page. */
- return (
- __rec_is_checkpoint(r, bnd) ? 0 :
- __rec_split_write(session, r, bnd, &r->dsk, 1));
+ return (__rec_is_checkpoint(session, r, bnd) ?
+ 0 : __rec_split_write(session, r, bnd, &r->dsk, 1));
}
/*
@@ -2794,7 +3093,7 @@ __rec_split_write(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_HEADER *dsk;
WT_PAGE_MODIFY *mod;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
size_t addr_size;
uint32_t bnd_slot, i, j;
int cmp;
@@ -2837,23 +3136,23 @@ __rec_split_write(WT_SESSION_IMPL *session,
bnd->cksum = 0;
/*
- * Check if we've skipped updates that belong to this block, and move
- * any to the per-block structure. Quit as soon as we find a skipped
+ * Check if we've saved updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a saved
* update that doesn't belong to the block, they're in sorted order.
*
* This code requires a key be filled in for the next block (or the
* last block flag be set, if there's no next block).
*/
- for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
- /* The last block gets all remaining skipped updates. */
+ for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) {
+ /* The last block gets all remaining saved updates. */
if (last_block) {
- WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ WT_ERR(__rec_update_move(session, bnd, supd));
continue;
}
/*
- * Get the skipped update's key and compare it with this block's
- * key range. If the skipped update list belongs with the block
+ * Get the saved update's key and compare it with this block's
+ * key range. If the saved update list belongs with the block
* we're about to write, move it to the per-block memory. Check
* only to the first update that doesn't go with the block, they
* must be in sorted order.
@@ -2861,43 +3160,56 @@ __rec_split_write(WT_SESSION_IMPL *session,
switch (page->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
- if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
- goto skip_check_complete;
+ if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno)
+ goto supd_check_complete;
break;
case WT_PAGE_ROW_LEAF:
- if (skip->ins == NULL)
+ if (supd->ins == NULL)
WT_ERR(__wt_row_leaf_key(
- session, page, skip->rip, key, 0));
+ session, page, supd->rip, key, 0));
else {
- key->data = WT_INSERT_KEY(skip->ins);
- key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ key->data = WT_INSERT_KEY(supd->ins);
+ key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
WT_ERR(__wt_compare(session,
btree->collator, key, &(bnd + 1)->key, &cmp));
if (cmp >= 0)
- goto skip_check_complete;
+ goto supd_check_complete;
break;
WT_ILLEGAL_VALUE_ERR(session);
}
- WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ WT_ERR(__rec_update_move(session, bnd, supd));
}
-skip_check_complete:
+supd_check_complete:
/*
* If there are updates that weren't moved to the block, shuffle them to
- * the beginning of the cached list (we maintain the skipped updates in
- * sorted order, new skipped updates must be appended to the list).
+ * the beginning of the cached list (we maintain the saved updates in
+	 * sorted order; new saved updates must be appended to the list).
+ */
+ for (j = 0; i < r->supd_next; ++j, ++i)
+ r->supd[j] = r->supd[i];
+ r->supd_next = j;
+
+ /*
+ * If using the lookaside table eviction path and we found updates that
+ * weren't globally visible when reconciling this page, note that in the
+ * page header.
*/
- for (j = 0; i < r->skip_next; ++j, ++i)
- r->skip[j] = r->skip[i];
- r->skip_next = j;
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) {
+ F_SET(dsk, WT_PAGE_LAS_UPDATE);
+ r->cache_write_lookaside = 1;
+ }
/*
- * If we had to skip updates in order to build this disk image, we can't
- * actually write it. Instead, we will re-instantiate the page using the
- * disk image and the list of updates we skipped.
+ * If using the save/restore eviction path and we had to skip updates in
+ * order to build this disk image, we can't actually write it. Instead,
+ * we will re-instantiate the page using the disk image and the list of
+ * updates we skipped.
*/
- if (bnd->skip != NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ r->cache_write_restore = 1;
+
/*
* If the buffer is compressed (raw compression was configured),
* we have to decompress it so we can instantiate it later. It's
@@ -2963,12 +3275,148 @@ skip_check_complete:
WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
bnd->addr.size = (uint8_t)addr_size;
+ /*
+ * If using the lookaside table eviction path and we found updates that
+ * weren't globally visible when reconciling this page, copy them into
+ * the database's lookaside store.
+ */
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL)
+ ret = __rec_update_las(session, r, btree->id, bnd);
+
done:
err: __wt_scr_free(session, &key);
return (ret);
}
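
The two-line shuffle near supd_check_complete above is a plain in-place compaction: after the first i saved updates move to a block, the remainder slides to the front so the array stays sorted and new entries can simply be appended. A self-contained sketch with a stand-in struct (the real type is WT_SAVE_UPD):

#include <stdint.h>

struct save_upd { void *ins, *rip; uint64_t onpage_txn; };	/* stand-in */

static void
supd_compact(struct save_upd *supd, uint32_t *supd_nextp, uint32_t i)
{
	uint32_t j;

	/* Slide the unmoved, still-sorted entries to the front. */
	for (j = 0; i < *supd_nextp; ++j, ++i)
		supd[j] = supd[i];
	*supd_nextp = j;
}
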
/*
+ * __rec_update_las --
+ * Copy a set of updates into the database's lookaside buffer.
+ */
+static int
+__rec_update_las(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_ITEM las_addr, las_value;
+ WT_PAGE *page;
+ WT_SAVE_UPD *list;
+ WT_UPDATE *upd;
+ uint64_t las_counter;
+ uint32_t i, session_flags, slot;
+ uint8_t *p;
+
+ cursor = NULL;
+ WT_CLEAR(las_addr);
+ WT_CLEAR(las_value);
+ page = r->page;
+
+ /*
+ * We're writing lookaside records: start instantiating them on pages
+ * we read (with the right flag set), and start sweeping the file.
+ */
+ __wt_las_set_written(session);
+
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /* Ensure enough room for a column-store key without checking. */
+ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+
+ /*
+ * Each key in the lookaside table is associated with a block, and those
+ * blocks are freed and reallocated to other pages as pages in the tree
+ * are modified and reconciled. We want to be sure we don't add records
+ * to the lookaside table, then discard the block to which they apply,
+ * then write a new block to the same address, and then apply the old
+ * records to the new block when it's read. We don't want to clean old
+ * records out of the lookaside table every time we free a block because
+ * that happens a lot and would be costly; instead, we clean out the old
+ * records when adding new records into the lookaside table. This works
+ * because we only read from the lookaside table for pages marked with
+ * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a
+ * block with no lookaside records, so the lookaside table won't be
+ * checked when the block is read, even if there are lookaside table
+ * records matching that block. If we rewrite a block that has lookaside
+ * records, we'll run this code, discarding any old records that might
+ * exist.
+ */
+ WT_ERR(__wt_las_remove_block(
+ session, cursor, btree_id, bnd->addr.addr, bnd->addr.size));
+
+ /* Lookaside table key component: block address. */
+ las_addr.data = bnd->addr.addr;
+ las_addr.size = bnd->addr.size;
+
+ /* Enter each update in the boundary's list into the lookaside store. */
+ for (las_counter = 0, i = 0,
+ list = bnd->supd; i < bnd->supd_next; ++i, ++list) {
+ /* Lookaside table key component: source key. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = key->mem;
+ WT_ERR(
+ __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
+ key->size = WT_PTRDIFF(p, key->data);
+
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (list->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, list->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(list->ins);
+ key->size = WT_INSERT_KEY_SIZE(list->ins);
+ }
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Lookaside table value component: update reference. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ upd = list->ins->upd;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (list->ins == NULL) {
+ slot = WT_ROW_SLOT(page, list->rip);
+ upd = page->pg_row_upd[slot];
+ } else
+ upd = list->ins->upd;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Walk the list of updates, storing each key/value pair into
+ * the lookaside table.
+ */
+ do {
+ cursor->set_key(cursor, btree_id,
+ &las_addr, ++las_counter, list->onpage_txn, key);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ las_value.size = 0;
+ else {
+ las_value.data = WT_UPDATE_DATA(upd);
+ las_value.size = upd->size;
+ }
+ cursor->set_value(
+ cursor, upd->txnid, upd->size, &las_value);
+
+ WT_ERR(cursor->insert(cursor));
+ } while ((upd = upd->next) != NULL);
+ }
+
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_scr_free(session, &key);
+ return (ret);
+}
+
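
The set_key/set_value calls above imply the following lookaside record shape; the field names here are editor inventions, since the actual key/value formats belong to the lookaside cursor and aren't shown in this hunk. Because the block address sits at the front of the key (after the tree id), __wt_las_remove_block can drop every stale record for a reused address as one contiguous range before the new records are inserted.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch, not part of the patch. */
struct las_key {
	uint32_t btree_id;	/* which tree */
	const void *addr;	/* block address the page was written to */
	size_t addr_size;
	uint64_t counter;	/* per-block insertion order */
	uint64_t onpage_txn;	/* txn of the value left on the page */
	const void *key;	/* row key, or packed recno */
	size_t key_size;
};

struct las_value {
	uint64_t txnid;		/* update's transaction id */
	uint32_t size;		/* update size; 0 means deleted */
	const void *data;
};
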
+/*
* __wt_bulk_init --
* Bulk insert initialization.
*/
@@ -3008,7 +3456,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
recno = 1;
break;
case BTREE_ROW:
- recno = 0;
+ recno = WT_RECNO_OOB;
break;
WT_ILLEGAL_VALUE(session);
}
@@ -3049,6 +3497,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_split_finish(session, r));
WT_RET(__rec_write_wrapup(session, r, r->page));
+ WT_RET(__rec_write_status(session, r, r->page));
/* Mark the page's parent and the tree dirty. */
parent = r->ref->home;
@@ -3824,7 +4273,7 @@ record_loop: /*
* Write a placeholder.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
data = "@";
size = 1;
@@ -4207,7 +4656,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
vtype = state == WT_CHILD_PROXY ?
WT_CELL_ADDR_DEL : (u_int)vpack->raw;
}
- __rec_cell_build_addr(r, p, size, vtype, 0);
+ __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB);
CHILD_RELEASE_ERR(session, hazard, ref);
/*
@@ -4294,7 +4743,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr = &multi->addr;
__rec_cell_build_addr(
- r, addr->addr, addr->size, __rec_vtype(addr), 0);
+ r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
/* Boundary: split or write the page. */
if (key->len + val->len > r->space_avail)
@@ -4450,7 +4899,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* Assert the case.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
/*
* If the key is also a removed overflow item,
@@ -4777,13 +5226,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
static int
__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_BM *bm;
WT_DECL_RET;
WT_PAGE_MODIFY *mod;
WT_MULTI *multi;
uint32_t i;
- bm = S2BT(session)->bm;
mod = page->modify;
/*
@@ -4799,17 +5246,17 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- if (multi->skip == NULL) {
+ if (multi->supd == NULL) {
if (multi->addr.reuse)
multi->addr.addr = NULL;
else {
- WT_RET(bm->free(bm, session,
+ WT_RET(__rec_block_free(session,
multi->addr.addr, multi->addr.size));
__wt_free(session, multi->addr.addr);
}
} else {
- __wt_free(session, multi->skip);
- __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->supd_dsk);
}
}
__wt_free(session, mod->mod_multi);
@@ -4882,7 +5329,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
WT_RET(__wt_ref_info(
session, ref, &addr, &addr_size, NULL));
- WT_RET(bm->free(bm, session, addr, addr_size));
+ WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(
session, ((WT_ADDR *)ref->addr)->addr);
@@ -4908,7 +5355,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* are checkpoints, and must be explicitly dropped.
*/
if (!__wt_ref_is_root(ref))
- WT_RET(bm->free(bm, session,
+ WT_RET(__rec_block_free(session,
mod->mod_replace.addr, mod->mod_replace.size));
/* Discard the replacement page's address. */
@@ -4962,14 +5409,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* nothing to write. Allocate, then initialize the array of
* replacement blocks.
*/
- if (bnd->skip != NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
WT_RET(__wt_calloc_def(
session, r->bnd_next, &mod->mod_multi));
multi = mod->mod_multi;
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
bnd->dsk = NULL;
mod->mod_multi_entries = 1;
@@ -5068,50 +5515,6 @@ err: __wt_scr_free(session, &tkey);
F_SET(mod, WT_PM_REC_MULTIBLOCK);
break;
}
-
- /*
- * If updates were skipped, the tree isn't clean. The checkpoint call
- * cleared the tree's modified value before calling the eviction thread,
- * so we must explicitly reset the tree's modified flag. We insert a
- * barrier after the change for clarity (the requirement is the value
- * be set before a subsequent checkpoint reads it, and because the
- * current checkpoint is waiting on this reconciliation to complete,
- * there's no risk of that happening).
- */
- if (r->leave_dirty) {
- mod->first_dirty_txn = r->first_dirty_txn;
-
- btree->modified = 1;
- WT_FULL_BARRIER();
- } else {
- /*
- * If no updates were skipped, we have a new maximum transaction
- * written for the page (used to decide if a clean page can be
- * evicted). Set the highest transaction ID for the page.
- *
- * Track the highest transaction ID for the tree (used to decide
- * if it's safe to discard all of the pages in the tree without
- * further checking). Reconciliation in the service of eviction
- * is multi-threaded, only update the tree's maximum transaction
- * ID when doing a checkpoint. That's sufficient, we only care
- * about the highest transaction ID of any update currently in
- * the tree, and checkpoint visits every dirty page in the tree.
- */
- mod->rec_max_txn = r->max_txn;
- if (!F_ISSET(r, WT_EVICTING) &&
- WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
- btree->rec_max_txn = r->max_txn;
-
- /*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean. If the
- * write generation changed, the page has been written since
- * we started reconciliation and remains dirty.
- */
- if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0))
- __wt_cache_dirty_decr(session, page);
- }
-
return (0);
}
@@ -5122,14 +5525,12 @@ err: __wt_scr_free(session, &tkey);
static int
__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
- WT_BM *bm;
WT_BOUNDARY *bnd;
WT_DECL_RET;
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
uint32_t i;
- bm = S2BT(session)->bm;
mod = page->modify;
/*
@@ -5160,7 +5561,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (bnd->addr.reuse)
bnd->addr.addr = NULL;
else {
- WT_TRET(bm->free(bm, session,
+ WT_TRET(__rec_block_free(session,
bnd->addr.addr, bnd->addr.size));
__wt_free(session, bnd->addr.addr);
}
@@ -5203,18 +5604,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_row_ikey_alloc(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
- if (bnd->skip == NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ } else {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
- } else {
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
- bnd->dsk = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
@@ -5243,18 +5644,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
multi->key.recno = bnd->recno;
- if (bnd->skip == NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ } else {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
- } else {
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
- bnd->dsk = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index a36fd696079..d091a5d94da 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -29,8 +29,8 @@ __schema_add_table(WT_SESSION_IMPL *session,
WT_RET(ret);
bucket = table->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_INSERT_HEAD(&session->tables, table, l);
- SLIST_INSERT_HEAD(&session->tablehash[bucket], table, hashl);
+ TAILQ_INSERT_HEAD(&session->tables, table, q);
+ TAILQ_INSERT_HEAD(&session->tablehash[bucket], table, hashq);
*tablep = table;
return (0);
@@ -51,7 +51,7 @@ __schema_find_table(WT_SESSION_IMPL *session,
bucket = __wt_hash_city64(name, namelen) % WT_HASH_ARRAY_SIZE;
restart:
- SLIST_FOREACH(table, &session->tablehash[bucket], hashl) {
+ TAILQ_FOREACH(table, &session->tablehash[bucket], hashq) {
tablename = table->name;
(void)WT_PREFIX_SKIP(tablename, "table:");
if (WT_STRING_MATCH(tablename, name, namelen)) {
@@ -228,8 +228,8 @@ __wt_schema_remove_table(WT_SESSION_IMPL *session, WT_TABLE *table)
WT_ASSERT(session, table->refcnt <= 1);
bucket = table->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_REMOVE(&session->tables, table, __wt_table, l);
- SLIST_REMOVE(&session->tablehash[bucket], table, __wt_table, hashl);
+ TAILQ_REMOVE(&session->tables, table, q);
+ TAILQ_REMOVE(&session->tablehash[bucket], table, hashq);
return (__wt_schema_destroy_table(session, &table));
}
@@ -243,7 +243,7 @@ __wt_schema_close_tables(WT_SESSION_IMPL *session)
WT_DECL_RET;
WT_TABLE *table;
- while ((table = SLIST_FIRST(&session->tables)) != NULL)
+ while ((table = TAILQ_FIRST(&session->tables)) != NULL)
WT_TRET(__wt_schema_remove_table(session, table));
return (ret);
}
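
This and the session changes below swap SLIST for TAILQ throughout. TAILQ_REMOVE unlinks in O(1) given the element, while SLIST_REMOVE has to walk the list to find the predecessor, which is presumably the motivation for caches that delete entries mid-list. A self-contained <sys/queue.h> example of the macros the patch now uses:

#include <stdio.h>
#include <sys/queue.h>

struct table {
	const char *name;
	TAILQ_ENTRY(table) q;		/* linkage, as in the patch */
};
TAILQ_HEAD(table_list, table);

int
main(void)
{
	struct table_list head = TAILQ_HEAD_INITIALIZER(head);
	struct table a = { "table:a", { NULL, NULL } };
	struct table *t;

	TAILQ_INSERT_HEAD(&head, &a, q);
	TAILQ_FOREACH(t, &head, q)
		printf("%s\n", t->name);
	TAILQ_REMOVE(&head, &a, q);	/* O(1): no list walk needed */
	return (0);
}
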
diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c
index dea797f823d..e9439abe16f 100644
--- a/src/schema/schema_stat.c
+++ b/src/schema/schema_stat.c
@@ -90,7 +90,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
if (i == 0)
*stats = *new;
else
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
@@ -102,7 +102,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
WT_ERR(__wt_curstat_open(
session, buf->data, cfg, &stat_cursor));
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
diff --git a/src/session/session_api.c b/src/session/session_api.c
index ef9735a8b98..a1f5618a317 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -383,6 +383,22 @@ err: if (cursor != NULL)
}
/*
+ * __wt_session_create --
+ * Internal version of WT_SESSION::create.
+ */
+int
+__wt_session_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_TABLE_LOCK(session,
+ ret = __wt_schema_create(session, uri, config)));
+ return (ret);
+}
+
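
__wt_session_create factors the locked body out of the public method so internal callers, already inside an API call, can create objects with the canonical lock order: schema lock first, then table lock (__wt_session_drop below follows the same pattern). A minimal sketch of the macro-nesting idea, with hypothetical lock_acquire/lock_release/schema_create helpers; the real WT_WITH_* macros are more careful (for example, about locks already held):

typedef int lock_t;					/* hypothetical */
static void lock_acquire(lock_t *l) { (void)l; }	/* hypothetical */
static void lock_release(lock_t *l) { (void)l; }	/* hypothetical */

struct session { lock_t schema_lock, table_lock; };

static int
schema_create(struct session *s, const char *uri, const char *cfg)
{
	(void)s; (void)uri; (void)cfg;
	return (0);					/* hypothetical */
}

/* Run an expression while holding a lock, like WT_WITH_SCHEMA_LOCK. */
#define WITH_LOCK(lock, op) do {				\
	lock_acquire(lock);					\
	op;							\
	lock_release(lock);					\
} while (0)

static int
session_create_locked(struct session *s, const char *uri, const char *cfg)
{
	int ret;

	WITH_LOCK(&s->schema_lock,
	    WITH_LOCK(&s->table_lock,
		ret = schema_create(s, uri, cfg)));
	return (ret);
}
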
+/*
* __session_create --
* WT_SESSION->create method.
*/
@@ -423,9 +439,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config)
WT_ERR_NOTFOUND_OK(ret);
}
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- ret = __wt_schema_create(session, uri, config)));
+ ret = __wt_session_create(session, uri, config);
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -529,6 +543,21 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
}
/*
+ * __wt_session_drop --
+ * Internal version of WT_SESSION::drop.
+ */
+int
+__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_TABLE_LOCK(session,
+ ret = __wt_schema_drop(session, uri, cfg)));
+ return (ret);
+}
+
+/*
* __session_drop --
* WT_SESSION->drop method.
*/
@@ -544,9 +573,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
/* Disallow objects in the WiredTiger name space. */
WT_ERR(__wt_str_name_check(session, uri));
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- ret = __wt_schema_drop(session, uri, cfg)));
+ ret = __wt_session_drop(session, uri, cfg);
err: /* Note: drop operations cannot be unrolled (yet?). */
API_END_RET_NOTFOUND_MAP(session, ret);
@@ -800,7 +827,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config)
WT_STAT_FAST_CONN_INCR(session, txn_commit);
txn = &session->txn;
- if (F_ISSET(txn, WT_TXN_ERROR)) {
+ if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) {
__wt_errx(session, "failed transaction requires rollback");
ret = EINVAL;
}
@@ -915,7 +942,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
* If our LSN is smaller than the current sync LSN then our
* transaction is stable. We're done.
*/
- if (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) <= 0)
+ if (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) <= 0)
goto err;
/*
@@ -937,7 +964,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
* Keep checking the LSNs until we find it is stable or we reach
* our timeout.
*/
- while (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
+ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
WT_ERR(__wt_epoch(session, &now));
waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION;
@@ -1001,7 +1028,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
* operations, but checkpoint does enough I/O it may be called upon to
* perform slow operations for the block manager.
*/
- F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
/*
* Only one checkpoint can be active at a time, and checkpoints must run
@@ -1016,7 +1043,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
-err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -1166,8 +1193,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
if (i == conn->session_size)
WT_ERR_MSG(session, ENOMEM,
"only configured to support %" PRIu32 " sessions"
- " (including %" PRIu32 " internal)",
- conn->session_size, WT_NUM_INTERNAL_SESSIONS);
+ " (including %d additional internal sessions)",
+ conn->session_size, WT_EXTRA_INTERNAL_SESSIONS);
/*
* If the active session count is increasing, update it. We don't worry
@@ -1190,7 +1217,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
event_handler == NULL ? session->event_handler : event_handler);
TAILQ_INIT(&session_ret->cursors);
- SLIST_INIT(&session_ret->dhandles);
+ TAILQ_INIT(&session_ret->dhandles);
/*
* If we don't have one, allocate the dhandle hash array.
* Allocate the table hash array as well.
@@ -1202,8 +1229,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE,
sizeof(struct __tables_hash), &session_ret->tablehash));
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) {
- SLIST_INIT(&session_ret->dhhash[i]);
- SLIST_INIT(&session_ret->tablehash[i]);
+ TAILQ_INIT(&session_ret->dhhash[i]);
+ TAILQ_INIT(&session_ret->tablehash[i]);
}
/* Initialize transaction support: default to read-committed. */
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index be8ca494778..dd0b50cc094 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -25,8 +25,8 @@ __session_add_dhandle(
dhandle_cache->dhandle = session->dhandle;
bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_INSERT_HEAD(&session->dhandles, dhandle_cache, l);
- SLIST_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashl);
+ TAILQ_INSERT_HEAD(&session->dhandles, dhandle_cache, q);
+ TAILQ_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashq);
if (dhandle_cachep != NULL)
*dhandle_cachep = dhandle_cache;
@@ -36,6 +36,61 @@ __session_add_dhandle(
}
/*
+ * __session_discard_dhandle --
+ * Remove a data handle from the session cache.
+ */
+static void
+__session_discard_dhandle(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
+{
+ uint64_t bucket;
+
+ bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
+ TAILQ_REMOVE(&session->dhandles, dhandle_cache, q);
+ TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq);
+
+ (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1);
+
+ __wt_overwrite_and_free(session, dhandle_cache);
+}
+
+/*
+ * __session_find_dhandle --
+ * Search for a data handle in the session cache.
+ */
+static void
+__session_find_dhandle(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint,
+ WT_DATA_HANDLE_CACHE **dhandle_cachep)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+ uint64_t bucket;
+
+ dhandle = NULL;
+
+ bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
+retry: TAILQ_FOREACH(dhandle_cache, &session->dhhash[bucket], hashq) {
+ dhandle = dhandle_cache->dhandle;
+ if (WT_DHANDLE_INACTIVE(dhandle) && !WT_IS_METADATA(dhandle)) {
+ __session_discard_dhandle(session, dhandle_cache);
+		/* We deleted our entry; retry from the start. */
+ goto retry;
+ }
+
+ if (strcmp(uri, dhandle->name) != 0)
+ continue;
+ if (checkpoint == NULL && dhandle->checkpoint == NULL)
+ break;
+ if (checkpoint != NULL && dhandle->checkpoint != NULL &&
+ strcmp(checkpoint, dhandle->checkpoint) == 0)
+ break;
+ }
+
+ *dhandle_cachep = dhandle_cache;
+}
+
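
A minimal sketch of the delete-and-retry idiom __session_find_dhandle uses (stand-in types; the real code also refuses to discard the metadata handle). Removing the current element invalidates a plain TAILQ_FOREACH iterator, so the walk restarts from the bucket head; TAILQ_FOREACH_SAFE would avoid the rescan, but hash buckets are presumably short enough that restarting is simpler.

#include <sys/queue.h>

struct entry {
	int dead;			/* stand-in for WT_DHANDLE_INACTIVE */
	TAILQ_ENTRY(entry) hashq;
};
TAILQ_HEAD(bucket, entry);

static void
bucket_prune(struct bucket *b)
{
	struct entry *e;

retry:	TAILQ_FOREACH(e, b, hashq)
		if (e->dead) {
			TAILQ_REMOVE(b, e, hashq);
			/* The iterator is now invalid: restart. */
			goto retry;
		}
}
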
+/*
* __wt_session_lock_dhandle --
* Return when the current data handle is either (a) open with the
* requested lock mode; or (b) closed and write locked. If exclusive
@@ -173,6 +228,7 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
WT_DECL_RET;
int locked, write_locked;
@@ -185,6 +241,13 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
* If we had special flags set, close the handle so that future access
* can get a handle without special flags.
*/
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) {
+ __session_find_dhandle(session,
+ dhandle->name, dhandle->checkpoint, &dhandle_cache);
+ if (dhandle_cache != NULL)
+ __session_discard_dhandle(session, dhandle_cache);
+ }
+
if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) {
ret = __wt_conn_btree_sync_and_close(session, 0, 1);
F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE);
@@ -272,26 +335,6 @@ retry: WT_RET(__wt_meta_checkpoint_last_name(
}
/*
- * __session_discard_btree --
- * Discard our reference to the btree.
- */
-static void
-__session_discard_btree(
- WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
-{
- uint64_t bucket;
-
- bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_REMOVE(
- &session->dhandles, dhandle_cache, __wt_data_handle_cache, l);
- SLIST_REMOVE(&session->dhhash[bucket],
- dhandle_cache, __wt_data_handle_cache, hashl);
-
- (void)WT_ATOMIC_SUB4(dhandle_cache->dhandle->session_ref, 1);
- __wt_overwrite_and_free(session, dhandle_cache);
-}
-
-/*
* __wt_session_close_cache --
* Close any cached handles in a session.
*/
@@ -300,8 +343,8 @@ __wt_session_close_cache(WT_SESSION_IMPL *session)
{
WT_DATA_HANDLE_CACHE *dhandle_cache;
- while ((dhandle_cache = SLIST_FIRST(&session->dhandles)) != NULL)
- __session_discard_btree(session, dhandle_cache);
+ while ((dhandle_cache = TAILQ_FIRST(&session->dhandles)) != NULL)
+ __session_discard_dhandle(session, dhandle_cache);
}
/*
@@ -329,18 +372,18 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps);
- dhandle_cache = SLIST_FIRST(&session->dhandles);
+ dhandle_cache = TAILQ_FIRST(&session->dhandles);
while (dhandle_cache != NULL) {
- dhandle_cache_next = SLIST_NEXT(dhandle_cache, l);
+ dhandle_cache_next = TAILQ_NEXT(dhandle_cache, q);
dhandle = dhandle_cache->dhandle;
if (dhandle != session->dhandle &&
dhandle->session_inuse == 0 &&
- (F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
+ (WT_DHANDLE_INACTIVE(dhandle) ||
(dhandle->timeofdeath != 0 &&
now - dhandle->timeofdeath > conn->sweep_idle_time))) {
WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
WT_ASSERT(session, !WT_IS_METADATA(dhandle));
- __session_discard_btree(session, dhandle_cache);
+ __session_discard_dhandle(session, dhandle_cache);
}
dhandle_cache = dhandle_cache_next;
}
@@ -348,51 +391,37 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
}
/*
- * __session_dhandle_find_shared --
+ * __session_find_shared_dhandle --
* Search for a data handle in the connection and add it to a session's
* cache. Since the data handle isn't locked, this must be called holding
* the handle list lock, and we must increment the handle's reference
* count before releasing it.
*/
static int
-__session_dhandle_find_shared(
+__session_find_shared_dhandle(
WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint));
- (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1);
+ (void)__wt_atomic_add32(&session->dhandle->session_ref, 1);
return (0);
}
+
/*
- * __session_dhandle_find --
+ * __session_get_dhandle --
* Search for a data handle, first in the session cache, then in the
* connection.
*/
static int
-__session_dhandle_find(
+__session_get_dhandle(
WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
- WT_DATA_HANDLE *dhandle;
WT_DATA_HANDLE_CACHE *dhandle_cache;
WT_DECL_RET;
- uint64_t bucket;
- bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
-retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) {
- dhandle = dhandle_cache->dhandle;
- if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
- WT_ASSERT(session, !WT_IS_METADATA(dhandle));
- __session_discard_btree(session, dhandle_cache);
- /* We deleted our entry, retry from the start. */
- goto retry;
- }
- if (strcmp(uri, dhandle->name) != 0)
- continue;
- if ((checkpoint == NULL && dhandle->checkpoint == NULL) ||
- (checkpoint != NULL && dhandle->checkpoint != NULL &&
- strcmp(checkpoint, dhandle->checkpoint) == 0)) {
- session->dhandle = dhandle;
- return (0);
- }
+ __session_find_dhandle(session, uri, checkpoint, &dhandle_cache);
+ if (dhandle_cache != NULL) {
+ session->dhandle = dhandle_cache->dhandle;
+ return (0);
}
/*
@@ -400,7 +429,7 @@ retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) {
* handle list and cache the handle we find.
*/
WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __session_dhandle_find_shared(session, uri, checkpoint));
+ __session_find_shared_dhandle(session, uri, checkpoint));
if (ret == 0)
ret = __session_add_dhandle(session, NULL);
@@ -422,7 +451,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
for (;;) {
- WT_RET(__session_dhandle_find(session, uri, checkpoint));
+ WT_RET(__session_get_dhandle(session, uri, checkpoint));
dhandle = session->dhandle;
/*
diff --git a/src/support/pow.c b/src/support/pow.c
index 8e42113a2ee..0f50bfe56a1 100644
--- a/src/support/pow.c
+++ b/src/support/pow.c
@@ -100,7 +100,7 @@ __wt_log2_int(uint32_t n)
* __wt_ispo2 --
* Return if a number is a power-of-two.
*/
-int
+bool
__wt_ispo2(uint32_t v)
{
/*
diff --git a/src/support/rand.c b/src/support/rand.c
index caac04d3529..f5ecb12633e 100644
--- a/src/support/rand.c
+++ b/src/support/rand.c
@@ -84,8 +84,11 @@ __wt_random(WT_RAND_STATE volatile * rnd_state)
* to initialize the state, or initializes with a seed that results in a
* short period.
*/
- if (z == 0 || w == 0)
- __wt_random_init(rnd_state);
+ if (z == 0 || w == 0) {
+ __wt_random_init(&rnd);
+ w = M_W(rnd);
+ z = M_Z(rnd);
+ }
M_Z(rnd) = z = 36969 * (z & 65535) + (z >> 16);
M_W(rnd) = w = 18000 * (w & 65535) + (w >> 16);
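
The two recurrences are Marsaglia's multiply-with-carry pair. Zero is a fixed point of each step (36969 * 0 + 0 == 0), so a zeroed word never escapes and the period collapses; that is why the state is re-seeded, and the fix also refreshes the local copies w and z, which the old code left stale after re-initializing the shared state. A self-contained sketch; the (z << 16) + w combining step is the textbook form, the patch's actual return expression is outside this hunk:

#include <stdint.h>

static uint32_t m_z = 362436069, m_w = 521288629;	/* any nonzero seeds */

static uint32_t
mwc_random(void)
{
	/* A zero word would stay zero forever: re-seed both. */
	if (m_z == 0 || m_w == 0) {
		m_z = 362436069;
		m_w = 521288629;
	}
	m_z = 36969 * (m_z & 65535) + (m_z >> 16);
	m_w = 18000 * (m_w & 65535) + (m_w >> 16);
	return ((m_z << 16) + m_w);
}
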
diff --git a/src/support/stat.c b/src/support/stat.c
index b0e7d660587..79248b0652c 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -2,672 +2,1016 @@
#include "wt_internal.h"
+static const char * const __stats_dsrc_desc[] = {
+ "block-manager: file allocation unit size",
+ "block-manager: blocks allocated",
+ "block-manager: checkpoint size",
+ "block-manager: allocations requiring file extension",
+ "block-manager: blocks freed",
+ "block-manager: file magic number",
+ "block-manager: file major version number",
+ "block-manager: minor version number",
+ "block-manager: file bytes available for reuse",
+ "block-manager: file size in bytes",
+ "LSM: bloom filters in the LSM tree",
+ "LSM: bloom filter false positives",
+ "LSM: bloom filter hits",
+ "LSM: bloom filter misses",
+ "LSM: bloom filter pages evicted from cache",
+ "LSM: bloom filter pages read into cache",
+ "LSM: total size of bloom filters",
+ "btree: btree checkpoint generation",
+ "btree: column-store variable-size deleted values",
+ "btree: column-store fixed-size leaf pages",
+ "btree: column-store internal pages",
+ "btree: column-store variable-size RLE encoded values",
+ "btree: column-store variable-size leaf pages",
+ "btree: pages rewritten by compaction",
+ "btree: number of key/value pairs",
+ "btree: fixed-record size",
+ "btree: maximum tree depth",
+ "btree: maximum internal page key size",
+ "btree: maximum internal page size",
+ "btree: maximum leaf page key size",
+ "btree: maximum leaf page size",
+ "btree: maximum leaf page value size",
+ "btree: overflow pages",
+ "btree: row-store internal pages",
+ "btree: row-store leaf pages",
+ "cache: bytes read into cache",
+ "cache: bytes written from cache",
+ "cache: checkpoint blocked page eviction",
+ "cache: unmodified pages evicted",
+ "cache: page split during eviction deepened the tree",
+ "cache: modified pages evicted",
+ "cache: data source pages selected for eviction unable to be evicted",
+ "cache: hazard pointer blocked page eviction",
+ "cache: internal pages evicted",
+ "cache: pages split during eviction",
+ "cache: in-memory page splits",
+ "cache: in-memory page passed criteria to be split",
+ "cache: overflow values cached in memory",
+ "cache: pages read into cache",
+ "cache: pages read into cache requiring lookaside entries",
+ "cache: overflow pages read into cache",
+ "cache: pages written from cache",
+ "cache: page written requiring lookaside records",
+ "cache: pages written requiring in-memory restoration",
+ "compression: raw compression call failed, no additional data available",
+ "compression: raw compression call failed, additional data available",
+ "compression: raw compression call succeeded",
+ "compression: compressed pages read",
+ "compression: compressed pages written",
+ "compression: page written failed to compress",
+ "compression: page written was too small to compress",
+ "cursor: create calls",
+ "cursor: insert calls",
+ "cursor: bulk-loaded cursor-insert calls",
+ "cursor: cursor-insert key and value bytes inserted",
+ "cursor: next calls",
+ "cursor: prev calls",
+ "cursor: remove calls",
+ "cursor: cursor-remove key bytes removed",
+ "cursor: reset calls",
+ "cursor: restarted searches",
+ "cursor: search calls",
+ "cursor: search near calls",
+ "cursor: update calls",
+ "cursor: cursor-update value bytes updated",
+ "LSM: sleep for LSM checkpoint throttle",
+ "LSM: chunks in the LSM tree",
+ "LSM: highest merge generation in the LSM tree",
+ "LSM: queries that could have benefited from a Bloom filter that did not exist",
+ "LSM: sleep for LSM merge throttle",
+ "reconciliation: dictionary matches",
+ "reconciliation: internal page multi-block writes",
+ "reconciliation: leaf page multi-block writes",
+ "reconciliation: maximum blocks required for a page",
+ "reconciliation: internal-page overflow keys",
+ "reconciliation: leaf-page overflow keys",
+ "reconciliation: overflow values written",
+ "reconciliation: pages deleted",
+ "reconciliation: page checksum matches",
+ "reconciliation: page reconciliation calls",
+ "reconciliation: page reconciliation calls for eviction",
+ "reconciliation: leaf page key bytes discarded using prefix compression",
+ "reconciliation: internal page key bytes discarded using suffix compression",
+ "session: object compaction",
+ "session: open cursor count",
+ "transaction: update conflicts",
+};
+
+const char *
+__wt_stat_dsrc_desc(int slot)
+{
+ return (__stats_dsrc_desc[slot]);
+}
+
void
-__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
+__wt_stat_dsrc_init_single(WT_DSRC_STATS *stats)
{
- /* Clear, so can also be called for reinitialization. */
memset(stats, 0, sizeof(*stats));
+}
+
+void
+__wt_stat_dsrc_init(WT_DATA_HANDLE *handle)
+{
+ int i;
- stats->block_extension.desc =
- "block-manager: allocations requiring file extension";
- stats->block_alloc.desc = "block-manager: blocks allocated";
- stats->block_free.desc = "block-manager: blocks freed";
- stats->block_checkpoint_size.desc = "block-manager: checkpoint size";
- stats->allocation_size.desc =
- "block-manager: file allocation unit size";
- stats->block_reuse_bytes.desc =
- "block-manager: file bytes available for reuse";
- stats->block_magic.desc = "block-manager: file magic number";
- stats->block_major.desc = "block-manager: file major version number";
- stats->block_size.desc = "block-manager: file size in bytes";
- stats->block_minor.desc = "block-manager: minor version number";
- stats->btree_checkpoint_generation.desc =
- "btree: btree checkpoint generation";
- stats->btree_column_fix.desc =
- "btree: column-store fixed-size leaf pages";
- stats->btree_column_internal.desc =
- "btree: column-store internal pages";
- stats->btree_column_deleted.desc =
- "btree: column-store variable-size deleted values";
- stats->btree_column_variable.desc =
- "btree: column-store variable-size leaf pages";
- stats->btree_fixed_len.desc = "btree: fixed-record size";
- stats->btree_maxintlkey.desc = "btree: maximum internal page key size";
- stats->btree_maxintlpage.desc = "btree: maximum internal page size";
- stats->btree_maxleafkey.desc = "btree: maximum leaf page key size";
- stats->btree_maxleafpage.desc = "btree: maximum leaf page size";
- stats->btree_maxleafvalue.desc = "btree: maximum leaf page value size";
- stats->btree_maximum_depth.desc = "btree: maximum tree depth";
- stats->btree_entries.desc = "btree: number of key/value pairs";
- stats->btree_overflow.desc = "btree: overflow pages";
- stats->btree_compact_rewrite.desc =
- "btree: pages rewritten by compaction";
- stats->btree_row_internal.desc = "btree: row-store internal pages";
- stats->btree_row_leaf.desc = "btree: row-store leaf pages";
- stats->cache_bytes_read.desc = "cache: bytes read into cache";
- stats->cache_bytes_write.desc = "cache: bytes written from cache";
- stats->cache_eviction_checkpoint.desc =
- "cache: checkpoint blocked page eviction";
- stats->cache_eviction_fail.desc =
- "cache: data source pages selected for eviction unable to be evicted";
- stats->cache_eviction_hazard.desc =
- "cache: hazard pointer blocked page eviction";
- stats->cache_inmem_split.desc = "cache: in-memory page splits";
- stats->cache_eviction_internal.desc = "cache: internal pages evicted";
- stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
- stats->cache_read_overflow.desc =
- "cache: overflow pages read into cache";
- stats->cache_overflow_value.desc =
- "cache: overflow values cached in memory";
- stats->cache_eviction_deepen.desc =
- "cache: page split during eviction deepened the tree";
- stats->cache_read.desc = "cache: pages read into cache";
- stats->cache_eviction_split.desc =
- "cache: pages split during eviction";
- stats->cache_write.desc = "cache: pages written from cache";
- stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
- stats->compress_read.desc = "compression: compressed pages read";
- stats->compress_write.desc = "compression: compressed pages written";
- stats->compress_write_fail.desc =
- "compression: page written failed to compress";
- stats->compress_write_too_small.desc =
- "compression: page written was too small to compress";
- stats->compress_raw_fail_temporary.desc =
- "compression: raw compression call failed, additional data available";
- stats->compress_raw_fail.desc =
- "compression: raw compression call failed, no additional data available";
- stats->compress_raw_ok.desc =
- "compression: raw compression call succeeded";
- stats->cursor_insert_bulk.desc =
- "cursor: bulk-loaded cursor-insert calls";
- stats->cursor_create.desc = "cursor: create calls";
- stats->cursor_insert_bytes.desc =
- "cursor: cursor-insert key and value bytes inserted";
- stats->cursor_remove_bytes.desc =
- "cursor: cursor-remove key bytes removed";
- stats->cursor_update_bytes.desc =
- "cursor: cursor-update value bytes updated";
- stats->cursor_insert.desc = "cursor: insert calls";
- stats->cursor_next.desc = "cursor: next calls";
- stats->cursor_prev.desc = "cursor: prev calls";
- stats->cursor_remove.desc = "cursor: remove calls";
- stats->cursor_reset.desc = "cursor: reset calls";
- stats->cursor_search.desc = "cursor: search calls";
- stats->cursor_search_near.desc = "cursor: search near calls";
- stats->cursor_update.desc = "cursor: update calls";
- stats->bloom_false_positive.desc = "LSM: bloom filter false positives";
- stats->bloom_hit.desc = "LSM: bloom filter hits";
- stats->bloom_miss.desc = "LSM: bloom filter misses";
- stats->bloom_page_evict.desc =
- "LSM: bloom filter pages evicted from cache";
- stats->bloom_page_read.desc =
- "LSM: bloom filter pages read into cache";
- stats->bloom_count.desc = "LSM: bloom filters in the LSM tree";
- stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree";
- stats->lsm_generation_max.desc =
- "LSM: highest merge generation in the LSM tree";
- stats->lsm_lookup_no_bloom.desc =
- "LSM: queries that could have benefited from a Bloom filter that did not exist";
- stats->lsm_checkpoint_throttle.desc =
- "LSM: sleep for LSM checkpoint throttle";
- stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
- stats->bloom_size.desc = "LSM: total size of bloom filters";
- stats->rec_dictionary.desc = "reconciliation: dictionary matches";
- stats->rec_suffix_compression.desc =
- "reconciliation: internal page key bytes discarded using suffix compression";
- stats->rec_multiblock_internal.desc =
- "reconciliation: internal page multi-block writes";
- stats->rec_overflow_key_internal.desc =
- "reconciliation: internal-page overflow keys";
- stats->rec_prefix_compression.desc =
- "reconciliation: leaf page key bytes discarded using prefix compression";
- stats->rec_multiblock_leaf.desc =
- "reconciliation: leaf page multi-block writes";
- stats->rec_overflow_key_leaf.desc =
- "reconciliation: leaf-page overflow keys";
- stats->rec_multiblock_max.desc =
- "reconciliation: maximum blocks required for a page";
- stats->rec_overflow_value.desc =
- "reconciliation: overflow values written";
- stats->rec_page_match.desc = "reconciliation: page checksum matches";
- stats->rec_pages.desc = "reconciliation: page reconciliation calls";
- stats->rec_pages_eviction.desc =
- "reconciliation: page reconciliation calls for eviction";
- stats->rec_page_delete.desc = "reconciliation: pages deleted";
- stats->session_compact.desc = "session: object compaction";
- stats->session_cursor_open.desc = "session: open cursor count";
- stats->txn_update_conflict.desc = "transaction: update conflicts";
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+ handle->stats[i] = &handle->stat_array[i];
+ __wt_stat_dsrc_init_single(handle->stats[i]);
+ }
}
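
The rewritten statistics support drops the per-field {value, description} pairs: descriptions move into the static, slot-indexed __stats_dsrc_desc array above, and each handle carries WT_COUNTER_SLOTS plain-integer structures, with handle->stats[i] pointing at slot i. Presumably each updating thread works on its own slot so concurrent increments rarely share a cache line, and readers aggregate across slots. A minimal sketch of that layout (stand-in names; WT_STAT_READ in the later hunks plays the stat_read role):

#include <stdint.h>

#define COUNTER_SLOTS	8		/* stand-in for WT_COUNTER_SLOTS */

struct dsrc_stats { int64_t cache_read; /* ...one field per statistic */ };

struct handle {
	struct dsrc_stats stat_array[COUNTER_SLOTS];
	struct dsrc_stats *stats[COUNTER_SLOTS];
};

static void
handle_stats_init(struct handle *h)
{
	int i;

	for (i = 0; i < COUNTER_SLOTS; ++i)
		h->stats[i] = &h->stat_array[i];
}

/* Readers sum a field across all slots. */
static int64_t
stat_read_cache_read(struct handle *h)
{
	int64_t sum;
	int i;

	for (sum = 0, i = 0; i < COUNTER_SLOTS; ++i)
		sum += h->stats[i]->cache_read;
	return (sum);
}
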
void
-__wt_stat_refresh_dsrc_stats(void *stats_arg)
+__wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
{
- WT_DSRC_STATS *stats;
+ stats->block_extension = 0;
+ stats->block_alloc = 0;
+ stats->block_free = 0;
+ stats->block_checkpoint_size = 0;
+ stats->allocation_size = 0;
+ stats->block_reuse_bytes = 0;
+ stats->block_magic = 0;
+ stats->block_major = 0;
+ stats->block_size = 0;
+ stats->block_minor = 0;
+ /* not clearing btree_checkpoint_generation */
+ stats->btree_column_fix = 0;
+ stats->btree_column_internal = 0;
+ stats->btree_column_deleted = 0;
+ stats->btree_column_variable = 0;
+ stats->btree_column_rle = 0;
+ stats->btree_fixed_len = 0;
+ stats->btree_maxintlkey = 0;
+ stats->btree_maxintlpage = 0;
+ stats->btree_maxleafkey = 0;
+ stats->btree_maxleafpage = 0;
+ stats->btree_maxleafvalue = 0;
+ stats->btree_maximum_depth = 0;
+ stats->btree_entries = 0;
+ stats->btree_overflow = 0;
+ stats->btree_compact_rewrite = 0;
+ stats->btree_row_internal = 0;
+ stats->btree_row_leaf = 0;
+ stats->cache_bytes_read = 0;
+ stats->cache_bytes_write = 0;
+ stats->cache_eviction_checkpoint = 0;
+ stats->cache_eviction_fail = 0;
+ stats->cache_eviction_hazard = 0;
+ stats->cache_inmem_splittable = 0;
+ stats->cache_inmem_split = 0;
+ stats->cache_eviction_internal = 0;
+ stats->cache_eviction_dirty = 0;
+ stats->cache_read_overflow = 0;
+ stats->cache_overflow_value = 0;
+ stats->cache_eviction_deepen = 0;
+ stats->cache_write_lookaside = 0;
+ stats->cache_read = 0;
+ stats->cache_read_lookaside = 0;
+ stats->cache_eviction_split = 0;
+ stats->cache_write = 0;
+ stats->cache_write_restore = 0;
+ stats->cache_eviction_clean = 0;
+ stats->compress_read = 0;
+ stats->compress_write = 0;
+ stats->compress_write_fail = 0;
+ stats->compress_write_too_small = 0;
+ stats->compress_raw_fail_temporary = 0;
+ stats->compress_raw_fail = 0;
+ stats->compress_raw_ok = 0;
+ stats->cursor_insert_bulk = 0;
+ stats->cursor_create = 0;
+ stats->cursor_insert_bytes = 0;
+ stats->cursor_remove_bytes = 0;
+ stats->cursor_update_bytes = 0;
+ stats->cursor_insert = 0;
+ stats->cursor_next = 0;
+ stats->cursor_prev = 0;
+ stats->cursor_remove = 0;
+ stats->cursor_reset = 0;
+ stats->cursor_restart = 0;
+ stats->cursor_search = 0;
+ stats->cursor_search_near = 0;
+ stats->cursor_update = 0;
+ stats->bloom_false_positive = 0;
+ stats->bloom_hit = 0;
+ stats->bloom_miss = 0;
+ stats->bloom_page_evict = 0;
+ stats->bloom_page_read = 0;
+ stats->bloom_count = 0;
+ stats->lsm_chunk_count = 0;
+ stats->lsm_generation_max = 0;
+ stats->lsm_lookup_no_bloom = 0;
+ stats->lsm_checkpoint_throttle = 0;
+ stats->lsm_merge_throttle = 0;
+ stats->bloom_size = 0;
+ stats->rec_dictionary = 0;
+ stats->rec_suffix_compression = 0;
+ stats->rec_multiblock_internal = 0;
+ stats->rec_overflow_key_internal = 0;
+ stats->rec_prefix_compression = 0;
+ stats->rec_multiblock_leaf = 0;
+ stats->rec_overflow_key_leaf = 0;
+ stats->rec_multiblock_max = 0;
+ stats->rec_overflow_value = 0;
+ stats->rec_page_match = 0;
+ stats->rec_pages = 0;
+ stats->rec_pages_eviction = 0;
+ stats->rec_page_delete = 0;
+ stats->session_compact = 0;
+ /* not clearing session_cursor_open */
+ stats->txn_update_conflict = 0;
+}
+
+void
+__wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats)
+{
+ u_int i;
- stats = (WT_DSRC_STATS *)stats_arg;
- stats->block_extension.v = 0;
- stats->block_alloc.v = 0;
- stats->block_free.v = 0;
- stats->block_checkpoint_size.v = 0;
- stats->allocation_size.v = 0;
- stats->block_reuse_bytes.v = 0;
- stats->block_magic.v = 0;
- stats->block_major.v = 0;
- stats->block_size.v = 0;
- stats->block_minor.v = 0;
- stats->btree_column_fix.v = 0;
- stats->btree_column_internal.v = 0;
- stats->btree_column_deleted.v = 0;
- stats->btree_column_variable.v = 0;
- stats->btree_fixed_len.v = 0;
- stats->btree_maxintlkey.v = 0;
- stats->btree_maxintlpage.v = 0;
- stats->btree_maxleafkey.v = 0;
- stats->btree_maxleafpage.v = 0;
- stats->btree_maxleafvalue.v = 0;
- stats->btree_maximum_depth.v = 0;
- stats->btree_entries.v = 0;
- stats->btree_overflow.v = 0;
- stats->btree_compact_rewrite.v = 0;
- stats->btree_row_internal.v = 0;
- stats->btree_row_leaf.v = 0;
- stats->cache_bytes_read.v = 0;
- stats->cache_bytes_write.v = 0;
- stats->cache_eviction_checkpoint.v = 0;
- stats->cache_eviction_fail.v = 0;
- stats->cache_eviction_hazard.v = 0;
- stats->cache_inmem_split.v = 0;
- stats->cache_eviction_internal.v = 0;
- stats->cache_eviction_dirty.v = 0;
- stats->cache_read_overflow.v = 0;
- stats->cache_overflow_value.v = 0;
- stats->cache_eviction_deepen.v = 0;
- stats->cache_read.v = 0;
- stats->cache_eviction_split.v = 0;
- stats->cache_write.v = 0;
- stats->cache_eviction_clean.v = 0;
- stats->compress_read.v = 0;
- stats->compress_write.v = 0;
- stats->compress_write_fail.v = 0;
- stats->compress_write_too_small.v = 0;
- stats->compress_raw_fail_temporary.v = 0;
- stats->compress_raw_fail.v = 0;
- stats->compress_raw_ok.v = 0;
- stats->cursor_insert_bulk.v = 0;
- stats->cursor_create.v = 0;
- stats->cursor_insert_bytes.v = 0;
- stats->cursor_remove_bytes.v = 0;
- stats->cursor_update_bytes.v = 0;
- stats->cursor_insert.v = 0;
- stats->cursor_next.v = 0;
- stats->cursor_prev.v = 0;
- stats->cursor_remove.v = 0;
- stats->cursor_reset.v = 0;
- stats->cursor_search.v = 0;
- stats->cursor_search_near.v = 0;
- stats->cursor_update.v = 0;
- stats->bloom_false_positive.v = 0;
- stats->bloom_hit.v = 0;
- stats->bloom_miss.v = 0;
- stats->bloom_page_evict.v = 0;
- stats->bloom_page_read.v = 0;
- stats->bloom_count.v = 0;
- stats->lsm_chunk_count.v = 0;
- stats->lsm_generation_max.v = 0;
- stats->lsm_lookup_no_bloom.v = 0;
- stats->lsm_checkpoint_throttle.v = 0;
- stats->lsm_merge_throttle.v = 0;
- stats->bloom_size.v = 0;
- stats->rec_dictionary.v = 0;
- stats->rec_suffix_compression.v = 0;
- stats->rec_multiblock_internal.v = 0;
- stats->rec_overflow_key_internal.v = 0;
- stats->rec_prefix_compression.v = 0;
- stats->rec_multiblock_leaf.v = 0;
- stats->rec_overflow_key_leaf.v = 0;
- stats->rec_multiblock_max.v = 0;
- stats->rec_overflow_value.v = 0;
- stats->rec_page_match.v = 0;
- stats->rec_pages.v = 0;
- stats->rec_pages_eviction.v = 0;
- stats->rec_page_delete.v = 0;
- stats->session_compact.v = 0;
- stats->txn_update_conflict.v = 0;
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i)
+ __wt_stat_dsrc_clear_single(stats[i]);
+}
+
+void
+__wt_stat_dsrc_aggregate_single(
+ WT_DSRC_STATS *from, WT_DSRC_STATS *to)
+{
+ to->block_extension += from->block_extension;
+ to->block_alloc += from->block_alloc;
+ to->block_free += from->block_free;
+ to->block_checkpoint_size += from->block_checkpoint_size;
+ to->allocation_size = from->allocation_size;
+ to->block_reuse_bytes += from->block_reuse_bytes;
+ to->block_magic = from->block_magic;
+ to->block_major = from->block_major;
+ to->block_size += from->block_size;
+ to->block_minor = from->block_minor;
+ to->btree_checkpoint_generation += from->btree_checkpoint_generation;
+ to->btree_column_fix += from->btree_column_fix;
+ to->btree_column_internal += from->btree_column_internal;
+ to->btree_column_deleted += from->btree_column_deleted;
+ to->btree_column_variable += from->btree_column_variable;
+ to->btree_column_rle += from->btree_column_rle;
+ to->btree_fixed_len = from->btree_fixed_len;
+ if (from->btree_maxintlkey > to->btree_maxintlkey)
+ to->btree_maxintlkey = from->btree_maxintlkey;
+ if (from->btree_maxintlpage > to->btree_maxintlpage)
+ to->btree_maxintlpage = from->btree_maxintlpage;
+ if (from->btree_maxleafkey > to->btree_maxleafkey)
+ to->btree_maxleafkey = from->btree_maxleafkey;
+ if (from->btree_maxleafpage > to->btree_maxleafpage)
+ to->btree_maxleafpage = from->btree_maxleafpage;
+ if (from->btree_maxleafvalue > to->btree_maxleafvalue)
+ to->btree_maxleafvalue = from->btree_maxleafvalue;
+ if (from->btree_maximum_depth > to->btree_maximum_depth)
+ to->btree_maximum_depth = from->btree_maximum_depth;
+ to->btree_entries += from->btree_entries;
+ to->btree_overflow += from->btree_overflow;
+ to->btree_compact_rewrite += from->btree_compact_rewrite;
+ to->btree_row_internal += from->btree_row_internal;
+ to->btree_row_leaf += from->btree_row_leaf;
+ to->cache_bytes_read += from->cache_bytes_read;
+ to->cache_bytes_write += from->cache_bytes_write;
+ to->cache_eviction_checkpoint += from->cache_eviction_checkpoint;
+ to->cache_eviction_fail += from->cache_eviction_fail;
+ to->cache_eviction_hazard += from->cache_eviction_hazard;
+ to->cache_inmem_splittable += from->cache_inmem_splittable;
+ to->cache_inmem_split += from->cache_inmem_split;
+ to->cache_eviction_internal += from->cache_eviction_internal;
+ to->cache_eviction_dirty += from->cache_eviction_dirty;
+ to->cache_read_overflow += from->cache_read_overflow;
+ to->cache_overflow_value += from->cache_overflow_value;
+ to->cache_eviction_deepen += from->cache_eviction_deepen;
+ to->cache_write_lookaside += from->cache_write_lookaside;
+ to->cache_read += from->cache_read;
+ to->cache_read_lookaside += from->cache_read_lookaside;
+ to->cache_eviction_split += from->cache_eviction_split;
+ to->cache_write += from->cache_write;
+ to->cache_write_restore += from->cache_write_restore;
+ to->cache_eviction_clean += from->cache_eviction_clean;
+ to->compress_read += from->compress_read;
+ to->compress_write += from->compress_write;
+ to->compress_write_fail += from->compress_write_fail;
+ to->compress_write_too_small += from->compress_write_too_small;
+ to->compress_raw_fail_temporary += from->compress_raw_fail_temporary;
+ to->compress_raw_fail += from->compress_raw_fail;
+ to->compress_raw_ok += from->compress_raw_ok;
+ to->cursor_insert_bulk += from->cursor_insert_bulk;
+ to->cursor_create += from->cursor_create;
+ to->cursor_insert_bytes += from->cursor_insert_bytes;
+ to->cursor_remove_bytes += from->cursor_remove_bytes;
+ to->cursor_update_bytes += from->cursor_update_bytes;
+ to->cursor_insert += from->cursor_insert;
+ to->cursor_next += from->cursor_next;
+ to->cursor_prev += from->cursor_prev;
+ to->cursor_remove += from->cursor_remove;
+ to->cursor_reset += from->cursor_reset;
+ to->cursor_restart += from->cursor_restart;
+ to->cursor_search += from->cursor_search;
+ to->cursor_search_near += from->cursor_search_near;
+ to->cursor_update += from->cursor_update;
+ to->bloom_false_positive += from->bloom_false_positive;
+ to->bloom_hit += from->bloom_hit;
+ to->bloom_miss += from->bloom_miss;
+ to->bloom_page_evict += from->bloom_page_evict;
+ to->bloom_page_read += from->bloom_page_read;
+ to->bloom_count += from->bloom_count;
+ to->lsm_chunk_count += from->lsm_chunk_count;
+ if (from->lsm_generation_max > to->lsm_generation_max)
+ to->lsm_generation_max = from->lsm_generation_max;
+ to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom;
+ to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle;
+ to->lsm_merge_throttle += from->lsm_merge_throttle;
+ to->bloom_size += from->bloom_size;
+ to->rec_dictionary += from->rec_dictionary;
+ to->rec_suffix_compression += from->rec_suffix_compression;
+ to->rec_multiblock_internal += from->rec_multiblock_internal;
+ to->rec_overflow_key_internal += from->rec_overflow_key_internal;
+ to->rec_prefix_compression += from->rec_prefix_compression;
+ to->rec_multiblock_leaf += from->rec_multiblock_leaf;
+ to->rec_overflow_key_leaf += from->rec_overflow_key_leaf;
+ if (from->rec_multiblock_max > to->rec_multiblock_max)
+ to->rec_multiblock_max = from->rec_multiblock_max;
+ to->rec_overflow_value += from->rec_overflow_value;
+ to->rec_page_match += from->rec_page_match;
+ to->rec_pages += from->rec_pages;
+ to->rec_pages_eviction += from->rec_pages_eviction;
+ to->rec_page_delete += from->rec_page_delete;
+ to->session_compact += from->session_compact;
+ to->session_cursor_open += from->session_cursor_open;
+ to->txn_update_conflict += from->txn_update_conflict;
}
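
Aggregation above applies one of three rules per statistic: counters sum, high-water marks take the maximum, and per-file constants (allocation unit size, magic and version numbers, fixed-record size) are copied through unchanged. A compressed sketch of the same with a stand-in struct:

#include <stdint.h>

struct dsrc_stats {
	int64_t block_alloc;		/* counter */
	int64_t btree_maximum_depth;	/* high-water mark */
	int64_t allocation_size;	/* per-file constant */
};

static void
aggregate_one(const struct dsrc_stats *from, struct dsrc_stats *to)
{
	to->block_alloc += from->block_alloc;
	if (from->btree_maximum_depth > to->btree_maximum_depth)
		to->btree_maximum_depth = from->btree_maximum_depth;
	to->allocation_size = from->allocation_size;
}
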
void
-__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
+__wt_stat_dsrc_aggregate(
+ WT_DSRC_STATS **from, WT_DSRC_STATS *to)
{
- WT_DSRC_STATS *c, *p;
+ int64_t v;
- c = (WT_DSRC_STATS *)child;
- p = (WT_DSRC_STATS *)parent;
- p->block_extension.v += c->block_extension.v;
- p->block_alloc.v += c->block_alloc.v;
- p->block_free.v += c->block_free.v;
- p->block_checkpoint_size.v += c->block_checkpoint_size.v;
- p->block_reuse_bytes.v += c->block_reuse_bytes.v;
- p->block_size.v += c->block_size.v;
- p->btree_checkpoint_generation.v += c->btree_checkpoint_generation.v;
- p->btree_column_fix.v += c->btree_column_fix.v;
- p->btree_column_internal.v += c->btree_column_internal.v;
- p->btree_column_deleted.v += c->btree_column_deleted.v;
- p->btree_column_variable.v += c->btree_column_variable.v;
- if (c->btree_maxintlkey.v > p->btree_maxintlkey.v)
- p->btree_maxintlkey.v = c->btree_maxintlkey.v;
- if (c->btree_maxintlpage.v > p->btree_maxintlpage.v)
- p->btree_maxintlpage.v = c->btree_maxintlpage.v;
- if (c->btree_maxleafkey.v > p->btree_maxleafkey.v)
- p->btree_maxleafkey.v = c->btree_maxleafkey.v;
- if (c->btree_maxleafpage.v > p->btree_maxleafpage.v)
- p->btree_maxleafpage.v = c->btree_maxleafpage.v;
- if (c->btree_maxleafvalue.v > p->btree_maxleafvalue.v)
- p->btree_maxleafvalue.v = c->btree_maxleafvalue.v;
- if (c->btree_maximum_depth.v > p->btree_maximum_depth.v)
- p->btree_maximum_depth.v = c->btree_maximum_depth.v;
- p->btree_entries.v += c->btree_entries.v;
- p->btree_overflow.v += c->btree_overflow.v;
- p->btree_compact_rewrite.v += c->btree_compact_rewrite.v;
- p->btree_row_internal.v += c->btree_row_internal.v;
- p->btree_row_leaf.v += c->btree_row_leaf.v;
- p->cache_bytes_read.v += c->cache_bytes_read.v;
- p->cache_bytes_write.v += c->cache_bytes_write.v;
- p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v;
- p->cache_eviction_fail.v += c->cache_eviction_fail.v;
- p->cache_eviction_hazard.v += c->cache_eviction_hazard.v;
- p->cache_inmem_split.v += c->cache_inmem_split.v;
- p->cache_eviction_internal.v += c->cache_eviction_internal.v;
- p->cache_eviction_dirty.v += c->cache_eviction_dirty.v;
- p->cache_read_overflow.v += c->cache_read_overflow.v;
- p->cache_overflow_value.v += c->cache_overflow_value.v;
- p->cache_eviction_deepen.v += c->cache_eviction_deepen.v;
- p->cache_read.v += c->cache_read.v;
- p->cache_eviction_split.v += c->cache_eviction_split.v;
- p->cache_write.v += c->cache_write.v;
- p->cache_eviction_clean.v += c->cache_eviction_clean.v;
- p->compress_read.v += c->compress_read.v;
- p->compress_write.v += c->compress_write.v;
- p->compress_write_fail.v += c->compress_write_fail.v;
- p->compress_write_too_small.v += c->compress_write_too_small.v;
- p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v;
- p->compress_raw_fail.v += c->compress_raw_fail.v;
- p->compress_raw_ok.v += c->compress_raw_ok.v;
- p->cursor_insert_bulk.v += c->cursor_insert_bulk.v;
- p->cursor_create.v += c->cursor_create.v;
- p->cursor_insert_bytes.v += c->cursor_insert_bytes.v;
- p->cursor_remove_bytes.v += c->cursor_remove_bytes.v;
- p->cursor_update_bytes.v += c->cursor_update_bytes.v;
- p->cursor_insert.v += c->cursor_insert.v;
- p->cursor_next.v += c->cursor_next.v;
- p->cursor_prev.v += c->cursor_prev.v;
- p->cursor_remove.v += c->cursor_remove.v;
- p->cursor_reset.v += c->cursor_reset.v;
- p->cursor_search.v += c->cursor_search.v;
- p->cursor_search_near.v += c->cursor_search_near.v;
- p->cursor_update.v += c->cursor_update.v;
- p->bloom_false_positive.v += c->bloom_false_positive.v;
- p->bloom_hit.v += c->bloom_hit.v;
- p->bloom_miss.v += c->bloom_miss.v;
- p->bloom_page_evict.v += c->bloom_page_evict.v;
- p->bloom_page_read.v += c->bloom_page_read.v;
- p->bloom_count.v += c->bloom_count.v;
- p->lsm_chunk_count.v += c->lsm_chunk_count.v;
- if (c->lsm_generation_max.v > p->lsm_generation_max.v)
- p->lsm_generation_max.v = c->lsm_generation_max.v;
- p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
- p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
- p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
- p->bloom_size.v += c->bloom_size.v;
- p->rec_dictionary.v += c->rec_dictionary.v;
- p->rec_suffix_compression.v += c->rec_suffix_compression.v;
- p->rec_multiblock_internal.v += c->rec_multiblock_internal.v;
- p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
- p->rec_prefix_compression.v += c->rec_prefix_compression.v;
- p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v;
- p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
- if (c->rec_multiblock_max.v > p->rec_multiblock_max.v)
- p->rec_multiblock_max.v = c->rec_multiblock_max.v;
- p->rec_overflow_value.v += c->rec_overflow_value.v;
- p->rec_page_match.v += c->rec_page_match.v;
- p->rec_pages.v += c->rec_pages.v;
- p->rec_pages_eviction.v += c->rec_pages_eviction.v;
- p->rec_page_delete.v += c->rec_page_delete.v;
- p->session_compact.v += c->session_compact.v;
- p->session_cursor_open.v += c->session_cursor_open.v;
- p->txn_update_conflict.v += c->txn_update_conflict.v;
+ to->block_extension += WT_STAT_READ(from, block_extension);
+ to->block_alloc += WT_STAT_READ(from, block_alloc);
+ to->block_free += WT_STAT_READ(from, block_free);
+ to->block_checkpoint_size +=
+ WT_STAT_READ(from, block_checkpoint_size);
+ to->allocation_size = from[0]->allocation_size;
+ to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes);
+ to->block_magic = from[0]->block_magic;
+ to->block_major = from[0]->block_major;
+ to->block_size += WT_STAT_READ(from, block_size);
+ to->block_minor = from[0]->block_minor;
+ to->btree_checkpoint_generation +=
+ WT_STAT_READ(from, btree_checkpoint_generation);
+ to->btree_column_fix += WT_STAT_READ(from, btree_column_fix);
+ to->btree_column_internal +=
+ WT_STAT_READ(from, btree_column_internal);
+ to->btree_column_deleted += WT_STAT_READ(from, btree_column_deleted);
+ to->btree_column_variable +=
+ WT_STAT_READ(from, btree_column_variable);
+ to->btree_column_rle += WT_STAT_READ(from, btree_column_rle);
+ to->btree_fixed_len = from[0]->btree_fixed_len;
+ if ((v = WT_STAT_READ(from, btree_maxintlkey)) >
+ to->btree_maxintlkey)
+ to->btree_maxintlkey = v;
+ if ((v = WT_STAT_READ(from, btree_maxintlpage)) >
+ to->btree_maxintlpage)
+ to->btree_maxintlpage = v;
+ if ((v = WT_STAT_READ(from, btree_maxleafkey)) >
+ to->btree_maxleafkey)
+ to->btree_maxleafkey = v;
+ if ((v = WT_STAT_READ(from, btree_maxleafpage)) >
+ to->btree_maxleafpage)
+ to->btree_maxleafpage = v;
+ if ((v = WT_STAT_READ(from, btree_maxleafvalue)) >
+ to->btree_maxleafvalue)
+ to->btree_maxleafvalue = v;
+ if ((v = WT_STAT_READ(from, btree_maximum_depth)) >
+ to->btree_maximum_depth)
+ to->btree_maximum_depth = v;
+ to->btree_entries += WT_STAT_READ(from, btree_entries);
+ to->btree_overflow += WT_STAT_READ(from, btree_overflow);
+ to->btree_compact_rewrite +=
+ WT_STAT_READ(from, btree_compact_rewrite);
+ to->btree_row_internal += WT_STAT_READ(from, btree_row_internal);
+ to->btree_row_leaf += WT_STAT_READ(from, btree_row_leaf);
+ to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read);
+ to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write);
+ to->cache_eviction_checkpoint +=
+ WT_STAT_READ(from, cache_eviction_checkpoint);
+ to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
+ to->cache_eviction_hazard +=
+ WT_STAT_READ(from, cache_eviction_hazard);
+ to->cache_inmem_splittable +=
+ WT_STAT_READ(from, cache_inmem_splittable);
+ to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
+ to->cache_eviction_internal +=
+ WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
+ to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow);
+ to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value);
+ to->cache_eviction_deepen +=
+ WT_STAT_READ(from, cache_eviction_deepen);
+ to->cache_write_lookaside +=
+ WT_STAT_READ(from, cache_write_lookaside);
+ to->cache_read += WT_STAT_READ(from, cache_read);
+ to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
+ to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
+ to->cache_write += WT_STAT_READ(from, cache_write);
+ to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
+ to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
+ to->compress_read += WT_STAT_READ(from, compress_read);
+ to->compress_write += WT_STAT_READ(from, compress_write);
+ to->compress_write_fail += WT_STAT_READ(from, compress_write_fail);
+ to->compress_write_too_small +=
+ WT_STAT_READ(from, compress_write_too_small);
+ to->compress_raw_fail_temporary +=
+ WT_STAT_READ(from, compress_raw_fail_temporary);
+ to->compress_raw_fail += WT_STAT_READ(from, compress_raw_fail);
+ to->compress_raw_ok += WT_STAT_READ(from, compress_raw_ok);
+ to->cursor_insert_bulk += WT_STAT_READ(from, cursor_insert_bulk);
+ to->cursor_create += WT_STAT_READ(from, cursor_create);
+ to->cursor_insert_bytes += WT_STAT_READ(from, cursor_insert_bytes);
+ to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes);
+ to->cursor_update_bytes += WT_STAT_READ(from, cursor_update_bytes);
+ to->cursor_insert += WT_STAT_READ(from, cursor_insert);
+ to->cursor_next += WT_STAT_READ(from, cursor_next);
+ to->cursor_prev += WT_STAT_READ(from, cursor_prev);
+ to->cursor_remove += WT_STAT_READ(from, cursor_remove);
+ to->cursor_reset += WT_STAT_READ(from, cursor_reset);
+ to->cursor_restart += WT_STAT_READ(from, cursor_restart);
+ to->cursor_search += WT_STAT_READ(from, cursor_search);
+ to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
+ to->cursor_update += WT_STAT_READ(from, cursor_update);
+ to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive);
+ to->bloom_hit += WT_STAT_READ(from, bloom_hit);
+ to->bloom_miss += WT_STAT_READ(from, bloom_miss);
+ to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict);
+ to->bloom_page_read += WT_STAT_READ(from, bloom_page_read);
+ to->bloom_count += WT_STAT_READ(from, bloom_count);
+ to->lsm_chunk_count += WT_STAT_READ(from, lsm_chunk_count);
+ if ((v = WT_STAT_READ(from, lsm_generation_max)) >
+ to->lsm_generation_max)
+ to->lsm_generation_max = v;
+ to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom);
+ to->lsm_checkpoint_throttle +=
+ WT_STAT_READ(from, lsm_checkpoint_throttle);
+ to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle);
+ to->bloom_size += WT_STAT_READ(from, bloom_size);
+ to->rec_dictionary += WT_STAT_READ(from, rec_dictionary);
+ to->rec_suffix_compression +=
+ WT_STAT_READ(from, rec_suffix_compression);
+ to->rec_multiblock_internal +=
+ WT_STAT_READ(from, rec_multiblock_internal);
+ to->rec_overflow_key_internal +=
+ WT_STAT_READ(from, rec_overflow_key_internal);
+ to->rec_prefix_compression +=
+ WT_STAT_READ(from, rec_prefix_compression);
+ to->rec_multiblock_leaf += WT_STAT_READ(from, rec_multiblock_leaf);
+ to->rec_overflow_key_leaf +=
+ WT_STAT_READ(from, rec_overflow_key_leaf);
+ if ((v = WT_STAT_READ(from, rec_multiblock_max)) >
+ to->rec_multiblock_max)
+ to->rec_multiblock_max = v;
+ to->rec_overflow_value += WT_STAT_READ(from, rec_overflow_value);
+ to->rec_page_match += WT_STAT_READ(from, rec_page_match);
+ to->rec_pages += WT_STAT_READ(from, rec_pages);
+ to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction);
+ to->rec_page_delete += WT_STAT_READ(from, rec_page_delete);
+ to->session_compact += WT_STAT_READ(from, session_compact);
+ to->session_cursor_open += WT_STAT_READ(from, session_cursor_open);
+ to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict);
+}
+
+static const char * const __stats_connection_desc[] = {
+ "async: number of allocation state races",
+ "async: number of operation slots viewed for allocation",
+ "async: current work queue length",
+ "async: number of flush calls",
+ "async: number of times operation allocation failed",
+ "async: maximum work queue length",
+ "async: number of times worker found no work",
+ "async: total allocations",
+ "async: total compact calls",
+ "async: total insert calls",
+ "async: total remove calls",
+ "async: total search calls",
+ "async: total update calls",
+ "block-manager: mapped bytes read",
+ "block-manager: bytes read",
+ "block-manager: bytes written",
+ "block-manager: mapped blocks read",
+ "block-manager: blocks pre-loaded",
+ "block-manager: blocks read",
+ "block-manager: blocks written",
+ "cache: tracked dirty bytes in the cache",
+ "cache: tracked bytes belonging to internal pages in the cache",
+ "cache: bytes currently in the cache",
+ "cache: tracked bytes belonging to leaf pages in the cache",
+ "cache: maximum bytes configured",
+ "cache: tracked bytes belonging to overflow pages in the cache",
+ "cache: bytes read into cache",
+ "cache: bytes written from cache",
+ "cache: pages evicted by application threads",
+ "cache: checkpoint blocked page eviction",
+ "cache: unmodified pages evicted",
+ "cache: page split during eviction deepened the tree",
+ "cache: modified pages evicted",
+ "cache: pages selected for eviction unable to be evicted",
+ "cache: pages evicted because they exceeded the in-memory maximum",
+ "cache: pages evicted because they had chains of deleted items",
+ "cache: failed eviction of pages that exceeded the in-memory maximum",
+ "cache: hazard pointer blocked page eviction",
+ "cache: internal pages evicted",
+ "cache: maximum page size at eviction",
+ "cache: eviction server candidate queue empty when topping up",
+ "cache: eviction server candidate queue not empty when topping up",
+ "cache: eviction server evicting pages",
+ "cache: eviction server populating queue, but not evicting pages",
+ "cache: eviction server unable to reach eviction goal",
+ "cache: pages split during eviction",
+ "cache: pages walked for eviction",
+ "cache: eviction worker thread evicting pages",
+ "cache: in-memory page splits",
+ "cache: in-memory page passed criteria to be split",
+ "cache: lookaside table insert calls",
+ "cache: lookaside table remove calls",
+ "cache: percentage overhead",
+ "cache: tracked dirty pages in the cache",
+ "cache: pages currently held in the cache",
+ "cache: pages read into cache",
+ "cache: pages read into cache requiring lookaside entries",
+ "cache: pages written from cache",
+ "cache: page written requiring lookaside records",
+ "cache: pages written requiring in-memory restoration",
+ "connection: pthread mutex condition wait calls",
+ "cursor: cursor create calls",
+ "cursor: cursor insert calls",
+ "cursor: cursor next calls",
+ "cursor: cursor prev calls",
+ "cursor: cursor remove calls",
+ "cursor: cursor reset calls",
+ "cursor: cursor restarted searches",
+ "cursor: cursor search calls",
+ "cursor: cursor search near calls",
+ "cursor: cursor update calls",
+ "data-handle: connection data handles currently active",
+ "data-handle: session dhandles swept",
+ "data-handle: session sweep attempts",
+ "data-handle: connection sweep dhandles closed",
+ "data-handle: connection sweep candidate became referenced",
+ "data-handle: connection sweep dhandles removed from hash list",
+ "data-handle: connection sweep time-of-death sets",
+ "data-handle: connection sweeps",
+ "connection: files currently open",
+ "log: total log buffer size",
+ "log: log bytes of payload data",
+ "log: log bytes written",
+ "log: yields waiting for previous log file close",
+ "log: total size of compressed records",
+ "log: total in-memory size of compressed records",
+ "log: log records too small to compress",
+ "log: log records not compressed",
+ "log: log records compressed",
+ "log: maximum log file size",
+ "log: pre-allocated log files prepared",
+ "log: number of pre-allocated log files to create",
+ "log: pre-allocated log files used",
+ "log: log release advances write LSN",
+ "log: records processed by log scan",
+ "log: log scan records requiring two reads",
+ "log: log scan operations",
+ "log: consolidated slot closures",
+ "log: written slots coalesced",
+ "log: logging bytes consolidated",
+ "log: consolidated slot joins",
+ "log: consolidated slot join races",
+ "log: busy returns attempting to switch slots",
+ "log: consolidated slot join transitions",
+ "log: consolidated slot unbuffered writes",
+ "log: log sync operations",
+ "log: log sync_dir operations",
+ "log: log server thread advances write LSN",
+ "log: log write operations",
+ "LSM: sleep for LSM checkpoint throttle",
+ "LSM: sleep for LSM merge throttle",
+ "LSM: rows merged in an LSM tree",
+ "LSM: application work units currently queued",
+ "LSM: merge work units currently queued",
+ "LSM: tree queue hit maximum",
+ "LSM: switch work units currently queued",
+ "LSM: tree maintenance operations scheduled",
+ "LSM: tree maintenance operations discarded",
+ "LSM: tree maintenance operations executed",
+ "connection: memory allocations",
+ "connection: memory frees",
+ "connection: memory re-allocations",
+ "thread-yield: page acquire busy blocked",
+ "thread-yield: page acquire eviction blocked",
+ "thread-yield: page acquire locked blocked",
+ "thread-yield: page acquire read blocked",
+ "thread-yield: page acquire time sleeping (usecs)",
+ "connection: total read I/Os",
+ "reconciliation: page reconciliation calls",
+ "reconciliation: page reconciliation calls for eviction",
+ "reconciliation: split bytes currently awaiting free",
+ "reconciliation: split objects currently awaiting free",
+ "connection: pthread mutex shared lock read-lock calls",
+ "connection: pthread mutex shared lock write-lock calls",
+ "session: open cursor count",
+ "session: open session count",
+ "transaction: transaction begins",
+ "transaction: transaction checkpoints",
+ "transaction: transaction checkpoint generation",
+ "transaction: transaction checkpoint currently running",
+ "transaction: transaction checkpoint max time (msecs)",
+ "transaction: transaction checkpoint min time (msecs)",
+ "transaction: transaction checkpoint most recent time (msecs)",
+ "transaction: transaction checkpoint total time (msecs)",
+ "transaction: transactions committed",
+ "transaction: transaction failures due to cache overflow",
+ "transaction: transaction range of IDs currently pinned by a checkpoint",
+ "transaction: transaction range of IDs currently pinned",
+ "transaction: transactions rolled back",
+ "transaction: transaction sync calls",
+ "connection: total write I/Os",
+};
+
+const char *
+__wt_stat_connection_desc(int slot)
+{
+ return (__stats_connection_desc[slot]);
}
void
-__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
+__wt_stat_connection_init_single(WT_CONNECTION_STATS *stats)
{
- /* Clear, so can also be called for reinitialization. */
memset(stats, 0, sizeof(*stats));
+}
+
+void
+__wt_stat_connection_init(WT_CONNECTION_IMPL *handle)
+{
+ int i;
- stats->async_cur_queue.desc = "async: current work queue length";
- stats->async_max_queue.desc = "async: maximum work queue length";
- stats->async_alloc_race.desc =
- "async: number of allocation state races";
- stats->async_flush.desc = "async: number of flush calls";
- stats->async_alloc_view.desc =
- "async: number of operation slots viewed for allocation";
- stats->async_full.desc =
- "async: number of times operation allocation failed";
- stats->async_nowork.desc =
- "async: number of times worker found no work";
- stats->async_op_alloc.desc = "async: total allocations";
- stats->async_op_compact.desc = "async: total compact calls";
- stats->async_op_insert.desc = "async: total insert calls";
- stats->async_op_remove.desc = "async: total remove calls";
- stats->async_op_search.desc = "async: total search calls";
- stats->async_op_update.desc = "async: total update calls";
- stats->block_preload.desc = "block-manager: blocks pre-loaded";
- stats->block_read.desc = "block-manager: blocks read";
- stats->block_write.desc = "block-manager: blocks written";
- stats->block_byte_read.desc = "block-manager: bytes read";
- stats->block_byte_write.desc = "block-manager: bytes written";
- stats->block_map_read.desc = "block-manager: mapped blocks read";
- stats->block_byte_map_read.desc = "block-manager: mapped bytes read";
- stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache";
- stats->cache_bytes_read.desc = "cache: bytes read into cache";
- stats->cache_bytes_write.desc = "cache: bytes written from cache";
- stats->cache_eviction_checkpoint.desc =
- "cache: checkpoint blocked page eviction";
- stats->cache_eviction_queue_empty.desc =
- "cache: eviction server candidate queue empty when topping up";
- stats->cache_eviction_queue_not_empty.desc =
- "cache: eviction server candidate queue not empty when topping up";
- stats->cache_eviction_server_evicting.desc =
- "cache: eviction server evicting pages";
- stats->cache_eviction_server_not_evicting.desc =
- "cache: eviction server populating queue, but not evicting pages";
- stats->cache_eviction_slow.desc =
- "cache: eviction server unable to reach eviction goal";
- stats->cache_eviction_worker_evicting.desc =
- "cache: eviction worker thread evicting pages";
- stats->cache_eviction_force_fail.desc =
- "cache: failed eviction of pages that exceeded the in-memory maximum";
- stats->cache_eviction_hazard.desc =
- "cache: hazard pointer blocked page eviction";
- stats->cache_inmem_split.desc = "cache: in-memory page splits";
- stats->cache_eviction_internal.desc = "cache: internal pages evicted";
- stats->cache_bytes_max.desc = "cache: maximum bytes configured";
- stats->cache_eviction_maximum_page_size.desc =
- "cache: maximum page size at eviction";
- stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
- stats->cache_eviction_deepen.desc =
- "cache: page split during eviction deepened the tree";
- stats->cache_pages_inuse.desc =
- "cache: pages currently held in the cache";
- stats->cache_eviction_force.desc =
- "cache: pages evicted because they exceeded the in-memory maximum";
- stats->cache_eviction_force_delete.desc =
- "cache: pages evicted because they had chains of deleted items";
- stats->cache_eviction_app.desc =
- "cache: pages evicted by application threads";
- stats->cache_read.desc = "cache: pages read into cache";
- stats->cache_eviction_fail.desc =
- "cache: pages selected for eviction unable to be evicted";
- stats->cache_eviction_split.desc =
- "cache: pages split during eviction";
- stats->cache_eviction_walk.desc = "cache: pages walked for eviction";
- stats->cache_write.desc = "cache: pages written from cache";
- stats->cache_overhead.desc = "cache: percentage overhead";
- stats->cache_bytes_internal.desc =
- "cache: tracked bytes belonging to internal pages in the cache";
- stats->cache_bytes_leaf.desc =
- "cache: tracked bytes belonging to leaf pages in the cache";
- stats->cache_bytes_overflow.desc =
- "cache: tracked bytes belonging to overflow pages in the cache";
- stats->cache_bytes_dirty.desc =
- "cache: tracked dirty bytes in the cache";
- stats->cache_pages_dirty.desc =
- "cache: tracked dirty pages in the cache";
- stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
- stats->file_open.desc = "connection: files currently open";
- stats->memory_allocation.desc = "connection: memory allocations";
- stats->memory_free.desc = "connection: memory frees";
- stats->memory_grow.desc = "connection: memory re-allocations";
- stats->cond_wait.desc =
- "connection: pthread mutex condition wait calls";
- stats->rwlock_read.desc =
- "connection: pthread mutex shared lock read-lock calls";
- stats->rwlock_write.desc =
- "connection: pthread mutex shared lock write-lock calls";
- stats->read_io.desc = "connection: total read I/Os";
- stats->write_io.desc = "connection: total write I/Os";
- stats->cursor_create.desc = "cursor: cursor create calls";
- stats->cursor_insert.desc = "cursor: cursor insert calls";
- stats->cursor_next.desc = "cursor: cursor next calls";
- stats->cursor_prev.desc = "cursor: cursor prev calls";
- stats->cursor_remove.desc = "cursor: cursor remove calls";
- stats->cursor_reset.desc = "cursor: cursor reset calls";
- stats->cursor_search.desc = "cursor: cursor search calls";
- stats->cursor_search_near.desc = "cursor: cursor search near calls";
- stats->cursor_update.desc = "cursor: cursor update calls";
- stats->dh_conn_ref.desc =
- "data-handle: connection candidate referenced";
- stats->dh_conn_handles.desc = "data-handle: connection dhandles swept";
- stats->dh_conn_sweeps.desc = "data-handle: connection sweeps";
- stats->dh_conn_tod.desc = "data-handle: connection time-of-death sets";
- stats->dh_session_handles.desc = "data-handle: session dhandles swept";
- stats->dh_session_sweeps.desc = "data-handle: session sweep attempts";
- stats->log_slot_closes.desc = "log: consolidated slot closures";
- stats->log_slot_races.desc = "log: consolidated slot join races";
- stats->log_slot_transitions.desc =
- "log: consolidated slot join transitions";
- stats->log_slot_joins.desc = "log: consolidated slot joins";
- stats->log_slot_toosmall.desc =
- "log: failed to find a slot large enough for record";
- stats->log_bytes_payload.desc = "log: log bytes of payload data";
- stats->log_bytes_written.desc = "log: log bytes written";
- stats->log_compress_writes.desc = "log: log records compressed";
- stats->log_compress_write_fails.desc =
- "log: log records not compressed";
- stats->log_compress_small.desc =
- "log: log records too small to compress";
- stats->log_release_write_lsn.desc =
- "log: log release advances write LSN";
- stats->log_scans.desc = "log: log scan operations";
- stats->log_scan_rereads.desc =
- "log: log scan records requiring two reads";
- stats->log_write_lsn.desc =
- "log: log server thread advances write LSN";
- stats->log_sync.desc = "log: log sync operations";
- stats->log_sync_dir.desc = "log: log sync_dir operations";
- stats->log_writes.desc = "log: log write operations";
- stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
- stats->log_max_filesize.desc = "log: maximum log file size";
- stats->log_prealloc_max.desc =
- "log: number of pre-allocated log files to create";
- stats->log_prealloc_files.desc =
- "log: pre-allocated log files prepared";
- stats->log_prealloc_used.desc = "log: pre-allocated log files used";
- stats->log_slot_toobig.desc = "log: record size exceeded maximum";
- stats->log_scan_records.desc = "log: records processed by log scan";
- stats->log_compress_mem.desc =
- "log: total in-memory size of compressed records";
- stats->log_buffer_size.desc = "log: total log buffer size";
- stats->log_compress_len.desc = "log: total size of compressed records";
- stats->log_slot_coalesced.desc = "log: written slots coalesced";
- stats->log_close_yields.desc =
- "log: yields waiting for previous log file close";
- stats->lsm_work_queue_app.desc =
- "LSM: application work units currently queued";
- stats->lsm_work_queue_manager.desc =
- "LSM: merge work units currently queued";
- stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree";
- stats->lsm_checkpoint_throttle.desc =
- "LSM: sleep for LSM checkpoint throttle";
- stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
- stats->lsm_work_queue_switch.desc =
- "LSM: switch work units currently queued";
- stats->lsm_work_units_discarded.desc =
- "LSM: tree maintenance operations discarded";
- stats->lsm_work_units_done.desc =
- "LSM: tree maintenance operations executed";
- stats->lsm_work_units_created.desc =
- "LSM: tree maintenance operations scheduled";
- stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum";
- stats->rec_pages.desc = "reconciliation: page reconciliation calls";
- stats->rec_pages_eviction.desc =
- "reconciliation: page reconciliation calls for eviction";
- stats->rec_split_stashed_bytes.desc =
- "reconciliation: split bytes currently awaiting free";
- stats->rec_split_stashed_objects.desc =
- "reconciliation: split objects currently awaiting free";
- stats->session_cursor_open.desc = "session: open cursor count";
- stats->session_open.desc = "session: open session count";
- stats->page_busy_blocked.desc =
- "thread-yield: page acquire busy blocked";
- stats->page_forcible_evict_blocked.desc =
- "thread-yield: page acquire eviction blocked";
- stats->page_locked_blocked.desc =
- "thread-yield: page acquire locked blocked";
- stats->page_read_blocked.desc =
- "thread-yield: page acquire read blocked";
- stats->page_sleep.desc =
- "thread-yield: page acquire time sleeping (usecs)";
- stats->txn_begin.desc = "transaction: transaction begins";
- stats->txn_checkpoint_running.desc =
- "transaction: transaction checkpoint currently running";
- stats->txn_checkpoint_generation.desc =
- "transaction: transaction checkpoint generation";
- stats->txn_checkpoint_time_max.desc =
- "transaction: transaction checkpoint max time (msecs)";
- stats->txn_checkpoint_time_min.desc =
- "transaction: transaction checkpoint min time (msecs)";
- stats->txn_checkpoint_time_recent.desc =
- "transaction: transaction checkpoint most recent time (msecs)";
- stats->txn_checkpoint_time_total.desc =
- "transaction: transaction checkpoint total time (msecs)";
- stats->txn_checkpoint.desc = "transaction: transaction checkpoints";
- stats->txn_fail_cache.desc =
- "transaction: transaction failures due to cache overflow";
- stats->txn_pinned_range.desc =
- "transaction: transaction range of IDs currently pinned";
- stats->txn_pinned_checkpoint_range.desc =
- "transaction: transaction range of IDs currently pinned by a checkpoint";
- stats->txn_sync.desc = "transaction: transaction sync calls";
- stats->txn_commit.desc = "transaction: transactions committed";
- stats->txn_rollback.desc = "transaction: transactions rolled back";
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+ handle->stats[i] = &handle->stat_array[i];
+ __wt_stat_connection_init_single(handle->stats[i]);
+ }
}
void
-__wt_stat_refresh_connection_stats(void *stats_arg)
+__wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
{
- WT_CONNECTION_STATS *stats;
+ stats->async_cur_queue = 0;
+ /* not clearing async_max_queue */
+ stats->async_alloc_race = 0;
+ stats->async_flush = 0;
+ stats->async_alloc_view = 0;
+ stats->async_full = 0;
+ stats->async_nowork = 0;
+ stats->async_op_alloc = 0;
+ stats->async_op_compact = 0;
+ stats->async_op_insert = 0;
+ stats->async_op_remove = 0;
+ stats->async_op_search = 0;
+ stats->async_op_update = 0;
+ stats->block_preload = 0;
+ stats->block_read = 0;
+ stats->block_write = 0;
+ stats->block_byte_read = 0;
+ stats->block_byte_write = 0;
+ stats->block_map_read = 0;
+ stats->block_byte_map_read = 0;
+ /* not clearing cache_bytes_inuse */
+ stats->cache_bytes_read = 0;
+ stats->cache_bytes_write = 0;
+ stats->cache_eviction_checkpoint = 0;
+ stats->cache_eviction_queue_empty = 0;
+ stats->cache_eviction_queue_not_empty = 0;
+ stats->cache_eviction_server_evicting = 0;
+ stats->cache_eviction_server_not_evicting = 0;
+ stats->cache_eviction_slow = 0;
+ stats->cache_eviction_worker_evicting = 0;
+ stats->cache_eviction_force_fail = 0;
+ stats->cache_eviction_hazard = 0;
+ stats->cache_inmem_splittable = 0;
+ stats->cache_inmem_split = 0;
+ stats->cache_eviction_internal = 0;
+ stats->cache_lookaside_insert = 0;
+ stats->cache_lookaside_remove = 0;
+ /* not clearing cache_bytes_max */
+ /* not clearing cache_eviction_maximum_page_size */
+ stats->cache_eviction_dirty = 0;
+ stats->cache_eviction_deepen = 0;
+ stats->cache_write_lookaside = 0;
+ /* not clearing cache_pages_inuse */
+ stats->cache_eviction_force = 0;
+ stats->cache_eviction_force_delete = 0;
+ stats->cache_eviction_app = 0;
+ stats->cache_read = 0;
+ stats->cache_read_lookaside = 0;
+ stats->cache_eviction_fail = 0;
+ stats->cache_eviction_split = 0;
+ stats->cache_eviction_walk = 0;
+ stats->cache_write = 0;
+ stats->cache_write_restore = 0;
+ /* not clearing cache_overhead */
+ /* not clearing cache_bytes_internal */
+ /* not clearing cache_bytes_leaf */
+ /* not clearing cache_bytes_overflow */
+ /* not clearing cache_bytes_dirty */
+ /* not clearing cache_pages_dirty */
+ stats->cache_eviction_clean = 0;
+ /* not clearing file_open */
+ stats->memory_allocation = 0;
+ stats->memory_free = 0;
+ stats->memory_grow = 0;
+ stats->cond_wait = 0;
+ stats->rwlock_read = 0;
+ stats->rwlock_write = 0;
+ stats->read_io = 0;
+ stats->write_io = 0;
+ stats->cursor_create = 0;
+ stats->cursor_insert = 0;
+ stats->cursor_next = 0;
+ stats->cursor_prev = 0;
+ stats->cursor_remove = 0;
+ stats->cursor_reset = 0;
+ stats->cursor_restart = 0;
+ stats->cursor_search = 0;
+ stats->cursor_search_near = 0;
+ stats->cursor_update = 0;
+ /* not clearing dh_conn_handle_count */
+ stats->dh_sweep_ref = 0;
+ stats->dh_sweep_close = 0;
+ stats->dh_sweep_remove = 0;
+ stats->dh_sweep_tod = 0;
+ stats->dh_sweeps = 0;
+ stats->dh_session_handles = 0;
+ stats->dh_session_sweeps = 0;
+ stats->log_slot_switch_busy = 0;
+ stats->log_slot_closes = 0;
+ stats->log_slot_races = 0;
+ stats->log_slot_transitions = 0;
+ stats->log_slot_joins = 0;
+ stats->log_slot_unbuffered = 0;
+ stats->log_bytes_payload = 0;
+ stats->log_bytes_written = 0;
+ stats->log_compress_writes = 0;
+ stats->log_compress_write_fails = 0;
+ stats->log_compress_small = 0;
+ stats->log_release_write_lsn = 0;
+ stats->log_scans = 0;
+ stats->log_scan_rereads = 0;
+ stats->log_write_lsn = 0;
+ stats->log_sync = 0;
+ stats->log_sync_dir = 0;
+ stats->log_writes = 0;
+ stats->log_slot_consolidated = 0;
+ /* not clearing log_max_filesize */
+ /* not clearing log_prealloc_max */
+ stats->log_prealloc_files = 0;
+ stats->log_prealloc_used = 0;
+ stats->log_scan_records = 0;
+ stats->log_compress_mem = 0;
+ /* not clearing log_buffer_size */
+ stats->log_compress_len = 0;
+ stats->log_slot_coalesced = 0;
+ stats->log_close_yields = 0;
+ /* not clearing lsm_work_queue_app */
+ /* not clearing lsm_work_queue_manager */
+ stats->lsm_rows_merged = 0;
+ stats->lsm_checkpoint_throttle = 0;
+ stats->lsm_merge_throttle = 0;
+ /* not clearing lsm_work_queue_switch */
+ stats->lsm_work_units_discarded = 0;
+ stats->lsm_work_units_done = 0;
+ stats->lsm_work_units_created = 0;
+ stats->lsm_work_queue_max = 0;
+ stats->rec_pages = 0;
+ stats->rec_pages_eviction = 0;
+ /* not clearing rec_split_stashed_bytes */
+ /* not clearing rec_split_stashed_objects */
+ /* not clearing session_cursor_open */
+ /* not clearing session_open */
+ stats->page_busy_blocked = 0;
+ stats->page_forcible_evict_blocked = 0;
+ stats->page_locked_blocked = 0;
+ stats->page_read_blocked = 0;
+ stats->page_sleep = 0;
+ stats->txn_begin = 0;
+ /* not clearing txn_checkpoint_running */
+ /* not clearing txn_checkpoint_generation */
+ /* not clearing txn_checkpoint_time_max */
+ /* not clearing txn_checkpoint_time_min */
+ /* not clearing txn_checkpoint_time_recent */
+ /* not clearing txn_checkpoint_time_total */
+ stats->txn_checkpoint = 0;
+ stats->txn_fail_cache = 0;
+ /* not clearing txn_pinned_range */
+ /* not clearing txn_pinned_checkpoint_range */
+ stats->txn_sync = 0;
+ stats->txn_commit = 0;
+ stats->txn_rollback = 0;
+}
+
+void
+__wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats)
+{
+ u_int i;
- stats = (WT_CONNECTION_STATS *)stats_arg;
- stats->async_cur_queue.v = 0;
- stats->async_alloc_race.v = 0;
- stats->async_flush.v = 0;
- stats->async_alloc_view.v = 0;
- stats->async_full.v = 0;
- stats->async_nowork.v = 0;
- stats->async_op_alloc.v = 0;
- stats->async_op_compact.v = 0;
- stats->async_op_insert.v = 0;
- stats->async_op_remove.v = 0;
- stats->async_op_search.v = 0;
- stats->async_op_update.v = 0;
- stats->block_preload.v = 0;
- stats->block_read.v = 0;
- stats->block_write.v = 0;
- stats->block_byte_read.v = 0;
- stats->block_byte_write.v = 0;
- stats->block_map_read.v = 0;
- stats->block_byte_map_read.v = 0;
- stats->cache_bytes_read.v = 0;
- stats->cache_bytes_write.v = 0;
- stats->cache_eviction_checkpoint.v = 0;
- stats->cache_eviction_queue_empty.v = 0;
- stats->cache_eviction_queue_not_empty.v = 0;
- stats->cache_eviction_server_evicting.v = 0;
- stats->cache_eviction_server_not_evicting.v = 0;
- stats->cache_eviction_slow.v = 0;
- stats->cache_eviction_worker_evicting.v = 0;
- stats->cache_eviction_force_fail.v = 0;
- stats->cache_eviction_hazard.v = 0;
- stats->cache_inmem_split.v = 0;
- stats->cache_eviction_internal.v = 0;
- stats->cache_eviction_dirty.v = 0;
- stats->cache_eviction_deepen.v = 0;
- stats->cache_eviction_force.v = 0;
- stats->cache_eviction_force_delete.v = 0;
- stats->cache_eviction_app.v = 0;
- stats->cache_read.v = 0;
- stats->cache_eviction_fail.v = 0;
- stats->cache_eviction_split.v = 0;
- stats->cache_eviction_walk.v = 0;
- stats->cache_write.v = 0;
- stats->cache_eviction_clean.v = 0;
- stats->memory_allocation.v = 0;
- stats->memory_free.v = 0;
- stats->memory_grow.v = 0;
- stats->cond_wait.v = 0;
- stats->rwlock_read.v = 0;
- stats->rwlock_write.v = 0;
- stats->read_io.v = 0;
- stats->write_io.v = 0;
- stats->cursor_create.v = 0;
- stats->cursor_insert.v = 0;
- stats->cursor_next.v = 0;
- stats->cursor_prev.v = 0;
- stats->cursor_remove.v = 0;
- stats->cursor_reset.v = 0;
- stats->cursor_search.v = 0;
- stats->cursor_search_near.v = 0;
- stats->cursor_update.v = 0;
- stats->dh_conn_ref.v = 0;
- stats->dh_conn_handles.v = 0;
- stats->dh_conn_sweeps.v = 0;
- stats->dh_conn_tod.v = 0;
- stats->dh_session_handles.v = 0;
- stats->dh_session_sweeps.v = 0;
- stats->log_slot_closes.v = 0;
- stats->log_slot_races.v = 0;
- stats->log_slot_transitions.v = 0;
- stats->log_slot_joins.v = 0;
- stats->log_slot_toosmall.v = 0;
- stats->log_bytes_payload.v = 0;
- stats->log_bytes_written.v = 0;
- stats->log_compress_writes.v = 0;
- stats->log_compress_write_fails.v = 0;
- stats->log_compress_small.v = 0;
- stats->log_release_write_lsn.v = 0;
- stats->log_scans.v = 0;
- stats->log_scan_rereads.v = 0;
- stats->log_write_lsn.v = 0;
- stats->log_sync.v = 0;
- stats->log_sync_dir.v = 0;
- stats->log_writes.v = 0;
- stats->log_slot_consolidated.v = 0;
- stats->log_prealloc_files.v = 0;
- stats->log_prealloc_used.v = 0;
- stats->log_slot_toobig.v = 0;
- stats->log_scan_records.v = 0;
- stats->log_compress_mem.v = 0;
- stats->log_compress_len.v = 0;
- stats->log_slot_coalesced.v = 0;
- stats->log_close_yields.v = 0;
- stats->lsm_rows_merged.v = 0;
- stats->lsm_checkpoint_throttle.v = 0;
- stats->lsm_merge_throttle.v = 0;
- stats->lsm_work_units_discarded.v = 0;
- stats->lsm_work_units_done.v = 0;
- stats->lsm_work_units_created.v = 0;
- stats->lsm_work_queue_max.v = 0;
- stats->rec_pages.v = 0;
- stats->rec_pages_eviction.v = 0;
- stats->page_busy_blocked.v = 0;
- stats->page_forcible_evict_blocked.v = 0;
- stats->page_locked_blocked.v = 0;
- stats->page_read_blocked.v = 0;
- stats->page_sleep.v = 0;
- stats->txn_begin.v = 0;
- stats->txn_checkpoint.v = 0;
- stats->txn_fail_cache.v = 0;
- stats->txn_sync.v = 0;
- stats->txn_commit.v = 0;
- stats->txn_rollback.v = 0;
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i)
+ __wt_stat_connection_clear_single(stats[i]);
+}
+
+void
+__wt_stat_connection_aggregate(
+ WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to)
+{
+ to->async_cur_queue += WT_STAT_READ(from, async_cur_queue);
+ to->async_max_queue += WT_STAT_READ(from, async_max_queue);
+ to->async_alloc_race += WT_STAT_READ(from, async_alloc_race);
+ to->async_flush += WT_STAT_READ(from, async_flush);
+ to->async_alloc_view += WT_STAT_READ(from, async_alloc_view);
+ to->async_full += WT_STAT_READ(from, async_full);
+ to->async_nowork += WT_STAT_READ(from, async_nowork);
+ to->async_op_alloc += WT_STAT_READ(from, async_op_alloc);
+ to->async_op_compact += WT_STAT_READ(from, async_op_compact);
+ to->async_op_insert += WT_STAT_READ(from, async_op_insert);
+ to->async_op_remove += WT_STAT_READ(from, async_op_remove);
+ to->async_op_search += WT_STAT_READ(from, async_op_search);
+ to->async_op_update += WT_STAT_READ(from, async_op_update);
+ to->block_preload += WT_STAT_READ(from, block_preload);
+ to->block_read += WT_STAT_READ(from, block_read);
+ to->block_write += WT_STAT_READ(from, block_write);
+ to->block_byte_read += WT_STAT_READ(from, block_byte_read);
+ to->block_byte_write += WT_STAT_READ(from, block_byte_write);
+ to->block_map_read += WT_STAT_READ(from, block_map_read);
+ to->block_byte_map_read += WT_STAT_READ(from, block_byte_map_read);
+ to->cache_bytes_inuse += WT_STAT_READ(from, cache_bytes_inuse);
+ to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read);
+ to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write);
+ to->cache_eviction_checkpoint +=
+ WT_STAT_READ(from, cache_eviction_checkpoint);
+ to->cache_eviction_queue_empty +=
+ WT_STAT_READ(from, cache_eviction_queue_empty);
+ to->cache_eviction_queue_not_empty +=
+ WT_STAT_READ(from, cache_eviction_queue_not_empty);
+ to->cache_eviction_server_evicting +=
+ WT_STAT_READ(from, cache_eviction_server_evicting);
+ to->cache_eviction_server_not_evicting +=
+ WT_STAT_READ(from, cache_eviction_server_not_evicting);
+ to->cache_eviction_slow += WT_STAT_READ(from, cache_eviction_slow);
+ to->cache_eviction_worker_evicting +=
+ WT_STAT_READ(from, cache_eviction_worker_evicting);
+ to->cache_eviction_force_fail +=
+ WT_STAT_READ(from, cache_eviction_force_fail);
+ to->cache_eviction_hazard +=
+ WT_STAT_READ(from, cache_eviction_hazard);
+ to->cache_inmem_splittable +=
+ WT_STAT_READ(from, cache_inmem_splittable);
+ to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
+ to->cache_eviction_internal +=
+ WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_lookaside_insert +=
+ WT_STAT_READ(from, cache_lookaside_insert);
+ to->cache_lookaside_remove +=
+ WT_STAT_READ(from, cache_lookaside_remove);
+ to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max);
+ to->cache_eviction_maximum_page_size +=
+ WT_STAT_READ(from, cache_eviction_maximum_page_size);
+ to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
+ to->cache_eviction_deepen +=
+ WT_STAT_READ(from, cache_eviction_deepen);
+ to->cache_write_lookaside +=
+ WT_STAT_READ(from, cache_write_lookaside);
+ to->cache_pages_inuse += WT_STAT_READ(from, cache_pages_inuse);
+ to->cache_eviction_force += WT_STAT_READ(from, cache_eviction_force);
+ to->cache_eviction_force_delete +=
+ WT_STAT_READ(from, cache_eviction_force_delete);
+ to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app);
+ to->cache_read += WT_STAT_READ(from, cache_read);
+ to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
+ to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
+ to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
+ to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk);
+ to->cache_write += WT_STAT_READ(from, cache_write);
+ to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
+ to->cache_overhead += WT_STAT_READ(from, cache_overhead);
+ to->cache_bytes_internal += WT_STAT_READ(from, cache_bytes_internal);
+ to->cache_bytes_leaf += WT_STAT_READ(from, cache_bytes_leaf);
+ to->cache_bytes_overflow += WT_STAT_READ(from, cache_bytes_overflow);
+ to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty);
+ to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty);
+ to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
+ to->file_open += WT_STAT_READ(from, file_open);
+ to->memory_allocation += WT_STAT_READ(from, memory_allocation);
+ to->memory_free += WT_STAT_READ(from, memory_free);
+ to->memory_grow += WT_STAT_READ(from, memory_grow);
+ to->cond_wait += WT_STAT_READ(from, cond_wait);
+ to->rwlock_read += WT_STAT_READ(from, rwlock_read);
+ to->rwlock_write += WT_STAT_READ(from, rwlock_write);
+ to->read_io += WT_STAT_READ(from, read_io);
+ to->write_io += WT_STAT_READ(from, write_io);
+ to->cursor_create += WT_STAT_READ(from, cursor_create);
+ to->cursor_insert += WT_STAT_READ(from, cursor_insert);
+ to->cursor_next += WT_STAT_READ(from, cursor_next);
+ to->cursor_prev += WT_STAT_READ(from, cursor_prev);
+ to->cursor_remove += WT_STAT_READ(from, cursor_remove);
+ to->cursor_reset += WT_STAT_READ(from, cursor_reset);
+ to->cursor_restart += WT_STAT_READ(from, cursor_restart);
+ to->cursor_search += WT_STAT_READ(from, cursor_search);
+ to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
+ to->cursor_update += WT_STAT_READ(from, cursor_update);
+ to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count);
+ to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref);
+ to->dh_sweep_close += WT_STAT_READ(from, dh_sweep_close);
+ to->dh_sweep_remove += WT_STAT_READ(from, dh_sweep_remove);
+ to->dh_sweep_tod += WT_STAT_READ(from, dh_sweep_tod);
+ to->dh_sweeps += WT_STAT_READ(from, dh_sweeps);
+ to->dh_session_handles += WT_STAT_READ(from, dh_session_handles);
+ to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps);
+ to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy);
+ to->log_slot_closes += WT_STAT_READ(from, log_slot_closes);
+ to->log_slot_races += WT_STAT_READ(from, log_slot_races);
+ to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions);
+ to->log_slot_joins += WT_STAT_READ(from, log_slot_joins);
+ to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered);
+ to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload);
+ to->log_bytes_written += WT_STAT_READ(from, log_bytes_written);
+ to->log_compress_writes += WT_STAT_READ(from, log_compress_writes);
+ to->log_compress_write_fails +=
+ WT_STAT_READ(from, log_compress_write_fails);
+ to->log_compress_small += WT_STAT_READ(from, log_compress_small);
+ to->log_release_write_lsn +=
+ WT_STAT_READ(from, log_release_write_lsn);
+ to->log_scans += WT_STAT_READ(from, log_scans);
+ to->log_scan_rereads += WT_STAT_READ(from, log_scan_rereads);
+ to->log_write_lsn += WT_STAT_READ(from, log_write_lsn);
+ to->log_sync += WT_STAT_READ(from, log_sync);
+ to->log_sync_dir += WT_STAT_READ(from, log_sync_dir);
+ to->log_writes += WT_STAT_READ(from, log_writes);
+ to->log_slot_consolidated +=
+ WT_STAT_READ(from, log_slot_consolidated);
+ to->log_max_filesize += WT_STAT_READ(from, log_max_filesize);
+ to->log_prealloc_max += WT_STAT_READ(from, log_prealloc_max);
+ to->log_prealloc_files += WT_STAT_READ(from, log_prealloc_files);
+ to->log_prealloc_used += WT_STAT_READ(from, log_prealloc_used);
+ to->log_scan_records += WT_STAT_READ(from, log_scan_records);
+ to->log_compress_mem += WT_STAT_READ(from, log_compress_mem);
+ to->log_buffer_size += WT_STAT_READ(from, log_buffer_size);
+ to->log_compress_len += WT_STAT_READ(from, log_compress_len);
+ to->log_slot_coalesced += WT_STAT_READ(from, log_slot_coalesced);
+ to->log_close_yields += WT_STAT_READ(from, log_close_yields);
+ to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app);
+ to->lsm_work_queue_manager +=
+ WT_STAT_READ(from, lsm_work_queue_manager);
+ to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged);
+ to->lsm_checkpoint_throttle +=
+ WT_STAT_READ(from, lsm_checkpoint_throttle);
+ to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle);
+ to->lsm_work_queue_switch +=
+ WT_STAT_READ(from, lsm_work_queue_switch);
+ to->lsm_work_units_discarded +=
+ WT_STAT_READ(from, lsm_work_units_discarded);
+ to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done);
+ to->lsm_work_units_created +=
+ WT_STAT_READ(from, lsm_work_units_created);
+ to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max);
+ to->rec_pages += WT_STAT_READ(from, rec_pages);
+ to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction);
+ to->rec_split_stashed_bytes +=
+ WT_STAT_READ(from, rec_split_stashed_bytes);
+ to->rec_split_stashed_objects +=
+ WT_STAT_READ(from, rec_split_stashed_objects);
+ to->session_cursor_open += WT_STAT_READ(from, session_cursor_open);
+ to->session_open += WT_STAT_READ(from, session_open);
+ to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked);
+ to->page_forcible_evict_blocked +=
+ WT_STAT_READ(from, page_forcible_evict_blocked);
+ to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked);
+ to->page_read_blocked += WT_STAT_READ(from, page_read_blocked);
+ to->page_sleep += WT_STAT_READ(from, page_sleep);
+ to->txn_begin += WT_STAT_READ(from, txn_begin);
+ to->txn_checkpoint_running +=
+ WT_STAT_READ(from, txn_checkpoint_running);
+ to->txn_checkpoint_generation +=
+ WT_STAT_READ(from, txn_checkpoint_generation);
+ to->txn_checkpoint_time_max +=
+ WT_STAT_READ(from, txn_checkpoint_time_max);
+ to->txn_checkpoint_time_min +=
+ WT_STAT_READ(from, txn_checkpoint_time_min);
+ to->txn_checkpoint_time_recent +=
+ WT_STAT_READ(from, txn_checkpoint_time_recent);
+ to->txn_checkpoint_time_total +=
+ WT_STAT_READ(from, txn_checkpoint_time_total);
+ to->txn_checkpoint += WT_STAT_READ(from, txn_checkpoint);
+ to->txn_fail_cache += WT_STAT_READ(from, txn_fail_cache);
+ to->txn_pinned_range += WT_STAT_READ(from, txn_pinned_range);
+ to->txn_pinned_checkpoint_range +=
+ WT_STAT_READ(from, txn_pinned_checkpoint_range);
+ to->txn_sync += WT_STAT_READ(from, txn_sync);
+ to->txn_commit += WT_STAT_READ(from, txn_commit);
+ to->txn_rollback += WT_STAT_READ(from, txn_rollback);
}
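
The rewritten aggregation above reflects the new split-statistics layout:
each handle keeps WT_COUNTER_SLOTS copies of a structure of plain int64_t
counters (note __wt_stat_connection_init pointing handle->stats[i] at
handle->stat_array[i]), presumably so that concurrent updaters touch
different cache lines, and readers fold the copies back together. A minimal
self-contained sketch of that read path, assuming WT_STAT_READ simply sums
one field across every slot; SLOTS, toy_stats and stat_read are
illustrative names, not WiredTiger identifiers:

	#include <stddef.h>
	#include <stdint.h>

	#define SLOTS 23		/* stand-in for WT_COUNTER_SLOTS */

	struct toy_stats {		/* one copy exists per slot */
		int64_t cursor_insert;
		int64_t cursor_remove;
	};

	/* Sum one counter across every slot, as WT_STAT_READ is used above. */
	static int64_t
	stat_read(struct toy_stats **stats, size_t offset)
	{
		int64_t sum = 0;

		for (int i = 0; i < SLOTS; ++i)
			sum += *(int64_t *)((char *)stats[i] + offset);
		return (sum);
	}

	/* Usage: stat_read(stats, offsetof(struct toy_stats, cursor_insert)); */
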
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 9e8def39fb0..e81f8a68251 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -134,7 +134,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
if ((count = txn_global->scan_count) < 0)
WT_PAUSE();
} while (count < 0 ||
- !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+ !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));
current_id = snap_min = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -147,7 +147,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
/* Check that the oldest ID has not moved in the meantime. */
if (prev_oldest_id == txn_global->oldest_id) {
WT_ASSERT(session, txn_global->scan_count > 0);
- (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
return;
}
}
@@ -182,12 +182,8 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
txn_state->snap_min = snap_min;
- /* Update the last running ID if we have a much newer value. */
- if (snap_min > txn_global->last_running + 100)
- txn_global->last_running = snap_min;
-
WT_ASSERT(session, txn_global->scan_count > 0);
- (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
__txn_sort_snapshot(session, n, current_id);
}
@@ -212,7 +208,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
WT_SESSION_IMPL *oldest_session;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s;
- uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min;
+ uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
uint32_t i, session_cnt;
int32_t count;
int last_running_moved;
@@ -220,7 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
conn = S2C(session);
txn_global = &conn->txn_global;
- current_id = snap_min = txn_global->current;
+ current_id = last_running = txn_global->current;
oldest_session = NULL;
prev_oldest_id = txn_global->oldest_id;
@@ -241,11 +237,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
if ((count = txn_global->scan_count) < 0)
WT_PAUSE();
} while (count < 0 ||
- !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+ !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));
/* The oldest ID cannot change until the scan count goes to zero. */
prev_oldest_id = txn_global->oldest_id;
- current_id = oldest_id = snap_min = txn_global->current;
+ current_id = oldest_id = last_running = txn_global->current;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -260,8 +256,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
*/
if ((id = s->id) != WT_TXN_NONE &&
WT_TXNID_LE(prev_oldest_id, id) &&
- WT_TXNID_LT(id, snap_min))
- snap_min = id;
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
/*
* !!!
@@ -278,8 +274,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
}
}
- if (WT_TXNID_LT(snap_min, oldest_id))
- oldest_id = snap_min;
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
/* The oldest ID can't move past any named snapshots. */
if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
@@ -287,25 +283,25 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
oldest_id = id;
/* Update the last running ID. */
- if (WT_TXNID_LT(txn_global->last_running, snap_min)) {
- txn_global->last_running = snap_min;
- last_running_moved = 1;
- } else
- last_running_moved = 0;
+ last_running_moved =
+ WT_TXNID_LT(txn_global->last_running, last_running);
/* Update the oldest ID. */
- if (WT_TXNID_LT(prev_oldest_id, oldest_id) &&
- WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
+ if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
+ __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
if ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
if ((id = s->snap_min) != WT_TXN_NONE &&
WT_TXNID_LT(id, oldest_id))
oldest_id = id;
}
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
+
#ifdef HAVE_DIAGNOSTIC
/*
* Make sure the ID doesn't move past any named snapshots.
@@ -318,8 +314,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
WT_ASSERT(session,
id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
+ if (WT_TXNID_LT(txn_global->last_running, last_running))
+ txn_global->last_running = last_running;
if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
txn_global->oldest_id = oldest_id;
+ WT_ASSERT(session, txn_global->scan_count == -1);
txn_global->scan_count = 0;
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
@@ -334,7 +333,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
oldest_session->txn.snap_min);
}
WT_ASSERT(session, txn_global->scan_count > 0);
- (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
}
}
@@ -400,7 +399,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- int was_oldest;
txn = &session->txn;
WT_ASSERT(session, txn->mod_count == 0);
@@ -408,7 +406,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn_global = &S2C(session)->txn_global;
txn_state = WT_SESSION_TXN_STATE(session);
- was_oldest = 0;
/* Clear the transaction's ID from the global table. */
if (WT_SESSION_IS_CHECKPOINT(session)) {
@@ -419,12 +416,12 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn_global->checkpoint_id = 0;
txn_global->checkpoint_pinned = WT_TXN_NONE;
} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
+ WT_ASSERT(session,
+ !WT_TXNID_LT(txn->id, txn_global->last_running));
+
WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
txn->id != WT_TXN_NONE);
WT_PUBLISH(txn_state->id, WT_TXN_NONE);
-
- /* Quick check for the oldest transaction. */
- was_oldest = (txn->id == txn_global->last_running);
txn->id = WT_TXN_NONE;
}
@@ -443,14 +440,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn->isolation = session->isolation;
/* Ensure the transaction flags are cleared on exit */
txn->flags = 0;
-
- /*
- * When the oldest transaction in the system completes, bump the oldest
- * ID. This is racy and so not guaranteed, but in practice it keeps
- * the oldest ID from falling too far behind.
- */
- if (was_oldest)
- __wt_txn_update_oldest(session, 1);
}
/*
@@ -469,7 +458,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn = &session->txn;
conn = S2C(session);
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR));
+ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
if (!F_ISSET(txn, WT_TXN_RUNNING))
WT_RET_MSG(session, EINVAL, "No transaction is active");
@@ -593,6 +582,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
switch (op->type) {
case WT_TXN_OP_BASIC:
case WT_TXN_OP_INMEM:
+ WT_ASSERT(session, op->u.upd->txnid == txn->id);
op->u.upd->txnid = WT_TXN_ABORTED;
break;
case WT_TXN_OP_REF:
@@ -660,20 +650,29 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
{
WT_TXN_GLOBAL *txn_global;
WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS *stats;
+ WT_CONNECTION_STATS **stats;
uint64_t checkpoint_pinned;
conn = S2C(session);
txn_global = &conn->txn_global;
- stats = &conn->stats;
+ stats = conn->stats;
checkpoint_pinned = txn_global->checkpoint_pinned;
- WT_STAT_SET(stats, txn_pinned_range,
- txn_global->current - txn_global->oldest_id);
+ WT_STAT_SET(session, stats, txn_pinned_range,
+ txn_global->current - txn_global->oldest_id);
- WT_STAT_SET(stats, txn_pinned_checkpoint_range,
+ WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
checkpoint_pinned == WT_TXN_NONE ?
0 : txn_global->current - checkpoint_pinned);
+
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
}
/*
@@ -712,10 +711,11 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_rwlock_alloc(session,
&txn_global->nsnap_rwlock, "named snapshot lock"));
txn_global->nsnap_oldest_id = WT_TXN_NONE;
- STAILQ_INIT(&txn_global->nsnaph);
+ TAILQ_INIT(&txn_global->nsnaph);
WT_RET(__wt_calloc_def(
session, conn->session_size, &txn_global->states));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states);
for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
s->id = s->snap_min = WT_TXN_NONE;
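
The scan_count handshake in the hunks above is a small reader/writer gate:
snapshot scans spin-increment the count while it is non-negative, and
__wt_txn_update_oldest takes it exclusively by CASing 1 to -1 before
publishing new oldest/last-running IDs. A self-contained sketch of the same
pattern in C11 atomics, on the assumption that __wt_atomic_casiv32 and
__wt_atomic_subiv32 wrap equivalent compare-and-swap and subtract
primitives:

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int scan_count;	/* >= 0: readers; -1: exclusive */

	static void
	scan_enter(void)		/* shared, as in __wt_txn_get_snapshot */
	{
		int count;

		do			/* the real code WT_PAUSEs here */
			count = atomic_load(&scan_count);
		while (count < 0 ||
		    !atomic_compare_exchange_weak(&scan_count, &count, count + 1));
	}

	static void
	scan_leave(void)
	{
		(void)atomic_fetch_sub(&scan_count, 1);
	}

	static bool
	scan_try_exclusive(void)	/* only succeeds for the last reader */
	{
		int expected = 1;

		return (atomic_compare_exchange_strong(&scan_count, &expected, -1));
	}
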
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 49fcd69ffed..9f59c53314e 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -246,6 +246,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:"));
+ /* Skip files that are never involved in a checkpoint. */
+ if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT))
+ return (0);
+
/* Make sure there is space for the next entry. */
WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
session->ckpt_handle_next + 1, &session->ckpt_handle));
@@ -285,19 +289,22 @@ static void
__checkpoint_stats(
WT_SESSION_IMPL *session, struct timespec *start, struct timespec *stop)
{
+ WT_CONNECTION_IMPL *conn;
uint64_t msec;
+ conn = S2C(session);
+
/*
	 * Get the time difference in milliseconds.
*/
msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION;
- if (msec > WT_CONN_STAT(session, txn_checkpoint_time_max))
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_max, msec);
- if (WT_CONN_STAT(session, txn_checkpoint_time_min) == 0 ||
- msec < WT_CONN_STAT(session, txn_checkpoint_time_min))
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_min, msec);
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_recent, msec);
- WT_STAT_FAST_CONN_INCRV(session, txn_checkpoint_time_total, msec);
+
+ if (msec > conn->ckpt_time_max)
+ conn->ckpt_time_max = msec;
+ if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min)
+ conn->ckpt_time_min = msec;
+ conn->ckpt_time_recent = msec;
+ conn->ckpt_time_total += msec;
}
/*
@@ -1161,9 +1168,17 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final)
btree = S2BT(session);
bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0;
- /* If the handle is already dead, force the discard. */
+ /*
+ * If the handle is already dead or the file isn't durable, force the
+ * discard.
+ *
+	 * If the file isn't durable, mark the handle dead; there are asserts
+	 * later on checking that only dead handles can have modified pages.
+ */
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(session->dhandle, WT_DHANDLE_DEAD);
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
- return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE));
+ return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
/*
* If closing an unmodified file, check that no update is required
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index 0d66eccd7dc..a63720d736f 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -33,18 +33,7 @@ __txn_op_log(WT_SESSION_IMPL *session,
* 3) row store remove; or
* 4) row store insert/update.
*/
- if (cbt->btree->type != BTREE_ROW) {
- WT_ASSERT(session, cbt->ins != NULL);
- recno = WT_INSERT_RECNO(cbt->ins);
- WT_ASSERT(session, recno != 0);
-
- if (WT_UPDATE_DELETED_ISSET(upd))
- WT_ERR(__wt_logop_col_remove_pack(session, logrec,
- op->fileid, recno));
- else
- WT_ERR(__wt_logop_col_put_pack(session, logrec,
- op->fileid, recno, &value));
- } else {
+ if (cbt->btree->type == BTREE_ROW) {
WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));
if (WT_UPDATE_DELETED_ISSET(upd))
@@ -53,6 +42,16 @@ __txn_op_log(WT_SESSION_IMPL *session,
else
WT_ERR(__wt_logop_row_put_pack(session, logrec,
op->fileid, &key, &value));
+ } else {
+ recno = WT_INSERT_RECNO(cbt->ins);
+ WT_ASSERT(session, recno != WT_RECNO_OOB);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_col_remove_pack(session, logrec,
+ op->fileid, recno));
+ else
+ WT_ERR(__wt_logop_col_put_pack(session, logrec,
+ op->fileid, recno, &value));
}
err: __wt_buf_free(session, &key);
@@ -308,7 +307,7 @@ __wt_txn_checkpoint_log(
switch (flags) {
case WT_TXN_LOG_CKPT_PREPARE:
txn->full_ckpt = 1;
- *ckpt_lsn = S2C(session)->log->write_start_lsn;
+ WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
/*
* We need to make sure that the log records in the checkpoint
* LSN are on disk. In particular to make sure that the
@@ -337,7 +336,7 @@ __wt_txn_checkpoint_log(
txn->ckpt_nsnapshot = 0;
WT_CLEAR(empty);
ckpt_snapshot = &empty;
- *ckpt_lsn = S2C(session)->log->write_start_lsn;
+ WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
} else
ckpt_snapshot = txn->ckpt_snapshot;
@@ -419,9 +418,9 @@ __wt_txn_truncate_log(
} else {
op->type = WT_TXN_OP_TRUNCATE_COL;
op->u.truncate_col.start =
- (start == NULL) ? 0 : start->recno;
+ (start == NULL) ? WT_RECNO_OOB : start->recno;
op->u.truncate_col.stop =
- (stop == NULL) ? 0 : stop->recno;
+ (stop == NULL) ? WT_RECNO_OOB : stop->recno;
}
/* Write that operation into the in-memory log. */
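
Replacing the bare zeroes with WT_RECNO_OOB documents that the value is out-of-band: no valid column-store record carries it, so it can mean "unbounded" in a truncate range, and the recovery hunk further down decodes it the same way. A small sketch of the sentinel pattern with hypothetical names; the value 0 is an assumption based on how the recovery code tests it:

#include <stddef.h>
#include <stdint.h>

#define MY_RECNO_OOB	0	/* Out-of-band: no valid record number */

struct truncate_range {
	uint64_t start;		/* MY_RECNO_OOB: from the first record */
	uint64_t stop;		/* MY_RECNO_OOB: through the last record */
};

static void
set_range(struct truncate_range *r,
    const uint64_t *startp, const uint64_t *stopp)
{
	r->start = (startp == NULL) ? MY_RECNO_OOB : *startp;
	r->stop = (stopp == NULL) ? MY_RECNO_OOB : *stopp;
}
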
diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c
index bd352c2237e..be736cc1c98 100644
--- a/src/txn/txn_nsnap.c
+++ b/src/txn/txn_nsnap.c
@@ -34,7 +34,7 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
txn_global = &S2C(session)->txn_global;
- STAILQ_FOREACH(found, &txn_global->nsnaph, q)
+ TAILQ_FOREACH(found, &txn_global->nsnaph, q)
if (WT_STRING_MATCH(found->name, name->str, name->len))
break;
@@ -42,10 +42,10 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
return (WT_NOTFOUND);
/* Bump the global ID if we are removing the first entry */
- if (found == STAILQ_FIRST(&txn_global->nsnaph))
- txn_global->nsnap_oldest_id = (STAILQ_NEXT(found, q) != NULL) ?
- STAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE;
- STAILQ_REMOVE(&txn_global->nsnaph, found, __wt_named_snapshot, q);
+ if (found == TAILQ_FIRST(&txn_global->nsnaph))
+ txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ?
+ TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE;
+ TAILQ_REMOVE(&txn_global->nsnaph, found, q);
__nsnap_destroy(session, found);
return (ret);
@@ -67,7 +67,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive)
last = nsnap = prev = NULL;
txn_global = &S2C(session)->txn_global;
- if (STAILQ_EMPTY(&txn_global->nsnaph)) {
+ if (TAILQ_EMPTY(&txn_global->nsnaph)) {
if (name == NULL)
return (0);
/*
@@ -85,7 +85,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive)
*/
new_nsnap_oldest = WT_TXN_NONE;
if (name != NULL) {
- STAILQ_FOREACH(last, &txn_global->nsnaph, q) {
+ TAILQ_FOREACH(last, &txn_global->nsnaph, q) {
if (WT_STRING_MATCH(last->name, name->str, name->len))
break;
prev = last;
@@ -102,17 +102,17 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive)
last = prev;
}
- if (STAILQ_NEXT(last, q) != NULL)
- new_nsnap_oldest = STAILQ_NEXT(last, q)->snap_min;
+ if (TAILQ_NEXT(last, q) != NULL)
+ new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min;
}
do {
- nsnap = STAILQ_FIRST(&txn_global->nsnaph);
+ nsnap = TAILQ_FIRST(&txn_global->nsnaph);
WT_ASSERT(session, nsnap != NULL);
- STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q);
+ TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
__nsnap_destroy(session, nsnap);
/* Last will be NULL in the drop-all case, so it will never match */
- } while (nsnap != last && !STAILQ_EMPTY(&txn_global->nsnaph));
+ } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph));
/* Now that the queue of named snapshots is updated, update the ID */
txn_global->nsnap_oldest_id = new_nsnap_oldest;
@@ -173,9 +173,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval));
- if (STAILQ_EMPTY(&txn_global->nsnaph))
+ if (TAILQ_EMPTY(&txn_global->nsnaph))
txn_global->nsnap_oldest_id = nsnap_new->snap_min;
- STAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
+ TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
nsnap_new = NULL;
err: if (started_txn)
@@ -254,7 +254,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval)
WT_RET(__wt_session_copy_values(session));
WT_RET(__wt_readlock(session, txn_global->nsnap_rwlock));
- STAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
+ TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) {
txn->snap_min = txn_state->snap_min = nsnap->snap_min;
txn->snap_max = nsnap->snap_max;
@@ -358,10 +358,8 @@ __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session)
txn_global = &S2C(session)->txn_global;
txn_global->nsnap_oldest_id = WT_TXN_NONE;
- while (!STAILQ_EMPTY(&txn_global->nsnaph)) {
- nsnap = STAILQ_FIRST(&txn_global->nsnaph);
- WT_ASSERT(session, nsnap != NULL);
- STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q);
+ while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) {
+ TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
__nsnap_destroy(session, nsnap);
}
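
The STAILQ-to-TAILQ conversion throughout this file trades a singly-linked tail queue for a doubly-linked one, so any element can be removed with TAILQ_REMOVE instead of only the head, which is what the rewritten destroy loop above relies on. A minimal, self-contained example of the <sys/queue.h> macros used here (the element type is hypothetical):

#include <stdlib.h>
#include <sys/queue.h>

struct snap {
	int id;
	TAILQ_ENTRY(snap) q;		/* Linkage within the queue */
};
TAILQ_HEAD(snap_list, snap);

int
main(void)
{
	struct snap_list head;
	struct snap *s;

	TAILQ_INIT(&head);
	if ((s = calloc(1, sizeof(*s))) == NULL)
		return (1);
	TAILQ_INSERT_TAIL(&head, s, q);

	/* Unlike STAILQ, any element can be unlinked directly. */
	TAILQ_REMOVE(&head, s, q);
	free(s);
	return (0);
}
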
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index 0eadcbf3b01..240d0a5ffd3 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
"No file found with ID %u (max %u)",
id, r->nfiles));
r->missing = 1;
- } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) {
/*
* We're going to apply the operation. Get the cursor, opening
* one if none is cached.
@@ -144,10 +144,10 @@ __txn_op_apply(
GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
/* Set up the cursors. */
- if (start_recno == 0) {
+ if (start_recno == WT_RECNO_OOB) {
start = NULL;
stop = cursor;
- } else if (stop_recno == 0) {
+ } else if (stop_recno == WT_RECNO_OOB) {
start = cursor;
stop = NULL;
} else {
@@ -522,7 +522,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
*/
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
-done:
+done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
err: WT_TRET(__recovery_free(&r));
__wt_free(session, config);
WT_TRET(session->iface.close(&session->iface, NULL));
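
Recovery now compares log positions through the function __wt_log_cmp rather than the old WT_LOG_CMP macro. Assuming an LSN is a (file number, offset) pair, as elsewhere in the logging code, the comparison is lexicographic; a hedged sketch, where the structure is illustrative rather than WiredTiger's WT_LSN:

#include <stdint.h>

struct my_lsn {
	uint32_t file;		/* Log file number */
	uint64_t offset;	/* Byte offset within the file */
};

/* Return -1, 0 or 1 ordering two log sequence numbers. */
static int
my_lsn_cmp(const struct my_lsn *a, const struct my_lsn *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}
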
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index 1888c7d967b..1d35f2efc72 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -97,12 +97,15 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
}
/*
- * XXX
- * We don't normally say anything about the WiredTiger
- * metadata, it's not a normal "object" in the database. I'm
- * making an exception for the checkpoint and verbose options.
+ * !!!
+ * We don't normally say anything about the WiredTiger metadata
+ * and lookaside tables; they're not application/user "objects"
+ * in the database. I'm making an exception for the checkpoint
+ * and verbose options.
*/
- if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
+ if (cflag || vflag ||
+ (strcmp(key, WT_METADATA_URI) != 0 &&
+ strcmp(key, WT_LAS_URI) != 0))
printf("%s\n", key);
if (!cflag && !vflag)
diff --git a/test/checkpoint/checkpointer.c b/test/checkpoint/checkpointer.c
index dd6fcd6b95a..c4f36ac69ba 100644
--- a/test/checkpoint/checkpointer.c
+++ b/test/checkpoint/checkpointer.c
@@ -134,8 +134,7 @@ done: if ((ret = session->close(session, NULL)) != 0)
/*
* verify_checkpoint --
* Open a cursor on each table at the last checkpoint and walk through
- * the tables in parallel. The key/values should match across all
- * tables.
+ * the tables in parallel. The key/values should match across all tables.
*/
static int
verify_checkpoint(WT_SESSION *session)
@@ -245,41 +244,36 @@ compare_cursors(
WT_CURSOR *cursor2, const char *type2)
{
uint64_t key1, key2;
- char *val1, *val2;
- char buf[128];
+ char *val1, *val2, buf[128];
+ int ret;
+ ret = 0;
memset(buf, 0, 128);
if (cursor1->get_key(cursor1, &key1) != 0 ||
cursor2->get_key(cursor2, &key2) != 0)
return (log_print_err("Error getting keys", EINVAL, 1));
- if (key1 != key2) {
- printf("Key mismatch %" PRIu64 " from a %s table "
- "is not %" PRIu64 " from a %s table\n",
- key1, type1, key2, type2);
-
- return (ERR_KEY_MISMATCH);
- }
-
- /* Now check the values. */
if (cursor1->get_value(cursor1, &val1) != 0 ||
cursor2->get_value(cursor2, &val2) != 0)
return (log_print_err("Error getting values", EINVAL, 1));
if (g.logfp != NULL)
fprintf(g.logfp, "k1: %" PRIu64 " k2: %" PRIu64
- " val1: %s val2: %s \n",
- key1, key2, val1, val2);
- if (strlen(val1) != strlen(val2) ||
- strcmp(val1, val2) != 0) {
- printf("Value mismatch for key %" PRIu64
- ", %s from a %s table is not %s from a %s table\n",
- key1, val1, type1, val2, type2);
- return (ERR_DATA_MISMATCH);
- }
+ " val1: %s val2: %s \n", key1, key2, val1, val2);
- return (0);
+ if (key1 != key2)
+ ret = ERR_KEY_MISMATCH;
+ else if (strlen(val1) != strlen(val2) || strcmp(val1, val2) != 0)
+ ret = ERR_DATA_MISMATCH;
+ else
+ return (0);
+
+ printf("Key/value mismatch: %" PRIu64 "/%s from a %s table is not %"
+ PRIu64 "/%s from a %s table\n",
+ key1, val1, type1, key2, val2, type2);
+
+ return (ret);
}
/*
@@ -349,10 +343,10 @@ diagnose_key_error(
return (1);
c->set_key(c, key1_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("1st cursor didn't find 1st key\n", ret, 0);
+ (void)log_print_err("1st cursor didn't find 1st key", ret, 0);
c->set_key(c, key2_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("1st cursor didn't find 2nd key\n", ret, 0);
+ (void)log_print_err("1st cursor didn't find 2nd key", ret, 0);
if (c->close(c) != 0)
return (1);
@@ -361,10 +355,10 @@ diagnose_key_error(
return (1);
c->set_key(c, key1_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("2nd cursor didn't find 1st key\n", ret, 0);
+ (void)log_print_err("2nd cursor didn't find 1st key", ret, 0);
c->set_key(c, key2_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("2nd cursor didn't find 2nd key\n", ret, 0);
+ (void)log_print_err("2nd cursor didn't find 2nd key", ret, 0);
if (c->close(c) != 0)
return (1);
@@ -378,7 +372,7 @@ live_check:
return (1);
c->set_key(c, key1_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("1st cursor didn't find 1st key\n", ret, 0);
+ (void)log_print_err("1st cursor didn't find 1st key", ret, 0);
if (c->close(c) != 0)
return (1);
@@ -387,7 +381,7 @@ live_check:
return (1);
c->set_key(c, key2_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("2nd cursor didn't find 2nd key\n", ret, 0);
+ (void)log_print_err("2nd cursor didn't find 2nd key", ret, 0);
if (c->close(c) != 0)
return (1);
diff --git a/test/checkpoint/workers.c b/test/checkpoint/workers.c
index 5cd2ef4e97b..b8ca5a37d2b 100644
--- a/test/checkpoint/workers.c
+++ b/test/checkpoint/workers.c
@@ -44,8 +44,7 @@ create_table(WT_SESSION *session, COOKIE *cookie)
p = config;
end = config + sizeof(config);
p += snprintf(p, (size_t)(end - p),
- "key_format=%s,value_format=S",
- cookie->type == COL ? "r" : "q");
+ "key_format=%s,value_format=S", cookie->type == COL ? "r" : "q");
if (cookie->type == LSM)
(void)snprintf(p, (size_t)(end - p), ",type=lsm");
@@ -133,8 +132,7 @@ worker_op(WT_CURSOR *cursor, uint64_t keyno, u_int new_val)
char valuebuf[64];
cursor->set_key(cursor, keyno);
- (void)snprintf(
- valuebuf, sizeof(valuebuf), "%037u", new_val);
+ (void)snprintf(valuebuf, sizeof(valuebuf), "%037u", new_val);
cursor->set_value(cursor, valuebuf);
if ((ret = cursor->insert(cursor)) != 0) {
if (ret == WT_ROLLBACK)
diff --git a/test/format/backup.c b/test/format/backup.c
index 3b95ea92b5e..5805012e1e0 100644
--- a/test/format/backup.c
+++ b/test/format/backup.c
@@ -65,8 +65,7 @@ copy_file(const char *name)
int ret;
len = strlen(g.home) + strlen(g.home_backup) + strlen(name) * 2 + 20;
- if ((cmd = malloc(len)) == NULL)
- die(errno, "malloc");
+ cmd = dmalloc(len);
(void)snprintf(cmd, len,
"cp %s/%s %s/%s", g.home, name, g.home_backup, name);
if ((ret = system(cmd)) != 0)
diff --git a/test/format/bulk.c b/test/format/bulk.c
index 7cf4ba559dc..203043166a4 100644
--- a/test/format/bulk.c
+++ b/test/format/bulk.c
@@ -39,6 +39,7 @@ wts_load(void)
int is_bulk, ret;
conn = g.wts_conn;
+ keybuf = valbuf = NULL;
if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
die(ret, "connection.open_session");
diff --git a/test/format/config.c b/test/format/config.c
index 6e767a2c6a2..1f19ecf2cd2 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -36,6 +36,7 @@ static const char *config_file_type(u_int);
static CONFIG *config_find(const char *, size_t);
static int config_is_perm(const char *);
static void config_isolation(void);
+static void config_lrt(void);
static void config_map_checksum(const char *, u_int *);
static void config_map_compression(const char *, u_int *);
static void config_map_encryption(const char *, u_int *);
@@ -102,8 +103,7 @@ config_setup(void)
* our configuration, LSM or KVS devices are "tables", but files are
* tested as well.
*/
- if ((g.uri = malloc(256)) == NULL)
- die(errno, "malloc");
+ g.uri = dmalloc(256);
strcpy(g.uri, DATASOURCE("file") ? "file:" : "table:");
if (DATASOURCE("helium"))
strcat(g.uri, "dev1/");
@@ -135,12 +135,6 @@ config_setup(void)
if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
g.c_reverse = 0;
- config_checksum();
- config_compression("compression");
- config_compression("logging_compression");
- config_encryption();
- config_isolation();
-
/*
* Periodically, run single-threaded so we can compare the results to
* a Berkeley DB copy, as long as the thread-count isn't nailed down.
@@ -149,6 +143,13 @@ config_setup(void)
if (!g.replay && g.run_cnt % 20 == 19 && !config_is_perm("threads"))
g.c_threads = 1;
+ config_checksum();
+ config_compression("compression");
+ config_compression("logging_compression");
+ config_encryption();
+ config_isolation();
+ config_lrt();
+
/*
* Periodically, set the delete percentage to 0 so salvage gets run,
* as long as the delete percentage isn't nailed down.
@@ -329,6 +330,26 @@ config_isolation(void)
}
/*
+ * config_lrt --
+ * Long-running transaction configuration.
+ */
+static void
+config_lrt(void)
+{
+ /*
+ * The underlying engine doesn't support a lookaside file for
+ * fixed-length column stores.
+ */
+ if (g.type == FIX) {
+ if (config_is_perm("long_running_txn"))
+ die(EINVAL,
+ "long_running_txn not supported with fixed-length "
+ "column store");
+ g.c_long_running_txn = 0;
+ }
+}
+
+/*
* config_error --
* Display configuration information on error.
*/
diff --git a/test/format/format.h b/test/format/format.h
index 4ec2734aee9..d82dea5451f 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -310,6 +310,8 @@ void config_file(const char *);
void config_print(int);
void config_setup(void);
void config_single(const char *, int);
+void *dmalloc(size_t);
+char *dstrdup(const char *);
void fclose_and_clear(FILE **);
void key_gen(uint8_t *, size_t *, uint64_t);
void key_gen_insert(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t);
@@ -317,6 +319,7 @@ void key_gen_setup(uint8_t **);
void key_len_setup(void);
void *lrt(void *);
void path_setup(const char *);
+int read_row(WT_CURSOR *, WT_ITEM *, uint64_t, int);
uint32_t rng(WT_RAND_STATE *);
void track(const char *, uint64_t, TINFO *);
void val_gen(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t);
diff --git a/test/format/lrt.c b/test/format/lrt.c
index a00a4e07879..85b6e29f224 100644
--- a/test/format/lrt.c
+++ b/test/format/lrt.c
@@ -37,33 +37,120 @@ lrt(void *arg)
{
WT_CONNECTION *conn;
WT_CURSOR *cursor;
+ WT_ITEM key, value;
WT_SESSION *session;
+ size_t buf_len, buf_size;
+ uint64_t keyno, saved_keyno;
u_int period;
int pinned, ret;
+ uint8_t bitfield, *keybuf;
+ void *buf;
- (void)(arg);
+ (void)(arg); /* Unused parameter */
+
+ saved_keyno = 0; /* [-Werror=maybe-uninitialized] */
+
+ key_gen_setup(&keybuf);
+ memset(&key, 0, sizeof(key));
+ key.data = keybuf;
+ memset(&value, 0, sizeof(value));
+
+ buf = NULL;
+ buf_len = buf_size = 0;
/* Open a session and cursor. */
conn = g.wts_conn;
- if ((ret = conn->open_session(
- conn, NULL, "isolation=snapshot", &session)) != 0)
+ if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
die(ret, "connection.open_session");
if ((ret = session->open_cursor(
session, g.uri, NULL, NULL, &cursor)) != 0)
die(ret, "session.open_cursor");
for (pinned = 0;;) {
- /*
- * If we have an open cursor, reset it, releasing our pin, else
- * position the cursor, creating a snapshot.
- */
if (pinned) {
+ /* Re-read the record at the end of the table. */
+ while ((ret = read_row(cursor,
+ &key, saved_keyno, 1)) == WT_ROLLBACK)
+ ;
+ if (ret != 0)
+ die(ret, "read_row %" PRIu64, saved_keyno);
+
+ /* Compare the previous value with the current one. */
+ if (g.type == FIX) {
+ ret = cursor->get_value(cursor, &bitfield);
+ value.data = &bitfield;
+ value.size = 1;
+ } else
+ ret = cursor->get_value(cursor, &value);
+ if (ret != 0)
+ die(ret,
+ "cursor.get_value: %" PRIu64, saved_keyno);
+
+ if (buf_size != value.size ||
+ memcmp(buf, value.data, value.size) != 0)
+ die(0, "mismatched start/stop values");
+
+ /* End the transaction. */
+ if ((ret =
+ session->commit_transaction(session, NULL)) != 0)
+ die(ret, "session.commit_transaction");
+
+ /* Reset the cursor, releasing our pin. */
if ((ret = cursor->reset(cursor)) != 0)
die(ret, "cursor.reset");
pinned = 0;
} else {
- if ((ret = cursor->next(cursor)) != 0)
- die(ret, "cursor.reset");
+ /*
+ * Begin transaction: without an explicit transaction,
+ * the snapshot is only kept around while a cursor is
+ * positioned. As soon as the cursor loses its position
+ * a new snapshot will be allocated.
+ */
+ if ((ret = session->begin_transaction(
+ session, "isolation=snapshot")) != 0)
+ die(ret, "session.begin_transaction");
+
+ /* Read a record at the end of the table. */
+ do {
+ saved_keyno = mmrand(NULL,
+ (u_int)(g.key_cnt - g.key_cnt / 10),
+ (u_int)g.key_cnt);
+ while ((ret = read_row(cursor,
+ &key, saved_keyno, 1)) == WT_ROLLBACK)
+ ;
+ } while (ret == WT_NOTFOUND);
+ if (ret != 0)
+ die(ret, "read_row %" PRIu64, saved_keyno);
+
+ /* Copy the cursor's value. */
+ if (g.type == FIX) {
+ ret = cursor->get_value(cursor, &bitfield);
+ value.data = &bitfield;
+ value.size = 1;
+ } else
+ ret = cursor->get_value(cursor, &value);
+ if (ret != 0)
+ die(ret,
+ "cursor.get_value: %" PRIu64, saved_keyno);
+ if (buf_len < value.size &&
+ (buf = realloc(buf, buf_len = value.size)) == NULL)
+ die(errno, "realloc");
+ memcpy(buf, value.data, buf_size = value.size);
+
+ /*
+ * Move the cursor to an early record in the table,
+ * hopefully allowing the page with the record just
+ * retrieved to be evicted from memory.
+ */
+ do {
+ keyno = mmrand(NULL, 1, (u_int)g.key_cnt / 5);
+ while ((ret = read_row(cursor,
+ &key, keyno, 1)) == WT_ROLLBACK)
+ ;
+ } while (ret == WT_NOTFOUND);
+ if (ret != 0)
+ die(ret, "read_row %" PRIu64, keyno);
+
pinned = 1;
}
@@ -82,5 +169,8 @@ lrt(void *arg)
if ((ret = session->close(session, NULL)) != 0)
die(ret, "session.close");
+ free(keybuf);
+ free(buf);
+
return (NULL);
}
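
The rewritten lrt() thread pins a snapshot with an explicit transaction, saves a value from the end of the table, forces reads elsewhere to encourage eviction, and then re-reads the saved key expecting an identical value. A condensed sketch of that read-stability check against the public WiredTiger API; it assumes a populated table at "uri" with string keys and values, and trims error handling so the shape of the test stays visible:

#include <stdio.h>
#include <string.h>
#include <wiredtiger.h>

static int
check_snapshot_stability(WT_CONNECTION *conn, const char *uri, const char *key)
{
	WT_CURSOR *cursor;
	WT_SESSION *session;
	const char *val;
	char saved[256];
	int ok;

	(void)conn->open_session(conn, NULL, NULL, &session);
	(void)session->open_cursor(session, uri, NULL, NULL, &cursor);

	(void)session->begin_transaction(session, "isolation=snapshot");
	cursor->set_key(cursor, key);
	(void)cursor->search(cursor);
	(void)cursor->get_value(cursor, &val);
	(void)snprintf(saved, sizeof(saved), "%s", val);

	/* ... other threads update the table in the meantime ... */

	cursor->set_key(cursor, key);
	(void)cursor->search(cursor);
	(void)cursor->get_value(cursor, &val);
	ok = strcmp(saved, val) == 0;	/* Must match under snapshot */

	(void)session->commit_transaction(session, NULL);
	(void)session->close(session, NULL);
	return (ok ? 0 : -1);
}
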
diff --git a/test/format/ops.c b/test/format/ops.c
index 7d3b22175ca..7c38aec4757 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -33,7 +33,6 @@ static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
static int col_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
static int nextprev(WT_CURSOR *, int, int *);
static void *ops(void *);
-static int read_row(WT_CURSOR *, WT_ITEM *, uint64_t);
static int row_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
static int row_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
@@ -64,6 +63,7 @@ wts_ops(int lastrun)
session = NULL; /* -Wconditional-uninitialized */
memset(&backup_tid, 0, sizeof(backup_tid));
memset(&compact_tid, 0, sizeof(compact_tid));
+ memset(&lrt_tid, 0, sizeof(lrt_tid));
/*
* There are two mechanisms to specify the length of the run, a number
@@ -239,13 +239,13 @@ ops(void *arg)
tinfo = arg;
- /* Initialize the per-thread random number generator. */
- __wt_random_init(&tinfo->rnd);
-
conn = g.wts_conn;
keybuf = valbuf = NULL;
readonly = 0; /* -Wconditional-uninitialized */
+ /* Initialize the per-thread random number generator. */
+ __wt_random_init(&tinfo->rnd);
+
/* Set up the default key and value buffers. */
key_gen_setup(&keybuf);
val_gen_setup(&tinfo->rnd, &valbuf);
@@ -475,7 +475,7 @@ skip_insert: if (col_update(tinfo,
}
} else {
++tinfo->search;
- if (read_row(cursor, &key, keyno))
+ if (read_row(cursor, &key, keyno, 0))
if (intxn)
goto deadlock;
continue;
@@ -498,7 +498,7 @@ skip_insert: if (col_update(tinfo,
/* Read to confirm the operation. */
++tinfo->search;
- if (read_row(cursor, &key, keyno))
+ if (read_row(cursor, &key, keyno, 0))
goto deadlock;
/* Reset the cursor: there is no reason to keep pages pinned. */
@@ -583,7 +583,7 @@ wts_read_scan(void)
}
key.data = keybuf;
- if ((ret = read_row(cursor, &key, cnt)) != 0)
+ if ((ret = read_row(cursor, &key, cnt, 0)) != 0)
die(ret, "read_scan");
}
@@ -597,8 +597,8 @@ wts_read_scan(void)
* read_row --
* Read and verify a single element in a row- or column-store file.
*/
-static int
-read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
+int
+read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
{
static int sn = 0;
WT_ITEM value;
@@ -634,19 +634,24 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
ret = cursor->search(cursor);
sn = 1;
}
- if (ret == 0) {
+ switch (ret) {
+ case 0:
if (g.type == FIX) {
ret = cursor->get_value(cursor, &bitfield);
value.data = &bitfield;
value.size = 1;
- } else {
+ } else
ret = cursor->get_value(cursor, &value);
- }
- }
- if (ret == WT_ROLLBACK)
+ break;
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ case WT_NOTFOUND:
+ if (notfound_err)
+ return (WT_NOTFOUND);
+ break;
+ default:
die(ret, "read_row: read row %" PRIu64, keyno);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
diff --git a/test/format/smoke.sh b/test/format/smoke.sh
index 8b4b5d9e424..5fbc349f242 100755
--- a/test/format/smoke.sh
+++ b/test/format/smoke.sh
@@ -3,7 +3,7 @@
set -e
# Smoke-test format as part of running "make check".
-args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none"
+args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none logging_compression=none"
$TEST_WRAPPER ./t $args file_type=fix
$TEST_WRAPPER ./t $args file_type=row
diff --git a/test/format/t.c b/test/format/t.c
index 8e8a627235f..603706e0ba1 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -40,7 +40,7 @@ int
main(int argc, char *argv[])
{
time_t start;
- int ch, reps, ret;
+ int ch, i, onerun, reps, ret;
const char *config, *home;
config = NULL;
@@ -64,11 +64,12 @@ main(int argc, char *argv[])
/* Set values from the command line. */
home = NULL;
+ onerun = 0;
while ((ch = __wt_getopt(
g.progname, argc, argv, "1C:c:H:h:Llqrt:")) != EOF)
switch (ch) {
case '1': /* One run */
- g.c_runs = 1;
+ onerun = 1;
break;
case 'C': /* wiredtiger_open config */
g.config_open = __wt_optarg;
@@ -105,8 +106,14 @@ main(int argc, char *argv[])
argc -= __wt_optind;
argv += __wt_optind;
- /* Initialize the global random number generator. */
+ /*
+ * Initialize the global RNG. Start with the standard seeds, and then
+ * use seconds since the Epoch modulo a prime to run the RNG for some
+ * number of steps, so we don't start with the same values every time.
+ */
__wt_random_init(&g.rnd);
+ for (i = (int)time(NULL) % 10007; i > 0; --i)
+ (void)__wt_random(&g.rnd);
/* Set up paths. */
path_setup(home);
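
The change above keeps the standard deterministic seed but advances the generator a time-dependent number of steps (seconds since the Epoch modulo the prime 10007), so runs differ without giving up a reproducible generator. The same idea with a self-contained toy generator, shown purely for illustration:

#include <stdint.h>
#include <time.h>

/* Any deterministic PRNG works; Marsaglia's xorshift64 shown. */
static uint64_t
xorshift64(uint64_t *state)
{
	uint64_t x = *state;

	x ^= x << 13;
	x ^= x >> 7;
	x ^= x << 17;
	return (*state = x);
}

int
main(void)
{
	uint64_t state = 88172645463325252ULL;	/* Fixed, documented seed */
	int i;

	/* Step a time-dependent distance so runs start in different places. */
	for (i = (int)(time(NULL) % 10007); i > 0; --i)
		(void)xorshift64(&state);
	return (0);
}
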
@@ -155,6 +162,13 @@ main(int argc, char *argv[])
g.c_runs = 1;
/*
+ * Let the command line -1 flag override runs configured from other
+ * sources.
+ */
+ if (onerun)
+ g.c_runs = 1;
+
+ /*
* Initialize locks to single-thread named checkpoints and backups,
* last-record updates, and failures.
*/
@@ -298,6 +312,11 @@ die(int e, const char *fmt, ...)
/* Single-thread error handling. */
(void)pthread_rwlock_wrlock(&g.death_lock);
+ /* Try and turn off tracking so it doesn't obscure the error message. */
+ if (g.track) {
+ g.track = 0;
+ fprintf(stderr, "\n");
+ }
if (fmt != NULL) { /* Death message. */
fprintf(stderr, "%s: ", g.progname);
va_start(ap, fmt);
diff --git a/test/format/util.c b/test/format/util.c
index 9d28b7a81bc..0f4f5de7c20 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -78,8 +78,7 @@ key_gen_setup(uint8_t **keyp)
*keyp = NULL;
len = MAX(KILOBYTE(100), g.c_key_max);
- if ((key = malloc(len)) == NULL)
- die(errno, "malloc");
+ key = dmalloc(len);
for (i = 0; i < len; ++i)
key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]);
*keyp = key;
@@ -139,8 +138,7 @@ val_gen_setup(WT_RAND_STATE *rnd, uint8_t **valp)
* data for column-store run-length encoded files.
*/
len = MAX(KILOBYTE(100), g.c_value_max) + 20;
- if ((val = malloc(len)) == NULL)
- die(errno, "malloc");
+ val = dmalloc(len);
for (i = 0; i < len; ++i)
val[i] = (uint8_t)("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]);
@@ -257,43 +255,36 @@ path_setup(const char *home)
size_t len;
/* Home directory. */
- if ((g.home = strdup(home == NULL ? "RUNDIR" : home)) == NULL)
- die(errno, "malloc");
+ g.home = dstrdup(home == NULL ? "RUNDIR" : home);
/* Log file. */
len = strlen(g.home) + strlen("log") + 2;
- if ((g.home_log = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_log = dmalloc(len);
snprintf(g.home_log, len, "%s/%s", g.home, "log");
/* RNG log file. */
len = strlen(g.home) + strlen("rand") + 2;
- if ((g.home_rand = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_rand = dmalloc(len);
snprintf(g.home_rand, len, "%s/%s", g.home, "rand");
/* Run file. */
len = strlen(g.home) + strlen("CONFIG") + 2;
- if ((g.home_config = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_config = dmalloc(len);
snprintf(g.home_config, len, "%s/%s", g.home, "CONFIG");
/* Statistics file. */
len = strlen(g.home) + strlen("stats") + 2;
- if ((g.home_stats = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_stats = dmalloc(len);
snprintf(g.home_stats, len, "%s/%s", g.home, "stats");
/* Backup directory. */
len = strlen(g.home) + strlen("BACKUP") + 2;
- if ((g.home_backup = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_backup = dmalloc(len);
snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP");
/* BDB directory. */
len = strlen(g.home) + strlen("bdb") + 2;
- if ((g.home_bdb = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_bdb = dmalloc(len);
snprintf(g.home_bdb, len, "%s/%s", g.home, "bdb");
/*
@@ -315,8 +306,7 @@ path_setup(const char *home)
"mkdir KVS"
#endif
len = strlen(g.home) * 3 + strlen(CMD) + 1;
- if ((g.home_init = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_init = dmalloc(len);
snprintf(g.home_init, len, CMD, g.home, g.home, g.home);
/* Backup directory initialize command, remove and re-create it. */
@@ -327,8 +317,7 @@ path_setup(const char *home)
#define CMD "rm -rf %s && mkdir %s"
#endif
len = strlen(g.home_backup) * 2 + strlen(CMD) + 1;
- if ((g.home_backup_init = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_backup_init = dmalloc(len);
snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup);
/*
@@ -351,8 +340,7 @@ path_setup(const char *home)
"cp WiredTiger* wt* slvg.copy/"
#endif
len = strlen(g.home) + strlen(CMD) + 1;
- if ((g.home_salvage_copy = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_salvage_copy = dmalloc(len);
snprintf(g.home_salvage_copy, len, CMD, g.home);
}
@@ -422,3 +410,31 @@ fclose_and_clear(FILE **fpp)
die(errno, "fclose");
return;
}
+
+/*
+ * dmalloc --
+ * Call malloc, dying on failure.
+ */
+void *
+dmalloc(size_t len)
+{
+ void *p;
+
+ if ((p = malloc(len)) == NULL)
+ die(errno, "malloc");
+ return (p);
+}
+
+/*
+ * dstrdup --
+ * Call strdup, dying on failure.
+ */
+char *
+dstrdup(const char *str)
+{
+ char *p;
+
+ if ((p = strdup(str)) == NULL)
+ die(errno, "strdup");
+ return (p);
+}
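
dmalloc and dstrdup wrap their libc counterparts and die on failure, letting call sites drop their error checks. The lrt.c change above still checks realloc by hand; a companion helper in the same style might look like this (hypothetical, not part of this change; it relies on the test's existing die()):

/*
 * drealloc --
 *	Call realloc, dying on failure.
 */
void *
drealloc(void *p, size_t len)
{
	void *t;

	if ((t = realloc(p, len)) == NULL)
		die(errno, "realloc");
	return (t);
}
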
diff --git a/test/format/wts.c b/test/format/wts.c
index 3d3b59810e8..23823c20184 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -462,8 +462,7 @@ wts_dump(const char *tag, int dump_bdb)
track("dump files and compare", 0ULL, NULL);
len = strlen(g.home) + strlen(BERKELEY_DB_PATH) + strlen(g.uri) + 100;
- if ((cmd = malloc(len)) == NULL)
- die(errno, "malloc");
+ cmd = dmalloc(len);
(void)snprintf(cmd, len,
"sh s_dumpcmp -h %s %s %s %s %s %s",
g.home,
@@ -564,9 +563,7 @@ wts_stats(void)
/* Data source statistics. */
fprintf(fp, "\n\n====== Data source statistics:\n");
- if ((stat_name =
- malloc(strlen("statistics:") + strlen(g.uri) + 1)) == NULL)
- die(errno, "malloc");
+ stat_name = dmalloc(strlen("statistics:") + strlen(g.uri) + 1);
sprintf(stat_name, "statistics:%s", g.uri);
if ((ret = session->open_cursor(
session, stat_name, NULL, NULL, &cursor)) != 0)
diff --git a/test/suite/run.py b/test/suite/run.py
index 1cb7309cb53..5e7b76a79b9 100644
--- a/test/suite/run.py
+++ b/test/suite/run.py
@@ -312,7 +312,7 @@ if __name__ == '__main__':
else:
for arg in testargs:
testsFromArg(tests, loader, arg)
-
+
if debug:
import pdb
pdb.set_trace()
diff --git a/test/suite/test_async01.py b/test/suite/test_async01.py
index af5180192af..fee5e8232f1 100644
--- a/test/suite/test_async01.py
+++ b/test/suite/test_async01.py
@@ -51,7 +51,7 @@ class Callback(wiredtiger.AsyncCallback):
def notify_error(self, key, value, optype, desc):
tty_pr('ERROR: notify(' + str(key) + ',' + str(value) + ',' +
str(optype) + '): ' + desc)
-
+
def notify(self, op, op_ret, flags):
# Note: we are careful not to throw any errors here. Any
diff --git a/test/suite/test_async02.py b/test/suite/test_async02.py
index 21d811989c8..c878e8dd114 100644
--- a/test/suite/test_async02.py
+++ b/test/suite/test_async02.py
@@ -51,7 +51,7 @@ class Callback(wiredtiger.AsyncCallback):
def notify_error(self, key, value, optype, exp, desc):
tty_pr('ERROR: notify(' + str(key) + ',' + str(value) + ',' +
str(optype) + '): ' + 'Expected: ' + str(exp) + ' ' + desc)
-
+
def notify(self, op, op_ret, flags):
# Note: we are careful not to throw any errors here. Any
diff --git a/test/suite/test_autoclose.py b/test/suite/test_autoclose.py
index 40106e6f97d..6dc71003a34 100644
--- a/test/suite/test_autoclose.py
+++ b/test/suite/test_autoclose.py
@@ -156,7 +156,7 @@ class test_autoclose(wttest.WiredTigerTestCase):
self.assertRaisesHavingMessage(exceptions.RuntimeError,
lambda: self.create_table(),
'/wt_session.* is None/')
-
+
def test_close_connection1(self):
"""
Use a connection handle after it is closed.
@@ -166,6 +166,6 @@ class test_autoclose(wttest.WiredTigerTestCase):
self.assertRaisesHavingMessage(exceptions.RuntimeError,
lambda: conn.open_session(None),
'/wt_connection.* is None/')
-
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_backup04.py b/test/suite/test_backup04.py
index 47e656cf9b1..a0a52f49817 100644
--- a/test/suite/test_backup04.py
+++ b/test/suite/test_backup04.py
@@ -83,7 +83,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess):
# Compare the original and backed-up files using the wt dump command.
def compare(self, uri, dir_full, dir_incr):
- # print "Compare: full URI: " + uri + " with incremental URI "
+ # print "Compare: full URI: " + uri + " with incremental URI "
if dir_full == None:
full_name='original'
else:
diff --git a/test/suite/test_backup05.py b/test/suite/test_backup05.py
index 80706b20299..8ab329f761a 100644
--- a/test/suite/test_backup05.py
+++ b/test/suite/test_backup05.py
@@ -71,7 +71,7 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess):
session = self.setUpSessionOpen(conn)
session.verify(self.uri)
conn.close()
-
+
def test_backup(self):
'''Check manual fsyncLock backup strategy'''
diff --git a/test/suite/test_base05.py b/test/suite/test_base05.py
index 399cba07164..7d5ff59b2c9 100644
--- a/test/suite/test_base05.py
+++ b/test/suite/test_base05.py
@@ -154,7 +154,7 @@ class test_base05(wttest.WiredTigerTestCase):
choice = (n + i) % len(reflist)
result += reflist[choice]
return result + ':' + str(n)
-
+
def test_table_ss(self):
"""
Create entries, and read back in a cursor: key=string, value=string
@@ -196,7 +196,7 @@ class test_base05(wttest.WiredTigerTestCase):
def do_test_table_base(self, convert):
"""
- Base functionality that uses regular strings with
+ Base functionality that uses regular strings with
non-ASCII (UTF) chars and optionally converts them to
Unicode (considered a type separate from string in Python).
"""
diff --git a/test/suite/test_baseconfig.py b/test/suite/test_baseconfig.py
new file mode 100644
index 00000000000..6ac3654af11
--- /dev/null
+++ b/test/suite/test_baseconfig.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import wiredtiger, wttest
+
+# test_baseconfig
+# Test that the base configuration file can be ignored.
+class test_baseconfig(wttest.WiredTigerTestCase):
+ def test_baseconfig(self):
+ # Open up another database and modify the baseconfig
+ os.mkdir("A")
+ conn = wiredtiger.wiredtiger_open("A", 'create')
+ self.assertTrue(os.path.exists("A/WiredTiger.basecfg"))
+ with open("A/WiredTiger.basecfg", "a") as basecfg_file:
+ basecfg_file.write("foo!")
+ conn.close()
+
+ # Opening the database should now fail because the basecfg is invalid
+ self.assertRaisesWithMessage(
+ wiredtiger.WiredTigerError,
+ lambda: wiredtiger.wiredtiger_open("A", ''),
+ '/unknown configuration key/')
+
+ conn = wiredtiger.wiredtiger_open("A", "create,config_base=false")
+ conn.close()
+
+if __name__ == '__main__':
+ wttest.run()
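
The new test corrupts WiredTiger.basecfg, confirms a normal open now fails, and then reopens with config_base=false so the damaged file is ignored. The same recovery path through the C API, sketched below; the "A" home directory mirrors the test and error handling is minimal:

#include <stdio.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	int ret;

	/* A damaged WiredTiger.basecfg makes a normal open fail... */
	if ((ret = wiredtiger_open("A", NULL, "", &conn)) != 0)
		fprintf(stderr, "open: %s\n", wiredtiger_strerror(ret));

	/* ...but config_base=false ignores the base configuration file. */
	if ((ret = wiredtiger_open(
	    "A", NULL, "config_base=false", &conn)) == 0)
		(void)conn->close(conn, NULL);
	return (ret);
}
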
diff --git a/test/suite/test_bug005.py b/test/suite/test_bug005.py
index 961bb551b69..3e06bea8694 100644
--- a/test/suite/test_bug005.py
+++ b/test/suite/test_bug005.py
@@ -37,7 +37,7 @@ from helper import key_populate, value_populate
class test_bug005(wttest.WiredTigerTestCase):
# This is a btree layer test, test files, ignore tables.
uri = 'file:test_bug005'
-
+
def test_bug005(self):
# Create the object.
self.session.create(self.uri, 'value_format=S,key_format=S')
diff --git a/test/suite/test_bug008.py b/test/suite/test_bug008.py
index 0102cbd63f4..75cbd989cd1 100644
--- a/test/suite/test_bug008.py
+++ b/test/suite/test_bug008.py
@@ -48,7 +48,7 @@ class test_bug008(wttest.WiredTigerTestCase):
# Populate the tree and reopen the connection, forcing it to disk
# and moving the records to an on-page format.
- simple_populate(self, uri, self.fmt, 100)
+ simple_populate(self, uri, self.fmt, 100)
self.reopen_conn()
# Begin a transaction, and add some additional records.
@@ -105,7 +105,7 @@ class test_bug008(wttest.WiredTigerTestCase):
# Populate the tree and reopen the connection, forcing it to disk
# and moving the records to an on-page format.
- simple_populate(self, uri, self.fmt, 100)
+ simple_populate(self, uri, self.fmt, 100)
self.reopen_conn()
# Add some additional visible records.
diff --git a/test/suite/test_bug011.py b/test/suite/test_bug011.py
index d2c56adb221..b93fc3a75b7 100644
--- a/test/suite/test_bug011.py
+++ b/test/suite/test_bug011.py
@@ -64,7 +64,7 @@ class test_bug011(wttest.WiredTigerTestCase):
# Make sure we have a cursor for the table so it stays in cache.
for i in range(0, self.ntables):
this_uri = 'table:%s-%03d' % (self.table_name, i)
- cursors.append(self.session.open_cursor(this_uri, None))
+ cursors.append(self.session.open_cursor(this_uri, None))
# Make use of the cache.
for i in range(0, self.nops):
diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py
index aacc8f1f055..799e6ded1ea 100644
--- a/test/suite/test_checkpoint01.py
+++ b/test/suite/test_checkpoint01.py
@@ -70,7 +70,7 @@ class test_checkpoint(wttest.WiredTigerTestCase):
for checkpoint_name, entry in self.checkpoints.iteritems():
self.add_records(checkpoint_name)
self.session.checkpoint("name=" + checkpoint_name)
-
+
# Create a dictionary of sorted records a checkpoint should include.
def list_expected(self, name):
records = {}
diff --git a/test/suite/test_cursor01.py b/test/suite/test_cursor01.py
index 507036e85cf..47cc7f6c5b7 100644
--- a/test/suite/test_cursor01.py
+++ b/test/suite/test_cursor01.py
@@ -167,7 +167,7 @@ class test_cursor01(wttest.WiredTigerTestCase):
def backward_iter(self, cursor):
cursor.reset()
self.assertCursorHasNoKeyValue(cursor)
-
+
i = self.nentries - 1
while True:
prevret = cursor.prev()
@@ -188,7 +188,7 @@ class test_cursor01(wttest.WiredTigerTestCase):
def backward_iter_with_dup(self, cursor):
cursor.reset()
self.assertCursorHasNoKeyValue(cursor)
-
+
i = self.nentries - 1
while True:
prevret = cursor.prev()
diff --git a/test/suite/test_cursor04.py b/test/suite/test_cursor04.py
index 50cde0023d8..08f1a7240a5 100644
--- a/test/suite/test_cursor04.py
+++ b/test/suite/test_cursor04.py
@@ -113,7 +113,7 @@ class test_cursor04(wttest.WiredTigerTestCase):
self.assertEqual(direction, 0)
self.assertEqual(cursor.get_key(), origkey)
self.assertEqual(cursor.get_value(), 0)
-
+
def test_searches(self):
"""
Create entries, and read back in a cursor: key=string, value=string
@@ -174,7 +174,7 @@ class test_cursor04(wttest.WiredTigerTestCase):
self.assertEqual(cmp, 0)
self.assertEqual(cursor.get_key(), self.genkey(0))
self.assertEqual(cursor.get_value(), 0)
-
+
cursor.set_key(self.genkey(5))
self.expect_either(cursor, 4, 6)
diff --git a/test/suite/test_cursor06.py b/test/suite/test_cursor06.py
index 28ac581cf66..c11d043a548 100644
--- a/test/suite/test_cursor06.py
+++ b/test/suite/test_cursor06.py
@@ -58,7 +58,7 @@ class test_cursor06(wttest.WiredTigerTestCase):
cursor.set_value(v[0], v[1], v[2], v[3])
else:
cursor.set_value(value_populate(cursor, 10))
-
+
def test_reconfigure_overwrite(self):
uri = self.type + self.name
for open_config in (None, "overwrite=0", "overwrite=1"):
@@ -77,7 +77,7 @@ class test_cursor06(wttest.WiredTigerTestCase):
self.set_kv(cursor)
cursor.insert()
cursor.close()
-
+
def test_reconfigure_readonly(self):
uri = self.type + self.name
for open_config in (None, "readonly=0", "readonly=1"):
diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py
index be08c59210f..10a3140a2fd 100644
--- a/test/suite/test_cursor_random.py
+++ b/test/suite/test_cursor_random.py
@@ -92,7 +92,7 @@ class test_cursor_random(wttest.WiredTigerTestCase):
# Check that next_random works in the presence of a larger set of values,
# where the values are in a disk format page.
- def test_cursor_random_multiple_page_records(self):
+ def cursor_random_multiple_page_records(self, reopen):
uri = self.type + 'random'
if self.type == 'file:':
simple_populate(self, uri,
@@ -103,10 +103,10 @@ class test_cursor_random(wttest.WiredTigerTestCase):
'allocation_size=512,leaf_page_max=512,key_format=' +\
self.fmt, 10000)
- # Close the connection so everything is forced to disk (otherwise the
- # values are on an insert list and the underlying engine doesn't make
- # random selections, it selects the middle of the list.
- self.reopen_conn()
+ # Optionally close the connection so everything is forced to disk,
+ # insert lists are an entirely different path in the code.
+ if reopen:
+ self.reopen_conn()
cursor = self.session.open_cursor(uri, None, "next_random=true")
last = ''
@@ -120,6 +120,10 @@ class test_cursor_random(wttest.WiredTigerTestCase):
self.assertLess(match, 5,
'next_random did not return random records, too many matches found')
+ def test_cursor_random_multiple_page_records_reopen(self):
+ self.cursor_random_multiple_page_records(1)
+ def test_cursor_random_multiple_page_records(self):
+ self.cursor_random_multiple_page_records(0)
# Check that opening a random cursor on column-store returns not-supported.
class test_cursor_random_column(wttest.WiredTigerTestCase):
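
next_random cursors hand back randomly chosen records, and the refactoring above now exercises them both against in-memory insert lists and against on-disk pages. Opening one through the C API looks roughly like this; the URI is an example:

#include <wiredtiger.h>

static int
read_random_record(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "file:random", NULL, "next_random=true", &cursor)) != 0)
		return (ret);
	ret = cursor->next(cursor);	/* Positions on a random record */
	(void)cursor->close(cursor);
	return (ret);
}
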
diff --git a/test/suite/test_cursor_tracker.py b/test/suite/test_cursor_tracker.py
index 1fa93f3e59b..742dea4c32b 100644
--- a/test/suite/test_cursor_tracker.py
+++ b/test/suite/test_cursor_tracker.py
@@ -461,7 +461,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase):
except:
v = '[invalid]'
print(prefix + k + ' ' + v)
-
+
def cur_check(self, cursor, got, want, iskey):
if got != want:
if iskey:
diff --git a/test/suite/test_durability01.py b/test/suite/test_durability01.py
index 716e38c17d4..8d00d05fa14 100644
--- a/test/suite/test_durability01.py
+++ b/test/suite/test_durability01.py
@@ -52,7 +52,7 @@ class test_durability01(wttest.WiredTigerTestCase, suite_subprocess):
session = self.setUpSessionOpen(conn)
session.verify(self.uri)
conn.close()
-
+
def test_durability(self):
'''Check for missing metadata checkpoints'''
diff --git a/test/suite/test_encrypt03.py b/test/suite/test_encrypt03.py
index 0e06d4491ca..0e19ad39263 100644
--- a/test/suite/test_encrypt03.py
+++ b/test/suite/test_encrypt03.py
@@ -86,7 +86,7 @@ class test_encrypt03(wttest.WiredTigerTestCase):
def test_encrypt(self):
params = 'key_format=S,value_format=S,encryption=(name='
if self.file_encrypt != None:
- params += self.file_encrypt
+ params += self.file_encrypt
if self.file_encrypt_args != None:
params += ',keyid=' + self.file_encrypt_args
params += ')'
diff --git a/test/suite/test_encrypt04.py b/test/suite/test_encrypt04.py
index ea9bcc5aacb..41fd0f6dd48 100644
--- a/test/suite/test_encrypt04.py
+++ b/test/suite/test_encrypt04.py
@@ -46,9 +46,15 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
# with simply the wrong keyid may appear valid when initially verified,
# but may result in error on first use. The odds that a real encryptor
# would leave a lot of its input unchanged are infinitesimally small.
+ #
+ # When both self.forceerror1 and self.forceerror2 occur, we set a config
+ # flag when loading the rotn encryptor, which forces a particular error
+ # return in rotn.decrypt. We look for that return back from
+ # wiredtiger_open.
encrypt_scen_1 = [
('none', dict( name1='none', keyid1='', secretkey1='')),
- ('rotn17abc', dict( name1='rotn', keyid1='17', secretkey1='ABC')),
+ ('rotn17abc', dict( name1='rotn', keyid1='17',
+ secretkey1='ABC', forceerror1=True)),
('rotn11abc', dict( name1='rotn', keyid1='11', secretkey1='ABC')),
('rotn11xyz', dict( name1='rotn', keyid1='11', secretkey1='XYZ')),
('rotn11xyz_and_clear', dict( name1='rotn', keyid1='11',
@@ -58,7 +64,8 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
('none', dict( name2='none', keyid2='', secretkey2='')),
('rotn17abc', dict( name2='rotn', keyid2='17', secretkey2='ABC')),
('rotn11abc', dict( name2='rotn', keyid2='11', secretkey2='ABC')),
- ('rotn11xyz', dict( name2='rotn', keyid2='11', secretkey2='XYZ')),
+ ('rotn11xyz', dict( name2='rotn', keyid2='11',
+ secretkey2='XYZ', forceerror2=True)),
('rotn11xyz_and_clear', dict( name2='rotn', keyid2='11',
secretkey2='XYZ', fileinclear2=True))
]
@@ -73,6 +80,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
# Override WiredTigerTestCase, we have extensions.
def setUpConnectionOpen(self, dir):
+ forceerror = None
if self.part == 1:
self.name = self.name1
self.keyid = self.keyid1
@@ -85,15 +93,28 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
self.secretkey = self.secretkey2
self.fileinclear = self.fileinclear2 if \
hasattr(self, 'fileinclear2') else False
+ if hasattr(self, 'forceerror1') and hasattr(self, 'forceerror2'):
+ forceerror = "rotn_force_error=true"
+ self.expect_forceerror = forceerror != None
+ self.got_forceerror = False
encarg = 'encryption=(name={0},keyid={1},secretkey={2}),'.format(
self.name, self.keyid, self.secretkey)
- extarg = self.extensionArg([('encryptors', self.name),
- ('encryptors', self.name)])
+ # If forceerror is set for this test, add a config arg to
+ # the extension string. That signals rotn to return a (-1000)
+ # error code, which we'll detect here.
+ extarg = self.extensionArg([('encryptors', self.name, forceerror)])
self.pr('encarg = ' + encarg + ' extarg = ' + extarg)
- conn = wiredtiger.wiredtiger_open(dir,
- 'create,error_prefix="{0}: ",{1}{2}'.format(
- self.shortid(), encarg, extarg))
+ completed = False
+ try:
+ conn = wiredtiger.wiredtiger_open(dir,
+ 'create,error_prefix="{0}: ",{1}{2}'.format(
+ self.shortid(), encarg, extarg))
+ except (BaseException) as err:
+ # Capture the recognizable error created by rotn
+ if str(-1000) in str(err):
+ self.got_forceerror = True
+ raise
self.pr(`conn`)
return conn
@@ -119,7 +140,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
def extensionArg(self, exts):
extfiles = []
for ext in exts:
- (dirname, name) = ext
+ (dirname, name, extarg) = ext
if name != None and name != 'none':
testdir = os.path.dirname(__file__)
extdir = os.path.join(run.wt_builddir, 'ext', dirname)
@@ -127,12 +148,16 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
extdir, name, '.libs', 'libwiredtiger_' + name + '.so')
if not os.path.exists(extfile):
self.skipTest('extension "' + extfile + '" not built')
+ extfile = '"' + extfile + '"'
if not extfile in extfiles:
- extfiles.append(extfile)
+ s = extfile
+ if extarg != None:
+ s += "=(config=\"" + extarg + "\")"
+ extfiles.append(s)
if len(extfiles) == 0:
return ''
else:
- return ',extensions=["' + '","'.join(extfiles) + '"]'
+ return ',extensions=[' + ','.join(extfiles) + ']'
# Evaluate expression, which either must succeed (if expect_okay)
# or must fail (if !expect_okay).
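
The reworked extensionArg emits an extensions list in which an individual entry can carry its own configuration, which is how the test passes rotn_force_error=true into the rotn encryptor. The equivalent wiredtiger_open configuration from C would be roughly as follows; the shared-library path is an example:

#include <wiredtiger.h>

/* Load the rotn extension with per-extension configuration. */
int
open_with_rotn(WT_CONNECTION **connp)
{
	return (wiredtiger_open(".", NULL,
	    "create,"
	    "extensions=[\"ext/encryptors/rotn/.libs/libwiredtiger_rotn.so\""
	    "=(config=\"rotn_force_error=true\")]",
	    connp));
}
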
@@ -204,7 +229,8 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
self.check_records(cursor, r, 0, self.nrecords)
self.check_records(cursor, r, self.nrecords, self.nrecords * 2)
cursor.close()
-
+ self.assertEqual(self.expect_forceerror, self.got_forceerror)
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_encrypt05.py b/test/suite/test_encrypt05.py
index f5db543ecf3..8a69e5f909f 100644
--- a/test/suite/test_encrypt05.py
+++ b/test/suite/test_encrypt05.py
@@ -93,7 +93,7 @@ class test_encrypt05(wttest.WiredTigerTestCase):
diff = n - len(self.bigvalue)
rchr = ''.join(chr(r.randint(1, 255)) for i in range(diff))
return self.bigvalue + rchr
-
+
# Create a table, add key/values with specific lengths, then verify them.
def test_encrypt(self):
params = 'key_format=S,value_format=S'
diff --git a/test/suite/test_encrypt06.py b/test/suite/test_encrypt06.py
index 21e4d50769c..5b2007fe6e7 100644
--- a/test/suite/test_encrypt06.py
+++ b/test/suite/test_encrypt06.py
@@ -211,7 +211,7 @@ class test_encrypt06(wttest.WiredTigerTestCase):
c0.close()
c1.close()
-
+
# Force everything to disk so we can examine it
self.close_conn()
@@ -222,7 +222,7 @@ class test_encrypt06(wttest.WiredTigerTestCase):
not self.match_string_in_rundir(txt0))
self.assertEqual(self.expected_encryption(self.encrypt1),
not self.match_string_in_rundir(txt1))
-
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py
index 0c6b38db3ef..790f651fd2f 100644
--- a/test/suite/test_jsondump02.py
+++ b/test/suite/test_jsondump02.py
@@ -84,7 +84,7 @@ class test_jsondump02(wttest.WiredTigerTestCase):
cursor[insert[0]] = insert[1]
finally:
cursor.close()
-
+
# Create JSON cursors and test them directly.
def test_json_cursor(self):
"""
@@ -140,50 +140,50 @@ class test_jsondump02(wttest.WiredTigerTestCase):
# bad tokens
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('<>abc?', '9'),)),
'/unknown token/')
# bad tokens
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"abc\u"', ''),)),
'/invalid Unicode/')
# bad tokens
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"abc', ''),)),
'/unterminated string/')
# bad syntax
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"stuff" "jibberish"', '"value0" "more jibberish"'),)),
'/expected key name.*\"key0\"/')
# bad types
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : "xyz",\n"value1" : "str0"'),)),
'/expected unsigned JSON <int>, got <string>/')
# bad types
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : 123,\n"value1" : 456'),)),
'/expected JSON <string>, got <integer>/')
# extra stuff
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"',
'"value0" : 123,\n"value1" : "str0",'),)),
'/expected JSON <EOF>, got \',\'/')
# fields out of order currently not supported
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value1" : "str0",\n"value0" : 123'),)),
'/expected value name.*\"value0\"/')
@@ -192,17 +192,17 @@ class test_jsondump02(wttest.WiredTigerTestCase):
'\\u', '\\ux', '\\u0', '\\u0F', '\\u0FA', '\\u0FAx', '\\u0FA\\x')
for uni in invalid_unicode:
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : 123,\n"value1" : "'
+ uni + '"'),)),
'/invalid Unicode/')
# this one should work
- self.load_json(self.table_uri2,
+ self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : 345,\n"value1" : "str2"'),))
# extraneous/missing space is okay
- self.load_json(self.table_uri2,
+ self.load_json(self.table_uri2,
((' "key0"\n:\t"KEY003" ',
'"value0":456,"value1"\n\n\r\n:\t\n"str3"'),))
diff --git a/test/suite/test_metadata_cursor01.py b/test/suite/test_metadata_cursor01.py
index 35fd1a74354..706b8a4132a 100644
--- a/test/suite/test_metadata_cursor01.py
+++ b/test/suite/test_metadata_cursor01.py
@@ -107,7 +107,7 @@ class test_metadata_cursor01(wttest.WiredTigerTestCase):
self.create_table()
cursor = self.session.open_cursor(self.metauri, None, None)
self.assertCursorHasNoKeyValue(cursor)
-
+
while True:
prevret = cursor.prev()
if prevret != 0:
@@ -124,7 +124,7 @@ class test_metadata_cursor01(wttest.WiredTigerTestCase):
self.create_table()
cursor = self.session.open_cursor(self.metauri, None, None)
self.assertCursorHasNoKeyValue(cursor)
-
+
# Ensure the 'special' entry for the metadata itself is found.
value = cursor['metadata:']
self.assertTrue(value.find('key_format') != -1)
diff --git a/test/suite/test_pack.py b/test/suite/test_pack.py
index c9d360c2dcd..451c6fbb9a9 100644
--- a/test/suite/test_pack.py
+++ b/test/suite/test_pack.py
@@ -43,7 +43,7 @@ class test_pack(wttest.WiredTigerTestCase):
y = cursor.get_value()
self.tty(' ' + name + ': ' + str(x) + ' => ' + str(y))
cursor.reset()
-
+
def check(self, fmt, *v):
v = list(v)
fmtname = re.sub('([A-Z])', r'_\1', fmt)
diff --git a/test/suite/test_priv01.py b/test/suite/test_priv01.py
index 9b6b494e76e..0602d24a2b2 100644
--- a/test/suite/test_priv01.py
+++ b/test/suite/test_priv01.py
@@ -131,7 +131,7 @@ class test_priv01(wttest.WiredTigerTestCase):
lambda: self.common_test(None, edir, None),
'/WIREDTIGER_HOME environment variable set but\
process lacks privileges to use that environment variable/')
-
+
def test_env_conf_priv(self):
edir = 'envdir'
os.mkdir(edir)
diff --git a/test/suite/test_schema02.py b/test/suite/test_schema02.py
index ab709a28211..0cbff4b5ae0 100644
--- a/test/suite/test_schema02.py
+++ b/test/suite/test_schema02.py
@@ -173,7 +173,7 @@ class test_schema02(wttest.WiredTigerTestCase):
cursor[(i, 'key' + str(i))] = \
('val' + str(square), square, 'val' + str(cube), cube)
cursor.close()
-
+
def check_entries(self):
cursor = self.session.open_cursor('table:main', None, None)
# spot check via search
diff --git a/test/suite/test_schema04.py b/test/suite/test_schema04.py
index 9ad01b0f285..a66e1ea2411 100644
--- a/test/suite/test_schema04.py
+++ b/test/suite/test_schema04.py
@@ -79,7 +79,7 @@ class test_schema04(wttest.WiredTigerTestCase):
(i*3)%100, (i*4)%100, (i*5)%100)
cursor.insert()
cursor.close()
-
+
def check_entries(self):
cursor = self.session.open_cursor('table:schema04', None, None)
icursor = []
diff --git a/test/suite/test_schema05.py b/test/suite/test_schema05.py
index c3919af0880..2a7bc042c80 100644
--- a/test/suite/test_schema05.py
+++ b/test/suite/test_schema05.py
@@ -120,7 +120,7 @@ class test_schema05(wttest.WiredTigerTestCase):
cursor[i] = ','.join([str((i*j)%100) for j in
range(0, self.nindices)])
cursor.close()
-
+
def check_entries(self):
cursor = self.session.open_cursor('table:schema05', None, None)
icursor = []
diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py
index f5e2aa96cbe..13422a75a61 100644
--- a/test/suite/test_sweep01.py
+++ b/test/suite/test_sweep01.py
@@ -42,7 +42,6 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
uri = 'table:' + tablebase
numfiles = 50
numkv = 1000
- ckpt = 5
types = [
('row', dict(tabletype='row',
@@ -65,7 +64,6 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
',create,error_prefix="%s: ",' % self.shortid() + \
'file_manager=(close_handle_minimum=0,' + \
'close_idle_time=6,close_scan_interval=2),' + \
- 'checkpoint=(wait=%d),' % self.ckpt + \
'statistics=(fast),'
# print "Creating conn at '%s' with config '%s'" % (dir, conn_params)
try:
@@ -93,12 +91,13 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
time.sleep(1)
stat_cursor = self.session.open_cursor('statistics:', None, None)
- close1 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ remove1 = stat_cursor[stat.conn.dh_sweep_remove][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
sclose1 = stat_cursor[stat.conn.dh_session_handles][2]
ssweep1 = stat_cursor[stat.conn.dh_session_sweeps][2]
- tod1 = stat_cursor[stat.conn.dh_conn_tod][2]
- ref1 = stat_cursor[stat.conn.dh_conn_ref][2]
+ tod1 = stat_cursor[stat.conn.dh_sweep_tod][2]
+ ref1 = stat_cursor[stat.conn.dh_sweep_ref][2]
nfile1 = stat_cursor[stat.conn.file_open][2]
stat_cursor.close()
@@ -116,10 +115,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
# checkpoint something to do. Make sure checkpoint doesn't adjust
# the time of death for inactive handles.
#
+ # Note that we do checkpoints inline because that has the side effect
+ # of sweeping the session cache, which will allow handles to be
+ # removed.
+ #
c = self.session.open_cursor(uri, None)
k = 0
sleep = 0
while sleep < 12:
+ self.session.checkpoint()
k = k+1
c[k] = 1
sleep += 2
@@ -127,13 +131,14 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
c.close()
stat_cursor = self.session.open_cursor('statistics:', None, None)
- close2 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close2 = stat_cursor[stat.conn.dh_sweep_close][2]
+ remove2 = stat_cursor[stat.conn.dh_sweep_remove][2]
+ sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
sclose2 = stat_cursor[stat.conn.dh_session_handles][2]
ssweep2 = stat_cursor[stat.conn.dh_session_sweeps][2]
nfile2 = stat_cursor[stat.conn.file_open][2]
- tod2 = stat_cursor[stat.conn.dh_conn_tod][2]
- ref2 = stat_cursor[stat.conn.dh_conn_ref][2]
+ tod2 = stat_cursor[stat.conn.dh_sweep_tod][2]
+ ref2 = stat_cursor[stat.conn.dh_sweep_ref][2]
stat_cursor.close()
# print "checkpoint: " + str(self.ckpt)
# print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
@@ -144,12 +149,13 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
# print "tod1: " + str(tod1) + " tod2: " + str(tod2)
# print "ref1: " + str(ref1) + " ref2: " + str(ref2)
- #
+ #
# The files are all closed. Check that sweep did its work even
# in the presence of recent checkpoints.
#
if (close1 >= close2):
print "XX: close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
@@ -157,8 +163,19 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
print "ref1: " + str(ref1) + " ref2: " + str(ref2)
print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
self.assertEqual(close1 < close2, True)
+ if (remove1 >= remove2):
+ print "close1: " + str(close1) + " close2: " + str(close2)
+ print "XX: remove1: " + str(remove1) + " remove2: " + str(remove2)
+ print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
+ print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
+ print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
+ print "tod1: " + str(tod1) + " tod2: " + str(tod2)
+ print "ref1: " + str(ref1) + " ref2: " + str(ref2)
+ print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
+ self.assertEqual(remove1 < remove2, True)
if (sweep1 >= sweep2):
print "close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "XX: sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
@@ -167,6 +184,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
self.assertEqual(sweep1 < sweep2, True)
if (nfile2 >= nfile1):
print "close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
@@ -174,17 +192,18 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
print "ref1: " + str(ref1) + " ref2: " + str(ref2)
print "XX: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
self.assertEqual(nfile2 < nfile1, True)
- # The only files that should be left is the metadata, the lock file
- # and the active file.
- if (nfile2 != 3):
+ # The only files that should be left are the metadata, the lookaside
+ # file, the lock file, and the active file.
+ if (nfile2 != 4):
print "close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
print "tod1: " + str(tod1) + " tod2: " + str(tod2)
print "ref1: " + str(ref1) + " ref2: " + str(ref2)
print "XX2: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
- self.assertEqual(nfile2 == 3, True)
+ self.assertEqual(nfile2 == 4, True)
if __name__ == '__main__':
wttest.run()
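
The hunks above track two behavior changes: the connection data-handle sweep statistics were renamed (dh_conn_* becomes dh_sweep_*, with a new dh_sweep_remove counter for handles removed by sweep), and the expected residual file count rises from 3 to 4 because the lookaside file now stays open alongside the metadata, lock, and active files. A minimal sketch of reading the renamed counters outside the test harness, assuming the standalone wiredtiger Python API:

import os
import wiredtiger
from wiredtiger import stat

# Assumes an empty WT_HOME directory; statistics=(fast) enables the
# counters the sweep tests read.
if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
conn = wiredtiger.wiredtiger_open('WT_HOME', 'create,statistics=(fast)')
session = conn.open_session()

stat_cursor = session.open_cursor('statistics:', None, None)
closed = stat_cursor[stat.conn.dh_sweep_close][2]    # handles closed by sweep
removed = stat_cursor[stat.conn.dh_sweep_remove][2]  # handles removed by sweep
sweeps = stat_cursor[stat.conn.dh_sweeps][2]         # sweep server passes
stat_cursor.close()
conn.close()
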
diff --git a/test/suite/test_sweep03.py b/test/suite/test_sweep03.py
index 4030e2fb715..684c87695c5 100644
--- a/test/suite/test_sweep03.py
+++ b/test/suite/test_sweep03.py
@@ -93,13 +93,13 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
time.sleep(5)
stat_cursor = self.session.open_cursor('statistics:', None, None)
- close1 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# The sweep server should have run, or the test isn't working.
self.assertGreater(sweep1, 0)
- # We expect nothing to have been closed, so dh_conn_handles should be 0
+ # We expect nothing to have been closed.
self.assertEqual(close1, 0)
def test_disable_idle_timeout_drop_force(self):
@@ -116,7 +116,7 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# We just filled the table, now check what the stats are
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# We force the drop in this case to confirm that the handle is closed
@@ -127,8 +127,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# Grab the stats post table drop to see things have decremented
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- close2 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close2 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# Make sure the sweep server is still working.
@@ -151,8 +151,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# We just filled the table, now check what the stats are
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- close1 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
self.session.drop(drop_uri, None)
@@ -162,8 +162,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# Grab the stats post table drop to see things have decremented
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- close2 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close2 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
self.assertGreater(sweep2, sweep1)
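
Both sweep tests rely on an aggressive file_manager configuration so that idle data handles are closed and swept within the tests' short sleep windows. A hedged sketch of opening a connection with the same settings test_sweep01 builds into conn_params (the timing values are the test's choices, not requirements):

import os
import wiredtiger

if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
# close_handle_minimum=0 lets sweep consider every handle; idle handles
# are closed after 6 seconds, with a scan every 2 seconds.
conn = wiredtiger.wiredtiger_open('WT_HOME',
    'create,statistics=(fast),'
    'file_manager=(close_handle_minimum=0,'
    'close_idle_time=6,close_scan_interval=2)')
conn.close()
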
diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py
index 5827a892654..83c10f41244 100644
--- a/test/suite/test_txn02.py
+++ b/test/suite/test_txn02.py
@@ -217,7 +217,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
for i, ot in enumerate(zip(ops, txns)):
ok, txn = ot
op, k = ok
-
+
# Close and reopen the connection and cursor.
if reopen == 'reopen':
self.reopen_conn()
diff --git a/test/suite/test_txn03.py b/test/suite/test_txn03.py
index 41e283a8050..e2efef1742e 100644
--- a/test/suite/test_txn03.py
+++ b/test/suite/test_txn03.py
@@ -39,7 +39,7 @@ class test_txn03(wttest.WiredTigerTestCase):
uri2 = 'table:' + tablename + "_2"
key_str = "TEST_KEY1"
data_str1 = "VAL"
- data_str2 = "TEST_VAL1"
+ data_str2 = "TEST_VAL1"
nentries = 1000
scenarios = check_scenarios([
diff --git a/test/suite/test_txn04.py b/test/suite/test_txn04.py
index d0a21f5ec9c..f9f660223da 100644
--- a/test/suite/test_txn04.py
+++ b/test/suite/test_txn04.py
@@ -72,7 +72,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.txn_sync = self.sync_list[
self.scenario_number % len(self.sync_list)]
self.backup_dir = os.path.join(self.home, "WT_BACKUP")
- # Set archive false on the home directory.
+ # Set archive false on the home directory.
conn_params = \
'log=(archive=false,enabled,file_max=%s),' % self.logmax + \
'create,error_prefix="%s: ",' % self.shortid() + \
@@ -158,7 +158,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.session.begin_transaction()
ok, txn = ot
op, k = ok
-
+
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
if op == 'insert' or op == 'update':
c[k] = i + 2
@@ -199,7 +199,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.hot_backup(self.uri, committed)
if txn == 'commit':
self.assertEqual(True, self.exception == 'true')
- else:
+ else:
self.assertEqual(True, self.exception == 'false')
if __name__ == '__main__':
diff --git a/test/suite/test_txn05.py b/test/suite/test_txn05.py
index 8a2f36fc910..d427b893b17 100644
--- a/test/suite/test_txn05.py
+++ b/test/suite/test_txn05.py
@@ -181,7 +181,7 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess):
ok, txn = ot
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
op, k = ok
-
+
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
if op == 'stop':
c.set_key(k)
diff --git a/test/suite/test_txn07.py b/test/suite/test_txn07.py
index 8e7119186f5..fa522582a8e 100644
--- a/test/suite/test_txn07.py
+++ b/test/suite/test_txn07.py
@@ -171,7 +171,7 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess):
ok, txn = ot
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
op, k = ok
-
+
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
if op == 'stop':
c.set_key(k)
diff --git a/test/suite/test_txn09.py b/test/suite/test_txn09.py
index 98229c52f2e..df085a75d67 100644
--- a/test/suite/test_txn09.py
+++ b/test/suite/test_txn09.py
@@ -139,7 +139,7 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess):
for i, ot in enumerate(zip(ops, txns)):
ok, txn = ot
op, k = ok
-
+
# Close and reopen the connection and cursor, toggling the log
self.log_enabled = not self.log_enabled
self.reopen_conn()
diff --git a/test/suite/test_txn10.py b/test/suite/test_txn10.py
index cee25562756..8810df46777 100644
--- a/test/suite/test_txn10.py
+++ b/test/suite/test_txn10.py
@@ -62,15 +62,15 @@ class test_txn10(wttest.WiredTigerTestCase, suite_subprocess):
self.close_conn()
self.conn = self.setUpConnectionOpen(newdir)
self.session = self.setUpSessionOpen(self.conn)
-
+
def test_recovery(self):
''' Check for bugs in file ID allocation. '''
# Here's the strategy:
- # - Create a table (t1).
- # - Do a clean restart.
- # - Create another table (t2).
- # - Insert data into t2.
+ # - Create a table (t1).
+ # - Do a clean restart.
+ # - Create another table (t2).
+ # - Insert data into t2.
# - Make recovery run.
#
# If we aren't tracking file IDs properly, it's possible that
diff --git a/test/suite/test_txn12.py b/test/suite/test_txn12.py
new file mode 100644
index 00000000000..0901811535e
--- /dev/null
+++ b/test/suite/test_txn12.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from suite_subprocess import suite_subprocess
+from wiredtiger import stat
+from wtscenario import multiply_scenarios, number_scenarios
+
+# test_txn12.py
+# Test of a commit following a failed operation in a read-only transaction.
+class test_txn12(wttest.WiredTigerTestCase, suite_subprocess):
+ name = 'test_txn12'
+ uri = 'table:' + name
+ create_params = 'key_format=i,value_format=i'
+
+ # Test that read-only transactions can commit following a failure.
+ def test_txn12(self):
+
+    # Set up the session and table.
+ session = self.conn.open_session(None)
+ session.create(self.uri, self.create_params)
+ session.begin_transaction("isolation=snapshot")
+
+    # Create a read-only transaction.
+ c = session.open_cursor(self.uri, None)
+ c.next()
+ msg = '/next_random.*boolean/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:session.open_cursor(self.uri, None, "next_random=bar"), msg)
+ # This commit should succeed as we have done no writes.
+ session.commit_transaction()
+
+ # Create a read/write transaction.
+ session.begin_transaction("isolation=snapshot")
+ c = session.open_cursor(self.uri, None)
+ c[123] = 123
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:session.open_cursor(self.uri, None, "next_random=bar"), msg)
+    # This commit should fail as we have written something.
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:session.commit_transaction(), '/requires rollback/')
+
+if __name__ == '__main__':
+ wttest.run()
+
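
The new test pins down a subtle rule: a failed operation inside a transaction forces rollback only if the transaction has made updates; a read-only transaction can still commit. A minimal sketch of the same sequence outside the test harness, assuming the standalone wiredtiger Python API ('table:demo' is illustrative):

import os
import wiredtiger

if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
conn = wiredtiger.wiredtiger_open('WT_HOME', 'create')
session = conn.open_session()
session.create('table:demo', 'key_format=i,value_format=i')

# The invalid next_random value raises, but since the transaction made
# no updates, commit still succeeds.
session.begin_transaction('isolation=snapshot')
try:
    session.open_cursor('table:demo', None, 'next_random=bar')
except wiredtiger.WiredTigerError:
    pass
session.commit_transaction()
conn.close()
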
diff --git a/test/suite/test_util01.py b/test/suite/test_util01.py
index 0b1e2a35833..29033fb43ba 100644
--- a/test/suite/test_util01.py
+++ b/test/suite/test_util01.py
@@ -168,7 +168,7 @@ class test_util01(wttest.WiredTigerTestCase, suite_subprocess):
dumpargs.append("-x")
dumpargs.append(self.tablename)
self.runWt(dumpargs, outfilename="dump.out")
-
+
self.assertTrue(self.compare_files("expect.out", "dump.out"))
def test_dump_process(self):
@@ -179,10 +179,10 @@ class test_util01(wttest.WiredTigerTestCase, suite_subprocess):
def test_dump_api(self):
self.dump(True, False)
-
+
def test_dump_api_hex(self):
self.dump(True, True)
-
+
if __name__ == '__main__':
wttest.run()
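
The dump test shells out to the wt utility, optionally passing -x for hexadecimal output. A hedged sketch of the equivalent standalone invocation ('table:main' and the paths are illustrative, and a wt binary is assumed to be on PATH):

import subprocess

# Hex-dump a table from the WT_HOME database directory into dump.out;
# drop '-x' for the printable form the test compares by default.
with open('dump.out', 'w') as f:
    subprocess.check_call(
        ['wt', '-h', 'WT_HOME', 'dump', '-x', 'table:main'], stdout=f)
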
diff --git a/test/suite/wtscenario.py b/test/suite/wtscenario.py
index 6e4b0d3464e..0f8e8c30c1f 100644
--- a/test/suite/wtscenario.py
+++ b/test/suite/wtscenario.py
@@ -61,7 +61,7 @@ def log2chr(val):
return chr(ord('0') + p)
else:
return chr(ord('a') + p - 10)
-
+
megabyte = 1024 * 1024
def check_scenarios(scenes):
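
log2chr encodes an exponent as a single scenario-label character: digits for values below 10, letters from 'a' onward above that. A hedged usage sketch (assuming the unseen head of the function computes p as the base-2 logarithm of val):

from wtscenario import log2chr, megabyte

print log2chr(megabyte)   # 2^20 -> p = 20 -> chr(ord('a') + 20 - 10) == 'k'
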
diff --git a/test/suite/wttest.py b/test/suite/wttest.py
index 9f833b0b6a4..443fabb00b2 100644
--- a/test/suite/wttest.py
+++ b/test/suite/wttest.py
@@ -169,14 +169,14 @@ class WiredTigerTestCase(unittest.TestCase):
self.captureerr = CapturedFd('stderr.txt', 'error output')
sys.stdout = self.captureout.capture()
sys.stderr = self.captureerr.capture()
-
+
def fdTearDown(self):
# restore stderr/stdout
self.captureout.release()
self.captureerr.release()
sys.stdout = WiredTigerTestCase._stdout
sys.stderr = WiredTigerTestCase._stderr
-
+
def __init__(self, *args, **kwargs):
if hasattr(self, 'scenarios'):
assert(len(self.scenarios) == len(dict(self.scenarios)))
@@ -204,11 +204,11 @@ class WiredTigerTestCase(unittest.TestCase):
'create,error_prefix="%s",%s' % (self.shortid(), self.conn_config))
self.pr(`conn`)
return conn
-
+
# Can be overridden
def setUpSessionOpen(self, conn):
return conn.open_session(None)
-
+
# Can be overridden
def close_conn(self):
"""
@@ -351,7 +351,7 @@ class WiredTigerTestCase(unittest.TestCase):
else:
with self.expectedStderr(message):
self.assertRaises(exceptionType, expr)
-
+
def exceptionToStderr(self, expr):
"""
Used by assertRaisesHavingMessage to convert an expression
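
CapturedFd packages a standard idiom: swap sys.stdout and sys.stderr for file-backed streams in setup, then restore the saved originals in tear-down. A condensed, hedged sketch of the bare pattern (file name illustrative):

import sys

saved_stdout = sys.stdout
sys.stdout = open('stdout.txt', 'w')
try:
    print 'this line lands in stdout.txt'
finally:
    # Always restore the real stream, even if the test body raises.
    sys.stdout.close()
    sys.stdout = saved_stdout
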
diff --git a/test/suite/wtthread.py b/test/suite/wtthread.py
index 1e2e4f56380..8959684d6d3 100644
--- a/test/suite/wtthread.py
+++ b/test/suite/wtthread.py
@@ -35,7 +35,7 @@ class checkpoint_thread(threading.Thread):
self.conn = conn
self.done = done
threading.Thread.__init__(self)
-
+
def run(self):
sess = self.conn.open_session()
while not self.done.isSet():
@@ -50,7 +50,7 @@ class backup_thread(threading.Thread):
self.conn = conn
self.done = done
threading.Thread.__init__(self)
-
+
def run(self):
sess = self.conn.open_session()
while not self.done.isSet():
@@ -111,7 +111,7 @@ class op_thread(threading.Thread):
self.queue = queue
self.done = done
threading.Thread.__init__(self)
-
+
def run(self):
sess = self.conn.open_session()
if (len(self.uris) == 1):
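
The three helper classes share one pattern: each thread opens its own WiredTiger session (a session must not be used by multiple threads concurrently) and loops until the shared threading.Event is set. A condensed, hedged sketch of the checkpoint variant, with connection setup added so it runs standalone:

import os
import threading
import wiredtiger

class checkpointer(threading.Thread):
    def __init__(self, conn, done):
        self.conn = conn
        self.done = done
        threading.Thread.__init__(self)

    def run(self):
        # Each thread needs its own session.
        sess = self.conn.open_session()
        while not self.done.isSet():
            sess.checkpoint()
        sess.close()

if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
conn = wiredtiger.wiredtiger_open('WT_HOME', 'create')
done = threading.Event()
t = checkpointer(conn, done)
t.start()
# ... run a workload on other sessions here ...
done.set()
t.join()
conn.close()
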
diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py
index db5b14d6cd6..f2f193c0860 100644
--- a/tools/wtstats/stat_data.py
+++ b/tools/wtstats/stat_data.py
@@ -13,6 +13,7 @@ no_scale_per_second_list = [
'cache: tracked dirty bytes in the cache',
'cache: tracked dirty pages in the cache',
'connection: files currently open',
+ 'data-handle: connection data handles currently active',
'log: maximum log file size',
'log: number of pre-allocated log files to create',
'log: total log buffer size',
@@ -42,6 +43,7 @@ no_scale_per_second_list = [
'btree: column-store internal pages',
'btree: column-store variable-size deleted values',
'btree: column-store variable-size leaf pages',
+ 'btree: column-store variable-size RLE encoded values',
'btree: fixed-record size',
'btree: maximum internal page key size',
'btree: maximum internal page size',
@@ -74,6 +76,7 @@ no_clear_list = [
'cache: tracked dirty bytes in the cache',
'cache: tracked dirty pages in the cache',
'connection: files currently open',
+ 'data-handle: connection data handles currently active',
'log: maximum log file size',
'log: number of pre-allocated log files to create',
'log: total log buffer size',
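
These lists key off the statistics' full 'category: description' strings: entries in no_scale_per_second_list are point-in-time gauges (like the newly added data-handle count) that must not be converted to per-second rates, and no_clear_list entries must not be reset between samples. A hedged sketch of how a consumer might consult them (the helper is hypothetical, not part of wtstats):

def scale_per_second(stat_name):
    # Hypothetical helper: gauges such as
    # 'data-handle: connection data handles currently active' are
    # absolute values, so leave them unscaled.
    return stat_name not in no_scale_per_second_list
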