Merge branch 'develop' into mongodb-3.4 (tag: mongodb-3.3.4)

author: Alex Gorrod <alexg@wiredtiger.com> 2016-03-22 14:49:51 +1100
committer: Alex Gorrod <alexg@wiredtiger.com> 2016-03-22 14:49:51 +1100
commit: 9cf8eb2f15c6df7da90c19c86ccf7516ed126183 (patch)
tree: dd8d22e7b881791e64cd8efaa9d0befb12b2ba84
parent: 444981a456059f0652fd3bb1968d58d2c37b9089 (diff)
parent: 18e6091d9c16bf46bc8d0750b2227ca71a559c33 (diff)
download: mongo-9cf8eb2f15c6df7da90c19c86ccf7516ed126183.tar.gz
135 files changed, 4213 insertions, 1433 deletions
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 5755e22dd2f..340c400ba7e 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -1668,7 +1668,7 @@ execute_workload(CONFIG *cfg)
 	for (threads = cfg->workers, i = 0,
 	    workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) {
 		lprintf(cfg, 0, 1,
-		    "Starting workload #%d: %" PRId64 " threads, inserts=%"
+		    "Starting workload #%u: %" PRId64 " threads, inserts=%"
 		    PRId64 ", reads=%" PRId64 ", updates=%" PRId64
 		    ", truncate=%" PRId64 ", throttle=%" PRId64,
 		    i + 1, workp->threads, workp->insert,
@@ -2267,7 +2267,7 @@ main(int argc, char *argv[])
 	 * the compact operation, but not for the workloads.
 	 */
 	if (cfg->async_threads > 0) {
-		if (F_ISSET(cfg, CFG_TRUNCATE) > 0) {
+		if (F_ISSET(cfg, CFG_TRUNCATE)) {
 			lprintf(cfg, 1, 0, "Cannot run truncate and async\n");
 			goto err;
 		}
@@ -2285,7 +2285,7 @@ main(int argc, char *argv[])
 		req_len = strlen(",async=(enabled=true,threads=)") + 4;
 		cfg->async_config = dcalloc(req_len, 1);
 		snprintf(cfg->async_config, req_len,
-		    ",async=(enabled=true,threads=%d)",
+		    ",async=(enabled=true,threads=%" PRIu32 ")",
 		    cfg->async_threads);
 	}
 	if ((ret = config_compress(cfg)) != 0)
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index c591499b907..a2b497b3142 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -337,7 +337,7 @@ generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno)
 static inline void
 extract_key(char *key_buf, uint64_t *keynop)
 {
-	sscanf(key_buf, "%" SCNu64, keynop);
+	(void)sscanf(key_buf, "%" SCNu64, keynop);
 }
 
 /*
@@ -370,11 +370,11 @@ dmalloc(size_t len)
  *      Call calloc, dying on failure.
  */
 static inline void *
-dcalloc(size_t num, size_t len)
+dcalloc(size_t num, size_t size)
 {
 	void *p;
 
-	if ((p = calloc(len, num)) == NULL)
+	if ((p = calloc(num, size)) == NULL)
 		die(errno, "calloc");
 	return (p);
 }
@@ -416,11 +416,9 @@ static inline char *
 dstrndup(const char *str, const size_t len)
 {
 	char *p;
-	p = dcalloc(len + 1, 1);
 
-	strncpy(p, str, len);
-	if (p == NULL)
-		die(errno, "dstrndup");
+	p = dcalloc(len + 1, sizeof(char));
+	memcpy(p, str, len);
 	return (p);
 }
 #endif
diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs
index 14258666d84..4e1f829c0c5 100644
--- a/build_posix/Make.subdirs
+++ b/build_posix/Make.subdirs
@@ -6,6 +6,7 @@
 # If the directory exists, it is added to AUTO_SUBDIRS.
 # If a condition is included, the subdir is made conditional via AM_CONDITIONAL
 ext/collators/reverse
+ext/collators/revint
 ext/compressors/lz4 LZ4
 ext/compressors/nop
 ext/compressors/snappy SNAPPY
@@ -30,6 +31,7 @@ test/cursor_order
 test/fops
 test/format
 test/huge
+test/manydbs
 test/packing
 test/readonly
 test/recovery
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 06d73e2fe12..9251873be73 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -34,6 +34,22 @@ AC_PROG_CC(cc gcc)
 AC_PROG_CXX(c++ g++)
 AM_PROG_AS(as gas)
 
+# This is a workaround as part of WT-2459. Currently, clang (v3.7) does not
+# support compiling the ASM code we have to perform the CRC checks on PowerPC.
+# To compile with clang we need to override the ASM compiler with CCAS to use
+# gcc. Unfortunately, doing the compilation in this manner means libtool can't
+# determine what tag to use for that one .S file. If we catch that we are using
+# two different compilers for CC and CCAS and we are on a PowerPC system we
+# overload the libtool flags to provide CC by default.
+if test "$CC" != "$CCAS"; then
+	AS_CASE([$host_cpu],
+		[ppc64*], [AM_LIBTOOLFLAGS+="--tag=CC"],
+		[elf64lppc], [AM_LIBTOOLFLAGS+="--tag=CC"],
+		[powerpc*], [AM_LIBTOOLFLAGS+="--tag=CC"],
+		[])
+fi
+AC_SUBST(AM_LIBTOOLFLAGS)
+
 if test "$GCC" = "yes"; then
 	# The Solaris gcc compiler gets the additional -pthreads flag.
 	if test "`uname -s`" = "SunOS"; then
@@ -97,6 +113,13 @@ AC_SYS_LARGEFILE
 
 AC_C_BIGENDIAN
 
+AC_MSG_CHECKING([for a 64-bit build])
+AC_COMPUTE_INT(ac_cv_sizeof_void_p, [sizeof(void *)])
+if test "$ac_cv_sizeof_void_p" != "8" ; then
+    AC_MSG_ERROR([WiredTiger requires a 64-bit build.])
+fi
+AC_MSG_RESULT(yes)
+
 # Linux requires _GNU_SOURCE to be defined
 case "$host_os" in
 linux*)	AM_CFLAGS="$AM_CFLAGS -D_GNU_SOURCE" ;;
diff --git a/build_posix/reconf b/build_posix/reconf
index 8700c5da43d..16d4002d9b9 100755
--- a/build_posix/reconf
+++ b/build_posix/reconf
@@ -24,6 +24,7 @@ clean()
 		aclocal.m4 \
 		auto-includes.chk \
 		autom4te.cache \
+		config.cache \
 		config.hin \
 		config.hin~ \
 		config.log \
diff --git a/build_win/filelist.win b/build_win/filelist.win
index 0a313026793..b6a9caf4a74 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -155,6 +155,7 @@ src/session/session_compact.c
 src/session/session_dhandle.c
 src/session/session_salvage.c
 src/support/cksum.c
+src/support/cond_auto.c
 src/support/crypto.c
 src/support/err.c
 src/support/filename.c
diff --git a/dist/api_data.py b/dist/api_data.py
index 5575bd9f790..02aee1e8825 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -76,12 +76,12 @@ lsm_config = [
         Config('bloom', 'true', r'''
             create bloom filters on LSM tree chunks as they are merged''',
             type='boolean'),
-        Config('bloom_config', '', r'''
-            config string used when creating Bloom filter files, passed
-            to WT_SESSION::create'''),
         Config('bloom_bit_count', '16', r'''
             the number of bits used per item for LSM bloom filters''',
             min='2', max='1000'),
+        Config('bloom_config', '', r'''
+            config string used when creating Bloom filter files, passed
+            to WT_SESSION::create'''),
         Config('bloom_hash_count', '8', r'''
             the number of hash values per item used for LSM bloom
             filters''',
@@ -299,6 +299,15 @@ file_meta = file_config + [
         the file version'''),
 ]
 
+lsm_meta = file_config + lsm_config + [
+    Config('last', '', r'''
+        the last allocated chunk ID'''),
+    Config('chunks', '', r'''
+        active chunks in the LSM tree'''),
+    Config('old_chunks', '', r'''
+        obsolete chunks in the LSM tree'''),
+]
+
 table_only_config = [
     Config('colgroups', '', r'''
         comma-separated list of names of column groups.  Each column
@@ -741,12 +750,16 @@ cursor_runtime_config = [
 ]
 
 methods = {
-'file.meta' : Method(file_meta),
-
 'colgroup.meta' : Method(colgroup_meta),
 
+'file.config' : Method(file_config),
+
+'file.meta' : Method(file_meta),
+
 'index.meta' : Method(index_meta),
 
+'lsm.meta' : Method(lsm_meta),
+
 'table.meta' : Method(table_meta),
 
 'WT_CURSOR.close' : Method([]),
diff --git a/dist/filelist b/dist/filelist
index 4ed7d7e3beb..350e0c50087 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -153,6 +153,7 @@ src/session/session_compact.c
 src/session/session_dhandle.c
 src/session/session_salvage.c
 src/support/cksum.c
+src/support/cond_auto.c
 src/support/crypto.c
 src/support/err.c
 src/support/filename.c
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 2caaddcc15a..6762521ca76 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -758,6 +758,8 @@ majorp
 malloc
 marshall
 marshalled
+maxcpu
+maxdbs
 mbll
 mbss
 mem
@@ -770,6 +772,7 @@ memset
 memsize
 metaconf
 metadata
+metadata's
 metafile
 mfence
 minorp
@@ -806,6 +809,7 @@ nfilename
 nhex
 nlpo
 nocase
+noclear
 nocrypto
 nolock
 nonliteral
@@ -844,8 +848,11 @@ parserp
 patchp
 pathname
 pathnames
+pclose
+pcpu
 perf
 pfx
+popen
 poptable
 popthreads
 portably
@@ -871,6 +878,7 @@ ps
 psp
 pthread
 ptr
+ptrdiff
 pushms
 putK
 putV
@@ -908,6 +916,7 @@ resize
 resizing
 ret
 retp
+revint
 rf
 rle
 rmw
@@ -988,6 +997,7 @@ t's
 tV
 tablename
 tcbench
+td
 testutil
 th
 tid
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 09e5643a5d6..bd951e64999 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -109,6 +109,8 @@ connection_stats = [
     ##########################################
     # System statistics
     ##########################################
+    ConnStat('cond_auto_wait', 'auto adjusting condition wait calls'),
+    ConnStat('cond_auto_wait_reset', 'auto adjusting condition resets'),
     ConnStat('cond_wait', 'pthread mutex condition wait calls'),
     ConnStat('file_open', 'files currently open', 'no_clear,no_scale'),
     ConnStat('memory_allocation', 'memory allocations'),
@@ -124,7 +126,7 @@ connection_stats = [
     ##########################################
     AsyncStat('async_alloc_race', 'number of allocation state races'),
     AsyncStat('async_alloc_view', 'number of operation slots viewed for allocation'),
-    AsyncStat('async_cur_queue', 'current work queue length'),
+    AsyncStat('async_cur_queue', 'current work queue length', 'no_scale'),
     AsyncStat('async_flush', 'number of flush calls'),
     AsyncStat('async_full', 'number of times operation allocation failed'),
     AsyncStat('async_max_queue', 'maximum work queue length', 'no_clear,no_scale'),
@@ -218,6 +220,8 @@ connection_stats = [
     LogStat('log_compress_write_fails', 'log records not compressed'),
     LogStat('log_compress_writes', 'log records compressed'),
     LogStat('log_flush', 'log flush operations'),
+    LogStat('log_force_write', 'log force write operations'),
+    LogStat('log_force_write_skip', 'log force write operations skipped'),
     LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale,size'),
     LogStat('log_prealloc_files', 'pre-allocated log files prepared'),
     LogStat('log_prealloc_max', 'number of pre-allocated log files to create', 'no_clear,no_scale'),
@@ -238,6 +242,7 @@ connection_stats = [
     LogStat('log_sync', 'log sync operations'),
     LogStat('log_sync_dir', 'log sync_dir operations'),
     LogStat('log_write_lsn', 'log server thread advances write LSN'),
+    LogStat('log_write_lsn_skip', 'log server thread write LSN walk skipped'),
     LogStat('log_writes', 'log write operations'),
     LogStat('log_zero_fills', 'log files manually zero-filled'),
 
@@ -397,7 +402,7 @@ dsrc_stats = [
     BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'),
     BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'),
     BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'),
-    BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'size'),
+    BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'no_scale,size'),
     BlockStat('block_size', 'file size in bytes', 'no_scale,size'),
 
     ##########################################
diff --git a/examples/c/ex_async.c b/examples/c/ex_async.c
index 584c3e54b87..ecdbd2f4fea 100644
--- a/examples/c/ex_async.c
+++ b/examples/c/ex_async.c
@@ -218,7 +218,7 @@ main(void)
 	 */
 	ret = conn->close(conn, NULL);
 
-	printf("Searched for %d keys\n", ex_asynckeys.num_keys);
+	printf("Searched for %" PRIu32 " keys\n", ex_asynckeys.num_keys);
 
 	return (ret);
 }
diff --git a/examples/c/ex_config_parse.c b/examples/c/ex_config_parse.c
index 124eff21130..be3c78bedd4 100644
--- a/examples/c/ex_config_parse.c
+++ b/examples/c/ex_config_parse.c
@@ -30,6 +30,7 @@
  *	configuration strings.
  */
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <string.h>
 
@@ -99,7 +100,7 @@ main(void)
 	while ((ret = parser->next(parser, &k, &v)) == 0) {
 		printf("%.*s:", (int)k.len, k.str);
 		if (v.type == WT_CONFIG_ITEM_NUM)
-			printf("%d\n", (int)v.val);
+			printf("%" PRId64 "\n", v.val);
 		else
 			printf("%.*s\n", (int)v.len, v.str);
 	}
@@ -126,7 +127,7 @@ main(void)
 		    "log.file_max configuration: %s", wiredtiger_strerror(ret));
 		return (ret);
 	}
-	printf("log file max: %d\n", (int)v.val);
+	printf("log file max: %" PRId64 "\n", v.val);
 	/*! [nested get] */
 	ret = parser->close(parser);
 
diff --git a/examples/c/ex_extractor.c b/examples/c/ex_extractor.c
index fff9c79f8e0..8623f4759fc 100644
--- a/examples/c/ex_extractor.c
+++ b/examples/c/ex_extractor.c
@@ -99,11 +99,13 @@ my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
 		 * key(s).  WiredTiger will perform the required operation
 		 * (such as a remove()).
 		 */
-		fprintf(stderr, "EXTRACTOR: index op for year %d: %s %s\n",
+		fprintf(stderr,
+		    "EXTRACTOR: index op for year %" PRIu16 ": %s %s\n",
 		    year, first_name, last_name);
 		result_cursor->set_key(result_cursor, year);
 		if ((ret = result_cursor->insert(result_cursor)) != 0) {
-			fprintf(stderr, "EXTRACTOR: op year %d: error %d\n",
+			fprintf(stderr,
+			    "EXTRACTOR: op year %" PRIu16 ": error %d\n",
 			    year, ret);
 			return (ret);
 		}
@@ -157,7 +159,7 @@ read_index(WT_SESSION *session)
 	 */
 	for (i = 0; i < 10 && RET_OK(ret); i++) {
 		year = (uint16_t)((rand() % YEAR_SPAN) + YEAR_BASE);
-		printf("Year %d:\n", year);
+		printf("Year %" PRIu16 ":\n", year);
 		cursor->set_key(cursor, year);
 		if ((ret = cursor->search(cursor)) != 0)
 			break;
@@ -181,7 +183,7 @@ read_index(WT_SESSION *session)
 		}
 	}
 	if (!RET_OK(ret))
-		fprintf(stderr, "Error %d for year %d\n", ret, year);
+		fprintf(stderr, "Error %d for year %" PRIu16 "\n", ret, year);
 
 	ret = cursor->close(cursor);
 	return (ret);
@@ -245,7 +247,8 @@ setup_table(WT_SESSION *session)
 		cursor->set_key(cursor, p.id);
 		cursor->set_value(cursor,
 		    p.last_name, p.first_name, p.term_start, p.term_end);
-		fprintf(stderr, "SETUP: table insert %d-%d: %s %s\n",
+		fprintf(stderr,
+		    "SETUP: table insert %" PRIu16 "-%" PRIu16 ": %s %s\n",
 		    p.term_start, p.term_end,
 		    p.first_name, p.last_name);
 		ret = cursor->insert(cursor);
diff --git a/examples/c/ex_schema.c b/examples/c/ex_schema.c
index fdf02d12302..70fc7eb2e62 100644
--- a/examples/c/ex_schema.c
+++ b/examples/c/ex_schema.c
@@ -69,7 +69,7 @@ main(void)
 {
 	POP_RECORD *p;
 	WT_CONNECTION *conn;
-	WT_CURSOR *cursor, *cursor2, *join_cursor;
+	WT_CURSOR *cursor, *cursor2, *join_cursor, *stat_cursor;
 	WT_SESSION *session;
 	const char *country;
 	uint64_t recno, population;
@@ -86,7 +86,8 @@ main(void)
 	} else
 		home = NULL;
 
-	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+	if ((ret = wiredtiger_open(
+	    home, NULL, "create,statistics=(fast)", &conn)) != 0) {
 		fprintf(stderr, "Error connecting to %s: %s\n",
 		    home, wiredtiger_strerror(ret));
 		return (ret);
@@ -164,7 +165,8 @@ main(void)
 		ret = cursor->get_key(cursor, &recno);
 		ret = cursor->get_value(cursor, &country, &year, &population);
 		printf("ID %" PRIu64, recno);
-		printf(": country %s, year %u, population %" PRIu64 "\n",
+		printf(
+		    ": country %s, year %" PRIu16 ", population %" PRIu64 "\n",
 		    country, year, population);
 	}
 	ret = cursor->close(cursor);
@@ -185,7 +187,8 @@ main(void)
 		ret = wiredtiger_struct_unpack(session,
 		    value.data, value.size,
 		    "5sHQ", &country, &year, &population);
-		printf(": country %s, year %u, population %" PRIu64 "\n",
+		printf(
+		    ": country %s, year %" PRIu16 ", population %" PRIu64 "\n",
 		    country, year, population);
 	}
 	/*! [List the records in the table using raw mode.] */
@@ -201,7 +204,9 @@ main(void)
 	cursor->set_key(cursor, 2);
 	if ((ret = cursor->search(cursor)) == 0) {
 		ret = cursor->get_value(cursor, &country, &year, &population);
-		printf("ID 2: country %s, year %u, population %" PRIu64 "\n",
+		printf(
+		    "ID 2: "
+		    "country %s, year %" PRIu16 ", population %" PRIu64 "\n",
 		    country, year, population);
 	}
 	/*! [Read population from the primary column group] */
@@ -229,8 +234,8 @@ main(void)
 	cursor->set_key(cursor, "AU\0\0\0");
 	ret = cursor->search(cursor);
 	ret = cursor->get_value(cursor, &country, &year, &population);
-	printf("AU: country %s, year %u, population %" PRIu64 "\n",
-	    country, (unsigned int)year, population);
+	printf("AU: country %s, year %" PRIu16 ", population %" PRIu64 "\n",
+	    country, year, population);
 	/*! [Search in a simple index] */
 	ret = cursor->close(cursor);
 
@@ -241,8 +246,9 @@ main(void)
 	cursor->set_key(cursor, "USA\0\0", (uint16_t)1900);
 	ret = cursor->search(cursor);
 	ret = cursor->get_value(cursor, &country, &year, &population);
-	printf("US 1900: country %s, year %u, population %" PRIu64 "\n",
-	    country, (unsigned int)year, population);
+	printf(
+	    "US 1900: country %s, year %" PRIu16 ", population %" PRIu64 "\n",
+	    country, year, population);
 	/*! [Search in a composite index] */
 	ret = cursor->close(cursor);
 
@@ -255,7 +261,7 @@ main(void)
 	    "table:poptable(country,year)", NULL, NULL, &cursor);
 	while ((ret = cursor->next(cursor)) == 0) {
 		ret = cursor->get_value(cursor, &country, &year);
-		printf("country %s, year %u\n", country, year);
+		printf("country %s, year %" PRIu16 "\n", country, year);
 	}
 	/*! [Return a subset of values from the table] */
 	ret = cursor->close(cursor);
@@ -273,7 +279,7 @@ main(void)
 		ret = cursor->get_value(cursor, &value);
 		ret = wiredtiger_struct_unpack(
 		    session, value.data, value.size, "5sH", &country, &year);
-		printf("country %s, year %u\n", country, year);
+		printf("country %s, year %" PRIu16 "\n", country, year);
 	}
 	/*! [Return a subset of values from the table using raw mode] */
 	ret = cursor->close(cursor);
@@ -288,7 +294,7 @@ main(void)
 	while ((ret = cursor->next(cursor)) == 0) {
 		ret = cursor->get_key(cursor, &country, &year);
 		ret = cursor->get_value(cursor, &recno);
-		printf("row ID %" PRIu64 ": country %s, year %u\n",
+		printf("row ID %" PRIu64 ": country %s, year %" PRIu16 "\n",
 		    recno, country, year);
 	}
 	/*! [Return the table's record number key using an index] */
@@ -305,7 +311,7 @@ main(void)
 	while ((ret = cursor->next(cursor)) == 0) {
 		ret = cursor->get_key(cursor, &country, &year);
 		ret = cursor->get_value(cursor, &population);
-		printf("population %" PRIu64 ": country %s, year %u\n",
+		printf("population %" PRIu64 ": country %s, year %" PRIu16 "\n",
 		    population, country, year);
 	}
 	/*! [Return a subset of the value columns from an index] */
@@ -320,7 +326,7 @@ main(void)
 	    "index:poptable:country_plus_year()", NULL, NULL, &cursor);
 	while ((ret = cursor->next(cursor)) == 0) {
 		ret = cursor->get_key(cursor, &country, &year);
-		printf("country %s, year %u\n", country, year);
+		printf("country %s, year %" PRIu16 "\n", country, year);
 	}
 	/*! [Access only the index] */
 	ret = cursor->close(cursor);
@@ -350,10 +356,19 @@ main(void)
 		ret = join_cursor->get_value(join_cursor, &country, &year,
 		    &population);
 		printf("ID %" PRIu64, recno);
-		printf(": country %s, year %u, population %" PRIu64 "\n",
+		printf(
+		    ": country %s, year %" PRIu16 ", population %" PRIu64 "\n",
 		    country, year, population);
 	}
 	/*! [Join cursors] */
+
+	/*! [Statistics cursor join cursor] */
+	ret = session->open_cursor(session,
+	    "statistics:join",
+	    join_cursor, NULL, &stat_cursor);
+	/*! [Statistics cursor join cursor] */
+
+	ret = stat_cursor->close(stat_cursor);
 	ret = join_cursor->close(join_cursor);
 	ret = cursor2->close(cursor2);
 	ret = cursor->close(cursor);
diff --git a/examples/c/ex_stat.c b/examples/c/ex_stat.c
index 65402230eb8..6c5c15aacc6 100644
--- a/examples/c/ex_stat.c
+++ b/examples/c/ex_stat.c
@@ -39,6 +39,7 @@
 int print_cursor(WT_CURSOR *);
 int print_database_stats(WT_SESSION *);
 int print_file_stats(WT_SESSION *);
+int print_join_cursor_stats(WT_SESSION *);
 int print_overflow_pages(WT_SESSION *);
 int get_stat(WT_CURSOR *cursor, int stat_field, uint64_t *valuep);
 int print_derived_stats(WT_SESSION *);
@@ -99,6 +100,37 @@ print_file_stats(WT_SESSION *session)
 }
 
 int 
+print_join_cursor_stats(WT_SESSION *session)
+{
+	WT_CURSOR *idx_cursor, *join_cursor, *stat_cursor;
+	int ret;
+
+	ret = session->create(
+	    session, "index:access:idx", "columns=(v)");
+	ret = session->open_cursor(
+	    session, "index:access:idx", NULL, NULL, &idx_cursor);
+	ret = idx_cursor->next(idx_cursor);
+	ret = session->open_cursor(
+	    session, "join:table:access", NULL, NULL, &join_cursor);
+	ret = session->join(session, join_cursor, idx_cursor, "compare=gt");
+	ret = join_cursor->next(join_cursor);
+
+	/*! [statistics join cursor function] */
+	if ((ret = session->open_cursor(session,
+	    "statistics:join", join_cursor, NULL, &stat_cursor)) != 0)
+		return (ret);
+
+	ret = print_cursor(stat_cursor);
+	ret = stat_cursor->close(stat_cursor);
+	/*! [statistics join cursor function] */
+
+	ret = join_cursor->close(join_cursor);
+	ret = idx_cursor->close(idx_cursor);
+
+	return (ret);
+}
+
+int
 print_overflow_pages(WT_SESSION *session)
 {
 	/*! [statistics retrieve by key] */
@@ -204,7 +236,8 @@ main(void)
 	ret = wiredtiger_open(home, NULL, "create,statistics=(all)", &conn);
 	ret = conn->open_session(conn, NULL, NULL, &session);
 	ret = session->create(
-	    session, "table:access", "key_format=S,value_format=S");
+	    session, "table:access",
+	    "key_format=S,value_format=S,columns=(k,v)");
 
 	ret = session->open_cursor(
 	    session, "table:access", NULL, NULL, &cursor);
@@ -219,6 +252,8 @@ main(void)
 
 	ret = print_file_stats(session);
 
+	ret = print_join_cursor_stats(session);
+
 	ret = print_overflow_pages(session);
 
 	ret = print_derived_stats(session);
diff --git a/examples/java/com/wiredtiger/examples/ex_schema.java b/examples/java/com/wiredtiger/examples/ex_schema.java
index be1077ee2df..7cc26acb479 100644
--- a/examples/java/com/wiredtiger/examples/ex_schema.java
+++ b/examples/java/com/wiredtiger/examples/ex_schema.java
@@ -76,7 +76,7 @@ public class ex_schema {
         throws WiredTigerException
     {
         Connection conn;
-        Cursor cursor, cursor2, join_cursor;
+        Cursor cursor, cursor2, join_cursor, stat_cursor;
         Session session;
         String country;
         long recno, population;
@@ -106,7 +106,7 @@ public class ex_schema {
             home = null;
 
         try {
-            conn = wiredtiger.open(home, "create");
+            conn = wiredtiger.open(home, "create,statistics=(fast)");
             session = conn.open_session(null);
         } catch (WiredTigerException wte) {
             System.err.println("WiredTigerException: " + wte);
@@ -368,6 +368,13 @@ public class ex_schema {
                 ", population " + population);
 	}
 	/*! [Join cursors] */
+
+        /*! [Statistics cursor join cursor] */
+        stat_cursor = session.open_cursor(
+            "statistics:join", join_cursor, null);
+        /*! [Statistics cursor join cursor] */
+
+        ret = stat_cursor.close();
 	ret = join_cursor.close();
 	ret = cursor2.close();
 	ret = cursor.close();
diff --git a/examples/java/com/wiredtiger/examples/ex_stat.java b/examples/java/com/wiredtiger/examples/ex_stat.java
index b0b83a2d3b2..f8877a4620e 100644
--- a/examples/java/com/wiredtiger/examples/ex_stat.java
+++ b/examples/java/com/wiredtiger/examples/ex_stat.java
@@ -92,6 +92,33 @@ public class ex_stat {
     }
 
     int 
+    print_join_cursor_stats(Session session)
+        throws WiredTigerException
+    {
+	Cursor idx_cursor, join_cursor, stat_cursor;
+	int ret;
+
+	ret = session.create("index:access:idx", "columns=(v)");
+	idx_cursor = session.open_cursor("index:access:idx", null, null);
+	ret = idx_cursor.next();
+	join_cursor = session.open_cursor("join:table:access", null, null);
+	ret = session.join(join_cursor, idx_cursor, "compare=gt");
+	ret = join_cursor.next();
+
+	/*! [statistics join cursor function] */
+	stat_cursor = session.open_cursor("statistics:join", join_cursor, null);
+
+	ret = print_cursor(stat_cursor);
+	ret = stat_cursor.close();
+	/*! [statistics join cursor function] */
+
+	ret = join_cursor.close();
+	ret = idx_cursor.close();
+
+	return (ret);
+    }
+
+    int
     print_overflow_pages(Session session)
         throws WiredTigerException
     {
@@ -220,7 +247,8 @@ public class ex_stat {
         conn = wiredtiger.open(home, "create,statistics=(all)");
         session = conn.open_session(null);
 
-        ret = session.create("table:access", "key_format=S,value_format=S");
+        ret = session.create("table:access",
+            "key_format=S,value_format=S,columns=(k,v)");
 
         cursor = session.open_cursor("table:access", null, null);
         cursor.putKeyString("key");
@@ -234,6 +262,8 @@ public class ex_stat {
 
         ret = print_file_stats(session);
 
+        ret = print_join_cursor_stats(session);
+
         ret = print_overflow_pages(session);
 
         ret = print_derived_stats(session);
diff --git a/ext/collators/revint/Makefile.am b/ext/collators/revint/Makefile.am
new file mode 100644
index 00000000000..8c85c6a4701
--- /dev/null
+++ b/ext/collators/revint/Makefile.am
@@ -0,0 +1,10 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+noinst_LTLIBRARIES = libwiredtiger_revint_collator.la
+libwiredtiger_revint_collator_la_SOURCES = revint_collator.c
+
+# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
+# as installation, it will only build static libraries.  As far as I can tell,
+# the "approved" libtool way to turn them back on is by adding -rpath.
+libwiredtiger_revint_collator_la_LDFLAGS = \
+	-avoid-version -module -rpath /nowhere
diff --git a/ext/collators/revint/revint_collator.c b/ext/collators/revint/revint_collator.c
new file mode 100644
index 00000000000..30b5dc67556
--- /dev/null
+++ b/ext/collators/revint/revint_collator.c
@@ -0,0 +1,153 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <stdint.h>
+#include <wiredtiger_ext.h>
+
+/*
+ * A simple WiredTiger collator for indices having a single integer key,
+ * where the ordering is descending (reversed).  This collator also
+ * requires that the primary key be an integer.
+ */
+
+/* Local collator structure. */
+typedef struct {
+	WT_COLLATOR collator;		/* Must come first */
+	WT_EXTENSION_API *wt_api;	/* Extension API */
+} REVINT_COLLATOR;
+
+/*
+ * revint_compare --
+ *	WiredTiger reverse integer collation, used for tests.
+ */
+static int
+revint_compare(WT_COLLATOR *collator,
+    WT_SESSION *session, const WT_ITEM *k1, const WT_ITEM *k2, int *cmp)
+{
+	const REVINT_COLLATOR *revint_collator;
+	WT_EXTENSION_API *wtapi;
+	WT_PACK_STREAM *pstream;
+	int ret;
+	int64_t i1, i2, p1, p2;
+
+	i1 = i2 = p1 = p2 = 0;
+	revint_collator = (const REVINT_COLLATOR *)collator;
+	wtapi = revint_collator->wt_api;
+
+	/*
+	 * All indices using this collator have an integer key, and the
+	 * primary key is also an integer. A collator is usually passed the
+	 * concatenation of index key and primary key (when available),
+	 * hence we initially unpack using "ii".
+	 *
+	 * A collator may also be called with an item that includes an index
+	 * key and no primary key.  Among items having the same index key,
+	 * an item with no primary key should sort before an item with a
+	 * primary key. The reason is that if the application calls
+	 * WT_CURSOR::search on an index key for which there is more than
+	 * one value, the search key will not yet have a primary key.  We
+	 * want to position the cursor at the 'first' matching index key so
+	 * that repeated calls to WT_CURSOR::next will see them all.
+	 *
+	 * To keep this code simple, we do not reverse the ordering
+	 * when comparing primary keys.
+	 */
+	if ((ret = wtapi->unpack_start(
+	    wtapi, session, "ii", k1->data, k1->size, &pstream)) != 0 ||
+	    (ret = wtapi->unpack_int(wtapi, pstream, &i1)) != 0)
+		goto err;
+	if ((ret = wtapi->unpack_int(wtapi, pstream, &p1)) != 0)
+		/* A missing primary key is OK and sorts first. */
+		p1 = INT64_MIN;
+	if ((ret = wtapi->pack_close(wtapi, pstream, NULL)) != 0)
+		goto err;
+
+	/* Unpack the second pair of numbers. */
+	if ((ret = wtapi->unpack_start(
+	    wtapi, session, "ii", k2->data, k2->size, &pstream)) != 0 ||
+	    (ret = wtapi->unpack_int(wtapi, pstream, &i2)) != 0)
+		goto err;
+	if ((ret = wtapi->unpack_int(wtapi, pstream, &p2)) != 0)
+		/* A missing primary key is OK and sorts first. */
+		p2 = INT64_MIN;
+	if ((ret = wtapi->pack_close(wtapi, pstream, NULL)) != 0)
+		goto err;
+
+	/* sorting is reversed */
+	if (i1 < i2)
+		*cmp = 1;
+	else if (i1 > i2)
+		*cmp = -1;
+	/* compare primary keys next, not reversed */
+	else if (p1 < p2)
+		*cmp = -1;
+	else if (p1 > p2)
+		*cmp = 1;
+	else
+		*cmp = 0; /* index key and primary key are same */
+
+err:	return (ret);
+}
+
+/*
+ * revint_terminate --
+ *	Terminate is called to free the collator and any associated memory.
+ */
+static int
+revint_terminate(WT_COLLATOR *collator, WT_SESSION *session)
+{
+	(void)session;				/* Unused parameters */
+
+	/* Free the allocated memory. */
+	free(collator);
+	return (0);
+}
+
+/*
+ * wiredtiger_extension_init --
+ *	WiredTiger revint collation extension.
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	REVINT_COLLATOR *revint_collator;
+
+	(void)config;				/* Unused parameters */
+
+	if ((revint_collator = calloc(1, sizeof(REVINT_COLLATOR))) == NULL)
+		return (errno);
+
+	revint_collator->collator.compare = revint_compare;
+	revint_collator->collator.terminate = revint_terminate;
+	revint_collator->wt_api = connection->get_extension_api(connection);
+
+	return (connection->add_collator(
+	    connection, "revint", &revint_collator->collator, NULL));
+}
diff --git a/src/async/async_op.c b/src/async/async_op.c
index 130c704757b..970c33c3360 100644
--- a/src/async/async_op.c
+++ b/src/async/async_op.c
@@ -349,14 +349,8 @@ __wt_async_op_init(WT_SESSION_IMPL *session)
 		WT_ERR(__async_op_init(conn, op, i));
 	}
 	return (0);
-err:
-	if (async->async_ops != NULL) {
-		__wt_free(session, async->async_ops);
-		async->async_ops = NULL;
-	}
-	if (async->async_queue != NULL) {
-		__wt_free(session, async->async_queue);
-		async->async_queue = NULL;
-	}
+
+err:	__wt_free(session, async->async_ops);
+	__wt_free(session, async->async_queue);
 	return (ret);
 }
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index 03059c8f23a..812bf99acfb 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -812,8 +812,7 @@ __ckpt_string(WT_SESSION_IMPL *session,
 	WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
 
 	WT_RET(__wt_buf_fmt(session, buf,
-	    "version=%d",
-	    ci->version));
+	    "version=%" PRIu8, ci->version));
 	if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
 		WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
 	else
diff --git a/src/block/block_open.c b/src/block/block_open.c
index d9b2f908737..adb745c99e7 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -369,7 +369,7 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
 		WT_ERR_MSG(session, WT_ERROR,
 		    "unsupported WiredTiger file version: this build only "
 		    "supports major/minor versions up to %d/%d, and the file "
-		    "is version %d/%d",
+		    "is version %" PRIu16 "/%" PRIu16,
 		    WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION,
 		    desc->majorv, desc->minorv);
 
diff --git a/src/block/block_write.c b/src/block/block_write.c
index 4c6ac198fe4..e05a430832e 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -206,10 +206,16 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	uint32_t cksum;
 	bool local_locked;
 
-	blk = WT_BLOCK_HEADER_REF(buf->mem);
 	fh = block->fh;
 
 	/*
+	 * Clear the block header to ensure all of it is initialized, even the
+	 * unused fields.
+	 */
+	blk = WT_BLOCK_HEADER_REF(buf->mem);
+	memset(blk, 0, sizeof(*blk));
+
+	/*
 	 * Swap the page-header as needed; this doesn't belong here, but it's
 	 * the best place to catch all callers.
 	 */
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index c11b7d35de6..1f3ac443495 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -173,13 +173,18 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
 		 */
 		break;
 	case BTREE_COL_VAR:
+		/* The search function doesn't check for empty pages. */
+		if (page->pg_var_entries == 0)
+			return (false);
+		WT_ASSERT(session, cbt->slot < page->pg_var_entries);
+
 		/*
-		 * If search returned an insert object, there may or may not be
-		 * a matching on-page object, we have to check.  Variable-length
-		 * column-store pages don't map one-to-one to keys, but have
-		 * "slots", check if search returned a valid slot.
+		 * Column-store updates aren't stored on the page, instead they
+		 * are stored as "insert" objects. If search returned an insert
+		 * object we can't return, the returned on-page object must be
+		 * checked for a match.
 		 */
-		if (cbt->slot >= page->pg_var_entries)
+		if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
 			return (false);
 
 		/*
@@ -194,6 +199,11 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
 			return (false);
 		break;
 	case BTREE_ROW:
+		/* The search function doesn't check for empty pages. */
+		if (page->pg_row_entries == 0)
+			return (false);
+		WT_ASSERT(session, cbt->slot < page->pg_row_entries);
+
 		/*
 		 * See above: for row-store, no insert object can have the same
 		 * key as an on-page object, we're done.
@@ -201,15 +211,6 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
 		if (cbt->ins != NULL)
 			return (false);
 
-		/*
-		 * Check if searched returned a valid slot (the failure mode is
-		 * an empty page, the search function doesn't check, and so the
-		 * more exact test is "page->pg_row_entries == 0", but this test
-		 * mirrors the column-store test).
-		 */
-		if (cbt->slot >= page->pg_row_entries)
-			return (false);
-
 		/* Updates are stored on the page, check for a delete. */
 		if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
 		    session, page->pg_row_upd[cbt->slot])) != NULL) {
@@ -1162,22 +1163,14 @@ int
 __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
 {
 	WT_BTREE *btree;
-	WT_CURSOR_BTREE *cbt;
 	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 
-	cbt = (start != NULL) ? start : stop;
-	session = (WT_SESSION_IMPL *)cbt->iface.session;
-	btree = cbt->btree;
+	session = (WT_SESSION_IMPL *)start->iface.session;
+	btree = start->btree;
 	WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
 
 	/*
-	 * We always delete in a forward direction because it's faster, assert
-	 * our caller provided us with a start cursor.
-	 */
-	WT_ASSERT(session, start != NULL);
-
-	/*
 	 * For recovery, log the start and stop keys for a truncate operation,
 	 * not the individual records removed.  On the other hand, for rollback
 	 * we need to keep track of all the in-memory operations.
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 795111d53f9..1f739c9572e 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -337,8 +337,7 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
 		copy = WT_ROW_KEY_COPY(rip);
 		(void)__wt_row_leaf_key_info(
 		    page, copy, &ikey, NULL, NULL, NULL);
-		if (ikey != NULL)
-			__wt_free(session, ikey);
+		__wt_free(session, ikey);
 	}
 
 	/*
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 52152a2fcac..1d33a7e7c9a 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -350,7 +350,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
 	/* Initialize locks. */
 	WT_RET(__wt_rwlock_alloc(
 	    session, &btree->ovfl_lock, "btree overflow lock"));
-	WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
+	WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));
 
 	btree->checkpointing = WT_CKPT_OFF;		/* Not checkpointing */
 	btree->modified = 0;				/* Clean */
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index ac9faef4ff2..5cf6a9bf2bc 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -281,10 +281,8 @@ err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
 	 * On error, upd points to a single unlinked WT_UPDATE structure,
 	 * first_upd points to a list.
 	 */
-	if (upd != NULL)
-		__wt_free(session, upd);
-	if (first_upd != NULL)
-		__wt_free_update_list(session, first_upd);
+	__wt_free(session, upd);
+	__wt_free_update_list(session, first_upd);
 
 	__wt_scr_free(session, &current_key);
 	__wt_scr_free(session, &las_addr);
@@ -460,12 +458,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
 	WT_DECL_RET;
 	WT_PAGE *page;
 	u_int sleep_cnt, wait_cnt;
-	bool busy, cache_work, oldgen, stalled;
+	bool busy, cache_work, evict_soon, stalled;
 	int force_attempts;
 
 	btree = S2BT(session);
 
-	for (oldgen = stalled = false,
+	for (evict_soon = stalled = false,
 	    force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
 		switch (ref->state) {
 		case WT_REF_DELETED:
@@ -486,7 +484,16 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
 				WT_RET(__wt_cache_eviction_check(
 				    session, 1, NULL));
 			WT_RET(__page_read(session, ref));
-			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+
+			/*
+			 * If configured to not trash the cache, leave the page
+			 * generation unset, we'll set it before returning to
+			 * the oldest read generation, so the page is forcibly
+			 * evicted as soon as possible. We don't do that set
+			 * here because we don't want to evict the page before
+			 * we "acquire" it.
+			 */
+			evict_soon = LF_ISSET(WT_READ_WONT_NEED) ||
 			    F_ISSET(session, WT_SESSION_NO_CACHE);
 			continue;
 		case WT_REF_READING:
@@ -575,20 +582,24 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
 			}
 
 			/*
-			 * If we read the page and we are configured to not
-			 * trash the cache, set the oldest read generation so
-			 * the page is forcibly evicted as soon as possible.
+			 * If we read the page and are configured to not trash
+			 * the cache, and no other thread has already used the
+			 * page, set the oldest read generation so the page is
+			 * forcibly evicted as soon as possible.
 			 *
-			 * Otherwise, update the page's read generation.
+			 * Otherwise, if we read the page, or, if configured to
+			 * update the page's read generation and the page isn't
+			 * already flagged for forced eviction, update the page
+			 * read generation.
 			 */
 			page = ref->page;
-			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
-				__wt_page_evict_soon(page);
-			else if (!LF_ISSET(WT_READ_NO_GEN) &&
-			    page->read_gen != WT_READGEN_OLDEST &&
-			    page->read_gen < __wt_cache_read_gen(session))
-				page->read_gen =
-				    __wt_cache_read_gen_bump(session);
+			if (page->read_gen == WT_READGEN_NOTSET) {
+				if (evict_soon)
+					__wt_page_evict_soon(page);
+				else
+					__wt_cache_read_gen_new(session, page);
+			} else if (!LF_ISSET(WT_READ_NO_GEN))
+				__wt_cache_read_gen_bump(session, page);
 skip_evict:
 			/*
 			 * Check if we need an autocommit transaction.
diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c
index 86360e83ddf..d94eb2ddd80 100644
--- a/src/btree/bt_rebalance.c
+++ b/src/btree/bt_rebalance.c
@@ -412,6 +412,7 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
 	WT_UNUSED(cfg);
 
 	btree = S2BT(session);
+	evict_reset = false;
 
 	/*
 	 * If the tree has never been written to disk, we're done, rebalance
@@ -438,7 +439,8 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
 	 * cache is the root page, and that cannot be evicted; however, this way
 	 * eviction ignores the tree entirely.)
 	 */
-	WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
+	WT_ERR(__wt_evict_file_exclusive_on(session));
+	evict_reset = true;
 
 	/* Recursively walk the tree. */
 	switch (rs->type) {
@@ -470,7 +472,10 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
 	btree->root.page = rs->root;
 	rs->root = NULL;
 
-err:	/* Discard any leftover root page we created. */
+err:	if (evict_reset)
+	    __wt_evict_file_exclusive_off(session);
+
+	/* Discard any leftover root page we created. */
 	if (rs->root != NULL) {
 		__wt_page_modify_clear(session, rs->root);
 		__wt_page_out(session, &rs->root);
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 8d78bda79fb..0e064d306b6 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1206,8 +1206,7 @@ __slvg_col_build_internal(
 	__wt_root_ref_init(&ss->root_ref, page, true);
 
 	if (0) {
-err:		if (addr != NULL)
-			__wt_free(session, addr);
+err:		__wt_free(session, addr);
 		__wt_page_out(session, &page);
 	}
 	return (ret);
@@ -1868,8 +1867,7 @@ __slvg_row_build_internal(
 	__wt_root_ref_init(&ss->root_ref, page, false);
 
 	if (0) {
-err:		if (addr != NULL)
-			__wt_free(session, addr);
+err:		__wt_free(session, addr);
 		__wt_page_out(session, &page);
 	}
 	return (ret);
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 3dea03316ce..4f16a290958 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -151,8 +151,7 @@ __wt_split_stash_discard_all(
 	for (i = 0, stash = session->split_stash;
 	    i < session->split_stash_cnt;
 	    ++i, ++stash)
-		if (stash->p != NULL)
-			__wt_free(session_safe, stash->p);
+		__wt_free(session_safe, stash->p);
 
 	__wt_free(session_safe, session->split_stash);
 	session->split_stash_cnt = session->split_stash_alloc = 0;
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 5273f0ee2c3..57056eb5c99 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -17,6 +17,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
 {
 	struct timespec end, start;
 	WT_BTREE *btree;
+	WT_CONNECTION_IMPL *conn;
 	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_PAGE_MODIFY *mod;
@@ -25,8 +26,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
 	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
 	uint64_t oldest_id, saved_snap_min;
 	uint32_t flags;
-	bool evict_reset;
 
+	conn = S2C(session);
 	btree = S2BT(session);
 	walk = NULL;
 	txn = &session->txn;
@@ -123,9 +124,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
 		 */
 		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);
 
-		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
-		if (evict_reset)
-			__wt_evict_file_exclusive_off(session);
+		WT_ERR(__wt_evict_file_exclusive_on(session));
+		__wt_evict_file_exclusive_off(session);
 
 		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);
 
@@ -223,7 +223,7 @@ err:	/* On error, clear any left-over tree walk. */
 		 * so that eviction knows that the checkpoint has completed.
 		 */
 		WT_PUBLISH(btree->checkpoint_gen,
-		    S2C(session)->txn_global.checkpoint_gen);
+		    conn->txn_global.checkpoint_gen);
 		WT_STAT_FAST_DATA_SET(session,
 		    btree_checkpoint_generation, btree->checkpoint_gen);
 
@@ -257,7 +257,8 @@ err:	/* On error, clear any left-over tree walk. */
 	 * before checkpointing the file).  Start a flush to stable storage,
 	 * but don't wait for it.
 	 */
-	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
+	if (ret == 0 &&
+	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
 		WT_RET(btree->bm->sync(btree->bm, session, true));
 
 	return (ret);
@@ -268,24 +269,18 @@ err:	/* On error, clear any left-over tree walk. */
  *	Cache operations.
  */
 int
-__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op)
+__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
 {
-	WT_DECL_RET;
-	WT_BTREE *btree;
-
-	btree = S2BT(session);
-
 	switch (op) {
 	case WT_SYNC_CHECKPOINT:
 	case WT_SYNC_CLOSE:
 		/*
-		 * Set the checkpoint reference for reconciliation; it's ugly,
-		 * but drilling a function parameter path from our callers to
-		 * the reconciliation of the tree's root page is going to be
-		 * worse.
+		 * Make sure the checkpoint reference is set for
+		 * reconciliation; it's ugly, but drilling a function parameter
+		 * path from our callers to the reconciliation of the tree's
+		 * root page is going to be worse.
 		 */
-		WT_ASSERT(session, btree->ckpt == NULL);
-		btree->ckpt = ckptbase;
+		WT_ASSERT(session, S2BT(session)->ckpt != NULL);
 		break;
 	case WT_SYNC_DISCARD:
 	case WT_SYNC_WRITE_LEAVES:
@@ -295,23 +290,10 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op)
 	switch (op) {
 	case WT_SYNC_CHECKPOINT:
 	case WT_SYNC_WRITE_LEAVES:
-		WT_ERR(__sync_file(session, op));
-		break;
+		return (__sync_file(session, op));
 	case WT_SYNC_CLOSE:
 	case WT_SYNC_DISCARD:
-		WT_ERR(__wt_evict_file(session, op));
-		break;
+		return (__wt_evict_file(session, op));
+	WT_ILLEGAL_VALUE(session);
 	}
-
-err:	switch (op) {
-	case WT_SYNC_CHECKPOINT:
-	case WT_SYNC_CLOSE:
-		btree->ckpt = NULL;
-		break;
-	case WT_SYNC_DISCARD:
-	case WT_SYNC_WRITE_LEAVES:
-		break;
-	}
-
-	return (ret);
 }
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index ae2c20be1b6..952298f2456 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -226,7 +226,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
 			WT_WITH_PAGE_INDEX(session,
 			    ret = __verify_tree(session, &btree->root, vs));
 
-			WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+			WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
 		}
 
 		/* Unload the checkpoint. */
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index 23eae75ec2b..4730267a545 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -211,7 +211,6 @@ descend:	/*
 leaf_only:
 	page = current->page;
 	cbt->ref = current;
-	cbt->recno = recno;
 
 	/* 
 	 * Don't bother searching if the caller is appending a new record where
@@ -225,13 +224,6 @@ leaf_only:
 	}
 
 	/*
-	 * Set the on-page slot to an impossible value larger than any possible
-	 * slot (it's used to interpret the search function's return after the
-	 * search returns an insert list for a page that has no entries).
-	 */
-	cbt->slot = UINT32_MAX;
-
-	/*
 	 * Search the leaf page.
 	 *
 	 * Search after a page is pinned does a search of the pinned page before
@@ -244,28 +236,38 @@ leaf_only:
 	 * that's impossibly large for the page. We do have additional setup to
 	 * do in that case, the record may be appended to the page.
 	 */
-	cbt->compare = 0;
 	if (page->type == WT_PAGE_COL_FIX) {
 		if (recno < page->pg_fix_recno) {
+			cbt->recno = page->pg_fix_recno;
 			cbt->compare = 1;
 			return (0);
 		}
 		if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
 			cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
 			goto past_end;
-		} else
+		} else {
+			cbt->recno = recno;
+			cbt->compare = 0;
 			ins_head = WT_COL_UPDATE_SINGLE(page);
+		}
 	} else {
 		if (recno < page->pg_var_recno) {
+			cbt->recno = page->pg_var_recno;
+			cbt->slot = 0;
 			cbt->compare = 1;
 			return (0);
 		}
 		if ((cip = __col_var_search(page, recno, NULL)) == NULL) {
 			cbt->recno = __col_var_last_recno(page);
+			cbt->slot = page->pg_var_entries == 0 ?
+			    0 : page->pg_var_entries - 1;
 			goto past_end;
 		} else {
+			cbt->recno = recno;
 			cbt->slot = WT_COL_SLOT(page, cip);
+			cbt->compare = 0;
 			ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+			F_SET(cbt, WT_CBT_VAR_ONPAGE_MATCH);
 		}
 	}
 
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index 8b9e858ec18..9fff092d079 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -52,6 +52,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
 	WT_RET(__wt_scr_alloc(session, 0, &key));
 	WT_RET(__wt_scr_alloc(session,
 	    (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));
+	memset(tmp->mem, 0, tmp->memsize);
 
 	if ((gap = btree->key_gap) == 0)
 		gap = 1;
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 9d68c8e0ce7..6169a0a810a 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -461,6 +461,12 @@ leaf_only:
 	cbt->ref = current;
 
 	/*
+	 * Clear current now that we have moved the reference into the btree
+	 * cursor, so that cleanup never releases twice.
+	 */
+	current = NULL;
+
+	/*
 	 * In the case of a right-side tree descent during an insert, do a fast
 	 * check for an append to the page, try to catch cursors appending data
 	 * into the tree.
@@ -614,14 +620,7 @@ leaf_match:	cbt->compare = 0;
 
 	return (0);
 
-err:	/*
-	 * Release the current page if the search started at the root. If the
-	 * search didn't start at the root we should never have gone looking
-	 * beyond the start page.
-	 */
-	WT_ASSERT(session, leaf == NULL || leaf == current);
-	if (leaf == NULL)
-		WT_TRET(__wt_page_release(session, current, 0));
+err:	WT_TRET(__wt_page_release(session, current, 0));
 	return (ret);
 }
 
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index 3549e41e80d..8796ec6b2fc 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -205,7 +205,7 @@ __wt_las_cursor(
 	 * useful more than once.
 	 */
 	*session_flags =
-	    F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+	    F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
 
 	conn = S2C(session);
 
diff --git a/src/config/config.c b/src/config/config.c
index f480ab83dbd..96ef7a4e62a 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -16,9 +16,9 @@ static int
 __config_err(WT_CONFIG *conf, const char *msg, int err)
 {
 	WT_RET_MSG(conf->session, err,
-	    "Error parsing '%.*s' at byte %u: %s",
+	    "Error parsing '%.*s' at offset %" WT_PTRDIFFT_FMT ": %s",
 	    (int)(conf->end - conf->orig), conf->orig,
-	    (u_int)(conf->cur - conf->orig), msg);
+	    conf->cur - conf->orig, msg);
 }
 
 /*
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 6e88f9b4d14..c752e5eb265 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -391,6 +391,61 @@ static const WT_CONFIG_CHECK confchk_colgroup_meta[] = {
 	{ NULL, NULL, NULL, NULL, NULL, 0 }
 };
 
+static const WT_CONFIG_CHECK confchk_file_config[] = {
+	{ "allocation_size", "int",
+	    NULL, "min=512B,max=128MB",
+	    NULL, 0 },
+	{ "app_metadata", "string", NULL, NULL, NULL, 0 },
+	{ "block_allocation", "string",
+	    NULL, "choices=[\"first\",\"best\"]",
+	    NULL, 0 },
+	{ "block_compressor", "string", NULL, NULL, NULL, 0 },
+	{ "cache_resident", "boolean", NULL, NULL, NULL, 0 },
+	{ "checksum", "string",
+	    NULL, "choices=[\"on\",\"off\",\"uncompressed\"]",
+	    NULL, 0 },
+	{ "collator", "string", NULL, NULL, NULL, 0 },
+	{ "columns", "list", NULL, NULL, NULL, 0 },
+	{ "dictionary", "int", NULL, "min=0", NULL, 0 },
+	{ "encryption", "category",
+	    NULL, NULL,
+	    confchk_WT_SESSION_create_encryption_subconfigs, 2 },
+	{ "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 },
+	{ "huffman_key", "string", NULL, NULL, NULL, 0 },
+	{ "huffman_value", "string", NULL, NULL, NULL, 0 },
+	{ "internal_item_max", "int", NULL, "min=0", NULL, 0 },
+	{ "internal_key_max", "int", NULL, "min=0", NULL, 0 },
+	{ "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 },
+	{ "internal_page_max", "int",
+	    NULL, "min=512B,max=512MB",
+	    NULL, 0 },
+	{ "key_format", "format", __wt_struct_confchk, NULL, NULL, 0 },
+	{ "key_gap", "int", NULL, "min=0", NULL, 0 },
+	{ "leaf_item_max", "int", NULL, "min=0", NULL, 0 },
+	{ "leaf_key_max", "int", NULL, "min=0", NULL, 0 },
+	{ "leaf_page_max", "int",
+	    NULL, "min=512B,max=512MB",
+	    NULL, 0 },
+	{ "leaf_value_max", "int", NULL, "min=0", NULL, 0 },
+	{ "log", "category",
+	    NULL, NULL,
+	    confchk_WT_SESSION_create_log_subconfigs, 1 },
+	{ "memory_page_max", "int",
+	    NULL, "min=512B,max=10TB",
+	    NULL, 0 },
+	{ "os_cache_dirty_max", "int", NULL, "min=0", NULL, 0 },
+	{ "os_cache_max", "int", NULL, "min=0", NULL, 0 },
+	{ "prefix_compression", "boolean", NULL, NULL, NULL, 0 },
+	{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
+	{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
+	{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
+	{ "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
+	{ "value_format", "format",
+	    __wt_struct_confchk, NULL,
+	    NULL, 0 },
+	{ NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
 static const WT_CONFIG_CHECK confchk_file_meta[] = {
 	{ "allocation_size", "int",
 	    NULL, "min=512B,max=128MB",
@@ -466,6 +521,67 @@ static const WT_CONFIG_CHECK confchk_index_meta[] = {
 	{ NULL, NULL, NULL, NULL, NULL, 0 }
 };
 
+static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
+	{ "allocation_size", "int",
+	    NULL, "min=512B,max=128MB",
+	    NULL, 0 },
+	{ "app_metadata", "string", NULL, NULL, NULL, 0 },
+	{ "block_allocation", "string",
+	    NULL, "choices=[\"first\",\"best\"]",
+	    NULL, 0 },
+	{ "block_compressor", "string", NULL, NULL, NULL, 0 },
+	{ "cache_resident", "boolean", NULL, NULL, NULL, 0 },
+	{ "checksum", "string",
+	    NULL, "choices=[\"on\",\"off\",\"uncompressed\"]",
+	    NULL, 0 },
+	{ "chunks", "string", NULL, NULL, NULL, 0 },
+	{ "collator", "string", NULL, NULL, NULL, 0 },
+	{ "columns", "list", NULL, NULL, NULL, 0 },
+	{ "dictionary", "int", NULL, "min=0", NULL, 0 },
+	{ "encryption", "category",
+	    NULL, NULL,
+	    confchk_WT_SESSION_create_encryption_subconfigs, 2 },
+	{ "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 },
+	{ "huffman_key", "string", NULL, NULL, NULL, 0 },
+	{ "huffman_value", "string", NULL, NULL, NULL, 0 },
+	{ "internal_item_max", "int", NULL, "min=0", NULL, 0 },
+	{ "internal_key_max", "int", NULL, "min=0", NULL, 0 },
+	{ "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 },
+	{ "internal_page_max", "int",
+	    NULL, "min=512B,max=512MB",
+	    NULL, 0 },
+	{ "key_format", "format", __wt_struct_confchk, NULL, NULL, 0 },
+	{ "key_gap", "int", NULL, "min=0", NULL, 0 },
+	{ "last", "string", NULL, NULL, NULL, 0 },
+	{ "leaf_item_max", "int", NULL, "min=0", NULL, 0 },
+	{ "leaf_key_max", "int", NULL, "min=0", NULL, 0 },
+	{ "leaf_page_max", "int",
+	    NULL, "min=512B,max=512MB",
+	    NULL, 0 },
+	{ "leaf_value_max", "int", NULL, "min=0", NULL, 0 },
+	{ "log", "category",
+	    NULL, NULL,
+	    confchk_WT_SESSION_create_log_subconfigs, 1 },
+	{ "lsm", "category",
+	    NULL, NULL,
+	    confchk_WT_SESSION_create_lsm_subconfigs, 11 },
+	{ "memory_page_max", "int",
+	    NULL, "min=512B,max=10TB",
+	    NULL, 0 },
+	{ "old_chunks", "string", NULL, NULL, NULL, 0 },
+	{ "os_cache_dirty_max", "int", NULL, "min=0", NULL, 0 },
+	{ "os_cache_max", "int", NULL, "min=0", NULL, 0 },
+	{ "prefix_compression", "boolean", NULL, NULL, NULL, 0 },
+	{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
+	{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
+	{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
+	{ "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
+	{ "value_format", "format",
+	    __wt_struct_confchk, NULL,
+	    NULL, 0 },
+	{ NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
 static const WT_CONFIG_CHECK confchk_table_meta[] = {
 	{ "app_metadata", "string", NULL, NULL, NULL, 0 },
 	{ "colgroups", "list", NULL, NULL, NULL, 0 },
@@ -985,6 +1101,20 @@ static const WT_CONFIG_ENTRY config_entries[] = {
 	  "app_metadata=,collator=,columns=,source=,type=file",
 	  confchk_colgroup_meta, 5
 	},
+	{ "file.config",
+	  "allocation_size=4KB,app_metadata=,block_allocation=best,"
+	  "block_compressor=,cache_resident=0,checksum=uncompressed,"
+	  "collator=,columns=,dictionary=0,encryption=(keyid=,name=),"
+	  "format=btree,huffman_key=,huffman_value=,internal_item_max=0,"
+	  "internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,"
+	  "key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0,"
+	  "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=),"
+	  "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+	  "prefix_compression=0,prefix_compression_min=4,"
+	  "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75,"
+	  "value_format=u",
+	  confchk_file_config, 33
+	},
 	{ "file.meta",
 	  "allocation_size=4KB,app_metadata=,block_allocation=best,"
 	  "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=,"
@@ -1005,6 +1135,23 @@ static const WT_CONFIG_ENTRY config_entries[] = {
 	  "index_key_columns=,key_format=u,source=,type=file,value_format=u",
 	  confchk_index_meta, 10
 	},
+	{ "lsm.meta",
+	  "allocation_size=4KB,app_metadata=,block_allocation=best,"
+	  "block_compressor=,cache_resident=0,checksum=uncompressed,chunks="
+	  ",collator=,columns=,dictionary=0,encryption=(keyid=,name=),"
+	  "format=btree,huffman_key=,huffman_value=,internal_item_max=0,"
+	  "internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,"
+	  "key_format=u,key_gap=10,last=,leaf_item_max=0,leaf_key_max=0,"
+	  "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=),"
+	  "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=,"
+	  "bloom_hash_count=8,bloom_oldest=0,chunk_count_limit=0,"
+	  "chunk_max=5GB,chunk_size=10MB,merge_max=15,merge_min=0),"
+	  "memory_page_max=5MB,old_chunks=,os_cache_dirty_max=0,"
+	  "os_cache_max=0,prefix_compression=0,prefix_compression_min=4,"
+	  "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75,"
+	  "value_format=u",
+	  confchk_lsm_meta, 37
+	},
 	{ "table.meta",
 	  "app_metadata=,colgroups=,collator=,columns=,key_format=u,"
 	  "value_format=u",
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index bb67185f5c9..6d115c8fdcd 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -772,6 +772,19 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn)
 	conn->extension_api.transaction_visible = __wt_ext_transaction_visible;
 	conn->extension_api.version = wiredtiger_version;
 
+	/* Streaming pack/unpack API */
+	conn->extension_api.pack_start = __wt_ext_pack_start;
+	conn->extension_api.unpack_start = __wt_ext_unpack_start;
+	conn->extension_api.pack_close = __wt_ext_pack_close;
+	conn->extension_api.pack_item = __wt_ext_pack_item;
+	conn->extension_api.pack_int = __wt_ext_pack_int;
+	conn->extension_api.pack_str = __wt_ext_pack_str;
+	conn->extension_api.pack_uint = __wt_ext_pack_uint;
+	conn->extension_api.unpack_item = __wt_ext_unpack_item;
+	conn->extension_api.unpack_int = __wt_ext_unpack_int;
+	conn->extension_api.unpack_str = __wt_ext_unpack_str;
+	conn->extension_api.unpack_uint = __wt_ext_unpack_uint;
+
 	return (&conn->extension_api);
 }
 
@@ -1681,6 +1694,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
 		{ "fileops",		WT_VERB_FILEOPS },
 		{ "log",		WT_VERB_LOG },
 		{ "lsm",		WT_VERB_LSM },
+		{ "lsm_manager",	WT_VERB_LSM_MANAGER },
 		{ "metadata",		WT_VERB_METADATA },
 		{ "mutex",		WT_VERB_MUTEX },
 		{ "overflow",		WT_VERB_OVERFLOW },
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 1831aad5895..9a2c394e9a6 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -140,6 +140,12 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
 	WT_RET(__wt_cache_config(session, false, cfg));
 
 	/*
+	 * The lowest possible page read-generation has a special meaning, it
+	 * marks a page for forcible eviction; don't let it happen by accident.
+	 */
+	cache->read_gen = WT_READGEN_START_VALUE;
+
+	/*
 	 * The target size must be lower than the trigger size or we will never
 	 * get any work done.
 	 */
@@ -147,8 +153,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
 		WT_ERR_MSG(session, EINVAL,
 		    "eviction target must be lower than the eviction trigger");
 
-	WT_ERR(__wt_cond_alloc(session,
-	    "cache eviction server", false, &cache->evict_cond));
+	WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server",
+	    false, 10000, WT_MILLION, &cache->evict_cond));
 	WT_ERR(__wt_cond_alloc(session,
 	    "eviction waiters", false, &cache->evict_waiter_cond));
 	WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
@@ -246,7 +252,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
 		    " bytes dirty and %" PRIu64 " pages dirty",
 		    cache->bytes_dirty, cache->pages_dirty);
 
-	WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
+	WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond));
 	WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond));
 	__wt_spin_destroy(session, &cache->evict_lock);
 	__wt_spin_destroy(session, &cache->evict_walk_lock);
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 2fab08e3afa..5019ab59fe3 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -129,16 +129,19 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
 	WT_BTREE *btree;
 	WT_DATA_HANDLE *dhandle;
 	WT_DECL_RET;
-	bool evict_reset, marked_dead, no_schema_lock;
+	bool marked_dead, no_schema_lock;
 
 	btree = S2BT(session);
 	bm = btree->bm;
 	dhandle = session->dhandle;
-	evict_reset = marked_dead = false;
+	marked_dead = false;
 
 	if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
 		return (0);
 
+	/* Turn off eviction. */
+	WT_RET(__wt_evict_file_exclusive_on(session));
+
 	/*
 	 * If we don't already have the schema lock, make it an error to try
 	 * to acquire it.  The problem is that we are holding an exclusive
@@ -160,13 +163,6 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
 	__wt_spin_lock(session, &dhandle->close_lock);
 
 	/*
-	 * Ensure we aren't racing with the eviction server; inside the close
-	 * lock so threads won't race setting/clearing the tree's "no eviction"
-	 * flag.
-	 */
-	WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
-
-	/*
 	 * The close can fail if an update cannot be written, return the EBUSY
 	 * error to our caller for eventual retry.
 	 *
@@ -204,13 +200,13 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
 	    F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
 	    !F_ISSET(dhandle, WT_DHANDLE_OPEN));
 
-err:	if (evict_reset)
-		__wt_evict_file_exclusive_off(session);
-	__wt_spin_unlock(session, &dhandle->close_lock);
+err:	__wt_spin_unlock(session, &dhandle->close_lock);
 
 	if (no_schema_lock)
 		F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);
 
+	__wt_evict_file_exclusive_off(session);
+
 	return (ret);
 }
 
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 5999cf20b3b..757d69bf240 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -142,6 +142,8 @@ __logmgr_config(
 	}
 
 	WT_RET(__logmgr_sync_cfg(session, cfg));
+	if (conn->log_cond != NULL)
+		WT_RET(__wt_cond_auto_signal(session, conn->log_cond));
 	return (0);
 }
 
@@ -468,7 +470,7 @@ __log_file_server(void *arg)
 				locked = false;
 				__wt_spin_unlock(session, &log->log_sync_lock);
 			} else {
-				WT_ERR(__wt_cond_signal(
+				WT_ERR(__wt_cond_auto_signal(
 				    session, conn->log_wrlsn_cond));
 				/*
 				 * We do not want to wait potentially a second
@@ -667,31 +669,54 @@ __log_wrlsn_server(void *arg)
 {
 	WT_CONNECTION_IMPL *conn;
 	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LSN prev;
 	WT_SESSION_IMPL *session;
 	int yield;
+	bool did_work;
 
 	session = arg;
 	conn = S2C(session);
+	log = conn->log;
 	yield = 0;
+	WT_INIT_LSN(&prev);
+	did_work = false;
 	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
 		/*
-		 * Write out any log record buffers.
+		 * Write out any log record buffers if anything was done
+		 * since last time.  Only call the function to walk the
+		 * slots if the system is not idle.  On an idle system
+		 * the alloc_lsn will not advance and the written lsn will
+		 * match the alloc_lsn.
 		 */
-		WT_ERR(__wt_log_wrlsn(session, &yield));
+		if (__wt_log_cmp(&prev, &log->alloc_lsn) != 0 ||
+		    __wt_log_cmp(&log->write_lsn, &log->alloc_lsn) != 0)
+			WT_ERR(__wt_log_wrlsn(session, &yield));
+		else
+			WT_STAT_FAST_CONN_INCR(session, log_write_lsn_skip);
+		prev = log->alloc_lsn;
+		if (yield == 0)
+			did_work = true;
+		else
+			did_work = false;
 		/*
 		 * If __wt_log_wrlsn did work we want to yield instead of sleep.
 		 */
 		if (yield++ < WT_THOUSAND)
 			__wt_yield();
 		else
-			WT_ERR(__wt_cond_wait(
-			    session, conn->log_wrlsn_cond, 10000));
+			/*
+			 * Send in false because if we did any work we would
+			 * not be on this path.
+			 */
+			WT_ERR(__wt_cond_auto_wait(
+			    session, conn->log_wrlsn_cond, did_work));
 	}
 	/*
 	 * On close we need to do this one more time because there could
 	 * be straggling log writes that need to be written.
 	 */
-	WT_ERR(__wt_log_force_write(session, 1));
+	WT_ERR(__wt_log_force_write(session, 1, NULL));
 	WT_ERR(__wt_log_wrlsn(session, NULL));
 	if (0) {
 err:		__wt_err(session, ret, "log wrlsn server error");
@@ -706,12 +731,13 @@ err:		__wt_err(session, ret, "log wrlsn server error");
 static WT_THREAD_RET
 __log_server(void *arg)
 {
+	struct timespec start, now;
 	WT_CONNECTION_IMPL *conn;
 	WT_DECL_RET;
 	WT_LOG *log;
 	WT_SESSION_IMPL *session;
-	int freq_per_sec;
-	bool locked, signalled;
+	uint64_t timediff;
+	bool did_work, locked, signalled;
 
 	session = arg;
 	conn = S2C(session);
@@ -719,11 +745,10 @@ __log_server(void *arg)
 	locked = signalled = false;
 
 	/*
-	 * Set this to the number of times per second we want to force out the
-	 * log slot buffer.
+	 * Set this to the number of milliseconds we want to run archive and
+	 * pre-allocation.  Start it so that we run on the first time through.
 	 */
-#define	WT_FORCE_PER_SECOND	20
-	freq_per_sec = WT_FORCE_PER_SECOND;
+	timediff = WT_THOUSAND;
 
 	/*
 	 * The log server thread does a variety of work.  It forces out any
@@ -736,6 +761,7 @@ __log_server(void *arg)
 	 * don't want log records sitting in the buffer over the time it
 	 * takes to sync out an earlier file.
 	 */
+	did_work = true;
 	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
 		/*
 		 * Slots depend on future activity.  Force out buffered
@@ -744,15 +770,14 @@ __log_server(void *arg)
 		 * and a buffer may need to wait for the write_lsn to advance
 		 * in the case of a synchronous buffer.  We end up with a hang.
 		 */
-		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));
+		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0, &did_work));
 
 		/*
 		 * We don't want to archive or pre-allocate files as often as
 		 * we want to force out log buffers.  Only do it once per second
 		 * or if the condition was signalled.
 		 */
-		if (--freq_per_sec <= 0 || signalled) {
-			freq_per_sec = WT_FORCE_PER_SECOND;
+		if (timediff >= WT_THOUSAND || signalled) {
 
 			/*
 			 * Perform log pre-allocation.
@@ -793,8 +818,12 @@ __log_server(void *arg)
 		}
 
 		/* Wait until the next event. */
-		WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
-		    WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
+
+		WT_ERR(__wt_epoch(session, &start));
+		WT_ERR(__wt_cond_auto_wait_signal(session, conn->log_cond,
+		    did_work, &signalled));
+		WT_ERR(__wt_epoch(session, &now));
+		timediff = WT_TIMEDIFF_MS(now, start);
 	}
 
 	if (0) {
@@ -906,8 +935,9 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
 	 */
 	WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server",
 	    false, session_flags, &conn->log_wrlsn_session));
-	WT_RET(__wt_cond_alloc(conn->log_wrlsn_session,
-	    "log write lsn server", false, &conn->log_wrlsn_cond));
+	WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session,
+	    "log write lsn server", false, 10000, WT_MILLION,
+	    &conn->log_wrlsn_cond));
 	WT_RET(__wt_thread_create(conn->log_wrlsn_session,
 	    &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
 	conn->log_wrlsn_tid_set = true;
@@ -921,13 +951,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
 	if (conn->log_session != NULL) {
 		WT_ASSERT(session, conn->log_cond != NULL);
 		WT_ASSERT(session, conn->log_tid_set == true);
-		WT_RET(__wt_cond_signal(session, conn->log_cond));
+		WT_RET(__wt_cond_auto_signal(session, conn->log_cond));
 	} else {
 		/* The log server gets its own session. */
 		WT_RET(__wt_open_internal_session(conn,
 		    "log-server", false, session_flags, &conn->log_session));
-		WT_RET(__wt_cond_alloc(conn->log_session,
-		    "log server", false, &conn->log_cond));
+		WT_RET(__wt_cond_auto_alloc(conn->log_session,
+		    "log server", false, 50000, WT_MILLION, &conn->log_cond));
 
 		/*
 		 * Start the thread.
@@ -963,7 +993,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
 		return (0);
 	}
 	if (conn->log_tid_set) {
-		WT_TRET(__wt_cond_signal(session, conn->log_cond));
+		WT_TRET(__wt_cond_auto_signal(session, conn->log_cond));
 		WT_TRET(__wt_thread_join(session, conn->log_tid));
 		conn->log_tid_set = false;
 	}
@@ -978,7 +1008,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
 		conn->log_file_session = NULL;
 	}
 	if (conn->log_wrlsn_tid_set) {
-		WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond));
+		WT_TRET(__wt_cond_auto_signal(session, conn->log_wrlsn_cond));
 		WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
 		conn->log_wrlsn_tid_set = false;
 	}
@@ -999,9 +1029,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
 	}
 
 	/* Destroy the condition variables now that all threads are stopped */
-	WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
+	WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond));
 	WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
-	WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+	WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond));
 
 	WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
 	WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 58577b4587d..aff422654d7 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -210,10 +210,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
 			/*
 			 * If hash arrays were allocated, free them now.
 			 */
-			if (s->dhhash != NULL)
-				__wt_free(session, s->dhhash);
-			if (s->tablehash != NULL)
-				__wt_free(session, s->tablehash);
+			__wt_free(session, s->dhhash);
+			__wt_free(session, s->tablehash);
 			__wt_free(session, s->hazard);
 		}
 
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index b097a8c08aa..2fb0c464a76 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -140,8 +140,9 @@ __wt_curbackup_open(WT_SESSION_IMPL *session,
 	 * Start the backup and fill in the cursor's list.  Acquire the schema
 	 * lock, we need a consistent view when creating a copy.
 	 */
-	WT_WITH_SCHEMA_LOCK(session, ret,
-	    ret = __backup_start(session, cb, cfg));
+	WT_WITH_CHECKPOINT_LOCK(session, ret,
+	    WT_WITH_SCHEMA_LOCK(session, ret,
+		ret = __backup_start(session, cb, cfg)));
 	WT_ERR(ret);
 
 	/* __wt_cursor_init is last so we don't have to clean up on error. */
diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c
index fa6dd5c32f7..38a83217933 100644
--- a/src/cursor/cur_join.c
+++ b/src/cursor/cur_join.c
@@ -8,6 +8,9 @@
 
 #include "wt_internal.h"
 
+static int __curjoin_insert_endpoint(WT_SESSION_IMPL *,
+    WT_CURSOR_JOIN_ENTRY *, u_int, WT_CURSOR_JOIN_ENDPOINT **);
+
 /*
  * __curjoin_entry_iter_init --
  *	Initialize an iteration for the index managed by a join entry.
@@ -17,42 +20,46 @@ static int
 __curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
     WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
 {
-	WT_CURSOR *newcur;
 	WT_CURSOR *to_dup;
 	WT_DECL_RET;
 	const char *raw_cfg[] = { WT_CONFIG_BASE(
 	    session, WT_SESSION_open_cursor), "raw", NULL };
 	const char *def_cfg[] = { WT_CONFIG_BASE(
 	    session, WT_SESSION_open_cursor), NULL };
-	const char *uri, **config;
-	char *uribuf;
+	const char *urimain, **config;
+	char *mainbuf, *uri;
 	WT_CURSOR_JOIN_ITER *iter;
 	size_t size;
 
 	iter = NULL;
-	uribuf = NULL;
+	mainbuf = uri = NULL;
 	to_dup = entry->ends[0].cursor;
 
-	uri = to_dup->uri;
 	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
 		config = &raw_cfg[0];
 	else
 		config = &def_cfg[0];
 
+	size = strlen(to_dup->internal_uri) + 3;
+	WT_ERR(__wt_calloc(session, size, 1, &uri));
+	snprintf(uri, size, "%s()", to_dup->internal_uri);
+	urimain = cjoin->table->name;
 	if (cjoin->projection != NULL) {
-		size = strlen(uri) + strlen(cjoin->projection) + 1;
-		WT_ERR(__wt_calloc(session, size, 1, &uribuf));
-		snprintf(uribuf, size, "%s%s", uri, cjoin->projection);
-		uri = uribuf;
+		size = strlen(urimain) + strlen(cjoin->projection) + 1;
+		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
+		snprintf(mainbuf, size, "%s%s", urimain, cjoin->projection);
+		urimain = mainbuf;
 	}
-	WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config,
-	    &newcur));
-	WT_ERR(__wt_cursor_dup_position(to_dup, newcur));
+
 	WT_ERR(__wt_calloc_one(session, &iter));
+	WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config,
+	    &iter->cursor));
+	WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
+	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
+	    &iter->main));
 	iter->cjoin = cjoin;
 	iter->session = session;
 	iter->entry = entry;
-	iter->cursor = newcur;
 	iter->positioned = false;
 	iter->isequal = (entry->ends_next == 1 &&
 	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);
@@ -61,7 +68,8 @@ __curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 	if (0) {
 err:		__wt_free(session, iter);
 	}
-	__wt_free(session, uribuf);
+	__wt_free(session, mainbuf);
+	__wt_free(session, uri);
 	return (ret);
 }
 
@@ -87,45 +95,80 @@ __curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf,
 }
 
 /*
- * __curjoin_entry_iter_next --
- *	Get the next item in an iteration.
+ * __curjoin_split_key --
+ *	Copy the primary key from a cursor (either main table or index)
+ *	to another cursor.  When copying from an index file, the index
+ *	key is also returned.
  *
  */
 static int
-__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey,
-    uint64_t *rp)
+__curjoin_split_key(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+    WT_ITEM *idxkey, WT_CURSOR *tocur, WT_CURSOR *fromcur,
+    const char *repack_fmt, bool isindex)
 {
 	WT_CURSOR *firstcg_cur;
-	WT_CURSOR_JOIN *cjoin;
-	WT_SESSION_IMPL *session;
-	uint64_t r;
+	WT_CURSOR_INDEX *cindex;
+	WT_ITEM *keyp;
+	const uint8_t *p;
+
+	if (isindex) {
+		cindex = ((WT_CURSOR_INDEX *)fromcur);
+		/*
+		 * Repack tells us where the index key ends; advance past
+		 * that to get where the raw primary key starts.
+		 */
+		WT_RET(__wt_struct_repack(session, cindex->child->key_format,
+		    repack_fmt != NULL ? repack_fmt : cindex->iface.key_format,
+		    &cindex->child->key, idxkey));
+		WT_ASSERT(session, cindex->child->key.size > idxkey->size);
+		tocur->key.data = (uint8_t *)idxkey->data + idxkey->size;
+		tocur->key.size = cindex->child->key.size - idxkey->size;
+		if (WT_CURSOR_RECNO(tocur)) {
+			p = (const uint8_t *)tocur->key.data;
+			WT_RET(__wt_vunpack_uint(&p, tocur->key.size,
+			    &tocur->recno));
+		} else
+			tocur->recno = 0;
+	} else {
+		firstcg_cur = ((WT_CURSOR_TABLE *)fromcur)->cg_cursors[0];
+		keyp = &firstcg_cur->key;
+		if (WT_CURSOR_RECNO(tocur)) {
+			WT_ASSERT(session, keyp->size == sizeof(uint64_t));
+			tocur->recno = *(uint64_t *)keyp->data;
+			WT_RET(__curjoin_pack_recno(session, tocur->recno,
+			    cjoin->recno_buf, sizeof(cjoin->recno_buf),
+			    &tocur->key));
+		} else {
+			WT_ITEM_SET(tocur->key, *keyp);
+			tocur->recno = 0;
+		}
+		idxkey->data = NULL;
+		idxkey->size = 0;
+	}
+	return (0);
+}
 
+/*
+ * __curjoin_entry_iter_next --
+ *	Get the next item in an iteration.
+ *
+ */
+static int
+__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_CURSOR *cursor)
+{
 	if (iter->positioned)
 		WT_RET(iter->cursor->next(iter->cursor));
 	else
 		iter->positioned = true;
 
-	session = iter->session;
-	cjoin = iter->cjoin;
-
 	/*
 	 * Set our key to the primary key, we'll also need this
 	 * to check membership.
 	 */
-	if (iter->entry->index != NULL)
-		firstcg_cur = ((WT_CURSOR_INDEX *)iter->cursor)->cg_cursors[0];
-	else
-		firstcg_cur = ((WT_CURSOR_TABLE *)iter->cursor)->cg_cursors[0];
-	if (WT_CURSOR_RECNO(&cjoin->iface)) {
-		r = *(uint64_t *)firstcg_cur->key.data;
-		WT_RET(__curjoin_pack_recno(session, r, cjoin->recno_buf,
-		    sizeof(cjoin->recno_buf), primkey));
-		*rp = r;
-	} else {
-		WT_ITEM_SET(*primkey, firstcg_cur->key);
-		*rp = 0;
-	}
-	iter->curkey = primkey;
+	WT_RET(__curjoin_split_key(iter->session, iter->cjoin, &iter->idxkey,
+	    cursor, iter->cursor, iter->entry->repack_format,
+	    iter->entry->index != NULL));
+	iter->curkey = &cursor->key;
 	iter->entry->stats.actual_count++;
 	iter->entry->stats.accesses++;
 	return (0);
@@ -141,6 +184,7 @@ __curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter)
 {
 	if (iter->positioned) {
 		WT_RET(iter->cursor->reset(iter->cursor));
+		WT_RET(iter->main->reset(iter->main));
 		WT_RET(__wt_cursor_dup_position(
 		    iter->cjoin->entries[0].ends[0].cursor, iter->cursor));
 		iter->positioned = false;
@@ -172,6 +216,8 @@ __curjoin_entry_iter_close(WT_CURSOR_JOIN_ITER *iter)
 
 	if (iter->cursor != NULL)
 		WT_TRET(iter->cursor->close(iter->cursor));
+	if (iter->main != NULL)
+		WT_TRET(iter->main->close(iter->main));
 	__wt_free(iter->session, iter);
 
 	return (ret);
@@ -227,10 +273,8 @@ __curjoin_get_value(WT_CURSOR *cursor, ...)
 	    !__curjoin_entry_iter_ready(iter))
 		WT_ERR_MSG(session, EINVAL,
 		    "join cursor must be advanced with next()");
-	if (iter->entry->index != NULL)
-		WT_ERR(__wt_curindex_get_valuev(iter->cursor, ap));
-	else
-		WT_ERR(__wt_curtable_get_valuev(iter->cursor, ap));
+
+	WT_ERR(__wt_curtable_get_valuev(iter->main, ap));
 
 err:	va_end(ap);
 	API_END_RET(session, ret);
@@ -246,41 +290,26 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 {
 	WT_COLLATOR *collator;
 	WT_CURSOR *c;
-	WT_CURSOR_INDEX *cindex;
 	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
 	WT_DECL_RET;
 	WT_DECL_ITEM(uribuf);
 	WT_ITEM curkey, curvalue;
-	WT_TABLE *maintable;
 	const char *raw_cfg[] = { WT_CONFIG_BASE(
 	    session, WT_SESSION_open_cursor), "raw", NULL };
-	const char *mainkey_str, *p;
-	size_t mainkey_len, size;
-	u_int i;
+	const char *uri;
+	size_t size;
 	int cmp, skip;
 
 	c = NULL;
 	skip = 0;
 
-	if (entry->index != NULL) {
+	if (entry->index != NULL)
 		/*
-		 * Open a cursor having a projection of the keys of the
-		 * index we're comparing against.  Open it raw, we're
-		 * going to compare it to the raw keys of the
-		 * reference cursors.
+		 * Open the raw index.  We're avoiding any references
+		 * to the main table, they may be expensive.
 		 */
-		maintable = ((WT_CURSOR_TABLE *)entry->main)->table;
-		mainkey_str = maintable->colconf.str + 1;
-		for (p = mainkey_str, i = 0;
-		     p != NULL && i < maintable->nkey_columns; i++)
-			p = strchr(p + 1, ',');
-		WT_ASSERT(session, p != 0);
-		mainkey_len = WT_PTRDIFF(p, mainkey_str);
-		size = strlen(entry->index->name) + mainkey_len + 3;
-		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
-		WT_ERR(__wt_buf_fmt(session, uribuf, "%s(%.*s)",
-		    entry->index->name, (int)mainkey_len, mainkey_str));
-	} else {
+		uri = entry->index->source;
+	else {
 		/*
 		 * For joins on the main table, we just need the primary
 		 * key for comparison, we don't need any values.
@@ -289,32 +318,38 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
 		WT_ERR(__wt_buf_fmt(session, uribuf, "%s()",
 		    cjoin->table->name));
+		uri = uribuf->data;
 	}
-	WT_ERR(__wt_open_cursor(
-	    session, uribuf->data, &cjoin->iface, raw_cfg, &c));
+	WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c));
 
 	/* Initially position the cursor if necessary. */
 	endmax = &entry->ends[entry->ends_next];
-	if ((end = &entry->ends[0]) < endmax &&
-	    F_ISSET(end, WT_CURJOIN_END_GE)) {
-		WT_ERR(__wt_cursor_dup_position(end->cursor, c));
-		if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE)
-			skip = 1;
+	if ((end = &entry->ends[0]) < endmax) {
+		if (F_ISSET(end, WT_CURJOIN_END_GT) ||
+		    WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) {
+			WT_ERR(__wt_cursor_dup_position(end->cursor, c));
+			if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE)
+				skip = 1;
+		} else if (F_ISSET(end, WT_CURJOIN_END_LT)) {
+			if ((ret = c->next(c)) == WT_NOTFOUND)
+				goto done;
+			WT_ERR(ret);
+		} else
+			WT_ERR(__wt_illegal_value(session, NULL));
 	}
 	collator = (entry->index == NULL) ? NULL : entry->index->collator;
 	while (ret == 0) {
 		WT_ERR(c->get_key(c, &curkey));
 		if (entry->index != NULL) {
-			cindex = (WT_CURSOR_INDEX *)c;
 			/*
 			 * Repack so it's comparable to the
 			 * reference endpoints.
 			 */
 			WT_ERR(__wt_struct_repack(session,
-			    cindex->child->key_format,
+			    c->key_format,
 			    (entry->repack_format != NULL ?
-			    entry->repack_format : cindex->iface.key_format),
-			    &cindex->child->key, &curkey));
+			    entry->repack_format : entry->index->idxkey_format),
+			    &c->key, &curkey));
 		}
 		for (end = &entry->ends[skip]; end < endmax; end++) {
 			WT_ERR(__wt_compare(session, collator, &curkey,
@@ -335,8 +370,12 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 					goto done;
 			}
 		}
-		if (entry->index != NULL)
-			WT_ERR(c->get_value(c, &curvalue));
+		if (entry->index != NULL) {
+			curvalue.data =
+			    (unsigned char *)curkey.data + curkey.size;
+			WT_ASSERT(session, c->key.size > curkey.size);
+			curvalue.size = c->key.size - curkey.size;
+		}
 		else
 			WT_ERR(c->get_key(c, &curvalue));
 		WT_ERR(__wt_bloom_insert(bloom, &curvalue));
@@ -401,8 +440,13 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
 {
 	WT_BLOOM *bloom;
 	WT_DECL_RET;
+	WT_CURSOR *origcur;
 	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
 	WT_CURSOR_JOIN_ENDPOINT *end;
+	const char *def_cfg[] = { WT_CONFIG_BASE(
+	    session, WT_SESSION_open_cursor), NULL };
+	const char *raw_cfg[] = { WT_CONFIG_BASE(
+	    session, WT_SESSION_open_cursor), "raw", NULL };
 	uint32_t f, k;
 
 	if (cjoin->entries_next == 0)
@@ -411,9 +455,27 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
 		    "cursors");
 
 	je = &cjoin->entries[0];
+	jeend = &cjoin->entries[cjoin->entries_next];
+
+	/*
+	 * For a single compare=le endpoint in the first iterated entry,
+	 * construct a companion compare=ge endpoint that will actually
+	 * be iterated.
+	 */
+	if (((je = cjoin->entries) != jeend) &&
+	    je->ends_next == 1 && F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
+		origcur = je->ends[0].cursor;
+		WT_RET(__curjoin_insert_endpoint(session, je, 0, &end));
+		WT_RET(__wt_open_cursor(session, origcur->uri,
+		    (WT_CURSOR *)cjoin,
+		    F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
+		    &end->cursor));
+		WT_RET(end->cursor->next(end->cursor));
+		end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
+		    WT_CURJOIN_END_OWN_CURSOR;
+	}
 	WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));
 
-	jeend = &cjoin->entries[cjoin->entries_next];
 	for (je = cjoin->entries; je < jeend; je++) {
 		__wt_stat_join_init_single(&je->stats);
 		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
@@ -431,6 +493,10 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
 			F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
 
 		if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
+			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+			       WT_RET_MSG(session, EINVAL,
+				   "join cursors with Bloom filters cannot be "
+				   "used with read-uncommitted isolation");
 			if (je->bloom == NULL) {
 				/*
 				 * Look for compatible filters to be shared,
@@ -604,6 +670,8 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 	WT_ITEM *key, v;
 	bool bloom_found;
 
+	if (skip_left && entry->ends_next == 1)
+		return (0);	/* no checks to make */
 	key = cjoin->iter->curkey;
 	entry->stats.accesses++;
 	bloom_found = false;
@@ -626,20 +694,30 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 		bloom_found = true;
 	}
 	if (entry->index != NULL) {
-		memset(&v, 0, sizeof(v));  /* Keep lint quiet. */
-		c = entry->main;
-		c->set_key(c, key);
-		if ((ret = c->search(c)) == 0)
-			ret = c->get_value(c, &v);
-		else if (ret == WT_NOTFOUND)
-			WT_ERR_MSG(session, WT_ERROR,
-			    "main table for join is missing entry.");
-		WT_TRET(c->reset(c));
-		WT_ERR(ret);
+		/*
+		 * If this entry is used by the iterator, then we already
+		 * have the index key, and we won't have to do any extraction
+		 * either.
+		 */
+		if (entry == cjoin->iter->entry)
+			WT_ITEM_SET(v, cjoin->iter->idxkey);
+		else {
+			memset(&v, 0, sizeof(v));  /* Keep lint quiet. */
+			c = entry->main;
+			c->set_key(c, key);
+			if ((ret = c->search(c)) == 0)
+				ret = c->get_value(c, &v);
+			else if (ret == WT_NOTFOUND)
+				WT_ERR_MSG(session, WT_ERROR,
+				    "main table for join is missing entry");
+			WT_TRET(c->reset(c));
+			WT_ERR(ret);
+		}
 	} else
-		v = *key;
+		WT_ITEM_SET(v, *key);
 
-	if ((idx = entry->index) != NULL && idx->extractor != NULL) {
+	if ((idx = entry->index) != NULL && idx->extractor != NULL &&
+	    entry != cjoin->iter->entry) {
 		WT_CLEAR(extract_cursor);
 		extract_cursor.iface = iface;
 		extract_cursor.iface.session = &session->iface;
@@ -667,7 +745,9 @@ err:		if (ret == WT_NOTFOUND && bloom_found)
 static int
 __curjoin_next(WT_CURSOR *cursor)
 {
+	WT_CURSOR *c;
 	WT_CURSOR_JOIN *cjoin;
+	WT_CURSOR_JOIN_ITER *iter;
 	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 	bool skip_left;
@@ -683,9 +763,11 @@ __curjoin_next(WT_CURSOR *cursor)
 	if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
 		WT_ERR(__curjoin_init_iter(session, cjoin));
 
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+	iter = cjoin->iter;
+
 nextkey:
-	if ((ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key,
-	    &cursor->recno)) == 0) {
+	if ((ret = __curjoin_entry_iter_next(iter, cursor)) == 0) {
 		F_SET(cursor, WT_CURSTD_KEY_EXT);
 
 		/*
@@ -702,13 +784,26 @@ nextkey:
 				 * If this is compare=eq on our outer iterator,
 				 * and we've moved past it, we're done.
 				 */
-				if (cjoin->iter->isequal && i == 0)
+				if (iter->isequal && i == 0)
 					break;
 				goto nextkey;
 			}
 			skip_left = false;
 			WT_ERR(ret);
 		}
+	} else if (ret != WT_NOTFOUND)
+		WT_ERR(ret);
+
+	if (ret == 0) {
+		/*
+		 * Position the 'main' cursor, this will be used to
+		 * retrieve values from the cursor join.
+		 */
+		c = iter->main;
+		c->set_key(c, iter->curkey);
+		if ((ret = c->search(c)) != 0)
+			WT_ERR(c->search(c));
+		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
 	}
 
 	if (0) {
@@ -772,8 +867,11 @@ __curjoin_close(WT_CURSOR *cursor)
 		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
 			WT_TRET(__wt_bloom_close(entry->bloom));
 		for (end = &entry->ends[0];
-		     end < &entry->ends[entry->ends_next]; end++)
+		     end < &entry->ends[entry->ends_next]; end++) {
 			F_CLR(end->cursor, WT_CURSTD_JOINED);
+			if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR))
+				WT_TRET(end->cursor->close(end->cursor));
+		}
 		__wt_free(session, entry->ends);
 		__wt_free(session, entry->repack_format);
 	}
@@ -879,7 +977,7 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
     uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
 {
 	WT_CURSOR_INDEX *cindex;
-	WT_CURSOR_JOIN_ENDPOINT *end, *newend;
+	WT_CURSOR_JOIN_ENDPOINT *end;
 	WT_CURSOR_JOIN_ENTRY *entry;
 	WT_DECL_RET;
 	bool hasins, needbloom, range_eq;
@@ -1000,17 +1098,10 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 		entry->bloom_hash_count =
 		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
 	}
-	WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
-	    entry->ends_next + 1, &entry->ends));
-	if (!hasins)
-		ins = entry->ends_next;
-	newend = &entry->ends[ins];
-	memmove(newend + 1, newend,
-	    (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
-	memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
-	entry->ends_next++;
-	newend->cursor = ref_cursor;
-	F_SET(newend, range);
+	WT_ERR(__curjoin_insert_endpoint(session, entry,
+	    hasins ? ins : entry->ends_next, &end));
+	end->cursor = ref_cursor;
+	F_SET(end, range);
 
 	/* Open the main file with a projection of the indexed columns. */
 	if (entry->main == NULL && idx != NULL) {
@@ -1049,7 +1140,28 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
 		    cindex->iface.key_format);
 	}
 
-err:	if (main_uri != NULL)
-		__wt_free(session, main_uri);
+err:	__wt_free(session, main_uri);
 	return (ret);
 }
+
+/*
+ * __curjoin_insert_endpoint --
+ *	Insert a new entry into the endpoint array for the join entry.
+ */
+static int
+__curjoin_insert_endpoint(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
+    u_int pos, WT_CURSOR_JOIN_ENDPOINT **newendp)
+{
+	WT_CURSOR_JOIN_ENDPOINT *newend;
+
+	WT_RET(__wt_realloc_def(session, &entry->ends_allocated,
+	    entry->ends_next + 1, &entry->ends));
+	newend = &entry->ends[pos];
+	memmove(newend + 1, newend,
+	    (entry->ends_next - pos) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
+	memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
+	entry->ends_next++;
+	*newendp = newend;
+
+	return (0);
+}
diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c
index 47436ac7237..0a13803da5d 100644
--- a/src/cursor/cur_log.c
+++ b/src/cursor/cur_log.c
@@ -397,7 +397,7 @@ __wt_curlog_open(WT_SESSION_IMPL *session,
 	 * The user may be trying to read a log record they just wrote.
 	 * Log records may be buffered, so force out any now.
 	 */
-	WT_ERR(__wt_log_force_write(session, 1));
+	WT_ERR(__wt_log_force_write(session, 1, NULL));
 
 	/* Log cursors block archiving. */
 	WT_ERR(__wt_readlock(session, log->log_archive_lock));
diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c
index df2cc3f546e..3d702e2ea8c 100644
--- a/src/cursor/cur_metadata.c
+++ b/src/cursor/cur_metadata.c
@@ -31,6 +31,58 @@
 } while (0)
 
 /*
+ * __wt_schema_create_final --
+ *	Create a single configuration line from a set of configuration strings,
+ * including all of the defaults declared for a session.create, and stripping
+ * any configuration strings that don't belong in a session.create. Here for
+ * the wt dump command utility, which reads a set of configuration strings and
+ * needs to add in the defaults and then collapse them into a single string for
+ * a subsequent load.
+ */
+int
+__wt_schema_create_final(
+    WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret)
+{
+	WT_DECL_RET;
+	u_int i;
+	const char **cfg;
+
+	/*
+	 * Count the entries in the original,
+	 * Allocate a copy with the defaults as the first entry,
+	 * Collapse the whole thing into a single configuration string (which
+	 * also strips any entries that don't appear in the first entry).
+	 */
+	for (i = 0; cfg_arg[i] != NULL; ++i)
+		;
+	WT_RET(__wt_calloc_def(session, i + 2, &cfg));
+	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_create);
+	for (i = 0; cfg_arg[i] != NULL; ++i)
+		cfg[i + 1] = cfg_arg[i];
+	cfg[i + 1] = NULL;
+
+	ret = __wt_config_collapse(session, cfg, value_ret);
+
+	__wt_free(session, cfg);
+	return (ret);
+}
+
+/*
+ * __schema_create_strip --
+ *	Discard any configuration information from a schema entry that is not
+ * applicable to a session.create call. Here for the metadata:create URI.
+ */
+static int
+__schema_create_strip(
+    WT_SESSION_IMPL *session, const char *value, char **value_ret)
+{
+	const char *cfg[] =
+	    { WT_CONFIG_BASE(session, WT_SESSION_create), value, NULL };
+
+	return (__wt_config_collapse(session, cfg, value_ret));
+}
+
+/*
  * __curmetadata_setkv --
  *	Copy key/value into the public cursor, stripping internal metadata for
  *	"create-only" cursors.
@@ -49,8 +101,7 @@ __curmetadata_setkv(WT_CURSOR_METADATA *mdc, WT_CURSOR *fc)
 	c->key.data = fc->key.data;
 	c->key.size = fc->key.size;
 	if (F_ISSET(mdc, WT_MDC_CREATEONLY)) {
-		WT_RET(__wt_schema_create_strip(
-		    session, fc->value.data, NULL, &value));
+		WT_RET(__schema_create_strip(session, fc->value.data, &value));
 		ret = __wt_buf_set(
 		    session, &c->value, value, strlen(value) + 1);
 		__wt_free(session, value);
@@ -92,8 +143,7 @@ __curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
 	WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value));
 
 	if (F_ISSET(mdc, WT_MDC_CREATEONLY)) {
-		ret = __wt_schema_create_strip(
-		    session, value, NULL, &stripped);
+		ret = __schema_create_strip(session, value, &stripped);
 		__wt_free(session, value);
 		WT_RET(ret);
 		value = stripped;
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index 34e64b34ccb..f7a8f5fc866 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -207,6 +207,8 @@ __curstat_next(WT_CURSOR *cursor)
 	if (cst->notpositioned) {
 		cst->notpositioned = false;
 		cst->key = WT_STAT_KEY_MIN(cst);
+		if (cst->next_set != NULL)
+			WT_ERR((*cst->next_set)(session, cst, true, true));
 	} else if (cst->key < WT_STAT_KEY_MAX(cst))
 		++cst->key;
 	else if (cst->next_set != NULL)
@@ -249,6 +251,8 @@ __curstat_prev(WT_CURSOR *cursor)
 	if (cst->notpositioned) {
 		cst->notpositioned = false;
 		cst->key = WT_STAT_KEY_MAX(cst);
+		if (cst->next_set != NULL)
+			WT_ERR((*cst->next_set)(session, cst, false, true));
 	} else if (cst->key > WT_STAT_KEY_MIN(cst))
 		--cst->key;
 	else if (cst->next_set != NULL)
@@ -558,9 +562,6 @@ __wt_curstat_init(WT_SESSION_IMPL *session,
 	else
 		return (__wt_bad_object_type(session, uri));
 
-	if (cst->next_set != NULL)
-		WT_RET((*cst->next_set)(session, cst, false, true));
-
 	return (0);
 }
 
diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox
index e2b376d5e3f..0f5c56d25ce 100644
--- a/src/docs/command-line.dox
+++ b/src/docs/command-line.dox
@@ -41,7 +41,7 @@ by default and commands that only read data will not run recovery.
 Perform a backup of a database or set of data sources.
 
 The \c backup command performs a backup of the database, copying the
-database files to a \c specified directory, which can be subsequently
+underlying files to a \c specified directory, which can be subsequently
 opened as a WiredTiger database.  See @ref backup for more information,
 and @ref file_permissions for specifics on the copied file permissions.
 
@@ -58,10 +58,10 @@ the named data sources.
 
 <hr>
 @section util_compact wt compact
-Compact a table or file.
+Compact a table.
 
-The \c compact command attempts to rewrite the specified table or file
-to consume less disk space.
+The \c compact command attempts to rewrite the specified table to
+consume less disk space.
 
 @subsection util_compact_synopsis Synopsis
 <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code>
@@ -71,7 +71,7 @@ The \c compact command has no command-specific options.
 
 <hr>
 @section util_create wt create
-Create a table or file.
+Create a table.
 
 The \c create command creates the specified \c uri with the specified
 configuration.  It is equivalent to a call to WT_SESSION::create with
@@ -88,7 +88,7 @@ Include a configuration string to be passed to WT_SESSION::create.
 
 <hr>
 @section util_drop wt drop
-Drop a table or file.
+Drop a table.
 
 The \c drop command drops the specified \c uri.  It is equivalent to a
 call to WT_SESSION::drop with the "force" configuration argument.
@@ -136,10 +136,10 @@ printable characters unencoded).
 
 <hr>
 @section util_list wt list
-List the tables and files in the database.
+List the tables in the database.
 
-By default, the \c list command prints out the tables and files stored in
-the database.  If a URI is specified as an argument, only information about
+By default, the \c list command prints out the tables stored in the
+database.  If a URI is specified as an argument, only information about
 that data source is printed.
 
 @subsection util_list_synopsis Synopsis
@@ -158,16 +158,16 @@ value is printed.
 
 <hr>
 @section util_load wt load
-Load a table or file from dump output.
+Load a table from dump output.
 
 The \c load command reads the standard input for data and loads it into
-a table or file, creating the table or file if it does not yet exist.
-The data should be the format produced by the \c dump command; see
-@ref dump_formats for details.
+a table, creating the table if it does not yet exist.  The data should
+be the format produced by the \c dump command; see @ref dump_formats for
+details.
 
-By default, if the table or file already exists, data in the file or
-table will be overwritten by the new data (use the \c -n option to
-make an attempt to overwrite existing data return an error).
+By default, if the table already exists, data in the table will be
+overwritten by the new data (use the \c -n option to make an attempt to
+overwrite existing data return an error).
 
 @subsection util_load_synopsis Synopsis
 <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code>
@@ -182,8 +182,8 @@ number keys.  The \c -a option is only applicable when loading into a
 column store.
 
 @par <code>-f</code>
-By default, the \c load command reads from the standard input; the \c
--f option reads the input from the specified file.
+By default, the \c load command reads from the standard input; the \c -f
+option reads the input from the specified file.
 
 @par <code>-j</code>
 Load input in the JSON (<a href="http://www.json.org">JavaScript Object
@@ -196,7 +196,7 @@ load command to fail if there's an attempt to overwrite already existing
 data.
 
 @par <code>-r</code>
-By default, the \c load command uses the table or file name taken from the
+By default, the \c load command uses the table name taken from the
 input; the \c -r option renames the data source.
 
 Additionally, \c uri and \c configuration pairs may be specified to the
@@ -227,24 +227,23 @@ table:xxx block_allocation=first table:xxx prefix_compress=false
 
 <hr>
 @section util_loadtext wt loadtext
-Load text into a table or file.
+Load text into a table.
 
 The \c loadtext command reads the standard input for text and loads it
-into a table or file.  The input data should be printable characters,
-with newline delimiters for each key or value.
+into a table.  The input data should be printable characters, with
+newline delimiters for each key or value.
 
-The \c loadtext command does not create the file if it does not yet
+The \c loadtext command does not create the table if it does not yet
 exist.
 
-In the case of inserting values into a column-store table or file, each
-value is appended to the table or file; in the case of inserting values
-into a row-store table or file, lines are handled in pairs, where the
-first line is the key and the second line is the value.  If the
-row-store table or file already exists, data in the table or file will
-be overwritten by the new data.
+In the case of inserting values into a column-store table, each value
+is appended to the table; in the case of inserting values into a
+row-store table, lines are handled in pairs, where the first line is the
+key and the second line is the value.  If the row-store table already
+exists, data in the table will be overwritten by the new data.
 
 @subsection util_loadtext_synopsis Synopsis
-<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input] uri</code>
 
 @subsection util_loadtext_options Options
 The following are command-specific options for the \c loadtext command:
@@ -275,7 +274,7 @@ to the default string format.
 
 <hr>
 @section util_read wt read
-Read records from a table or file.
+Read records from a table.
 
 The \c read command prints out the records associated with the specified
 keys from the specified data source.  The data source must be configured
@@ -291,9 +290,9 @@ The \c read command has no command-specific options.
 
 <hr>
 @section util_rename wt rename
-Rename a table or file.
+Rename a table.
 
-The \c rename command renames the specified table or file.
+The \c rename command renames the specified table.
 
 @subsection util_rename_synopsis Synopsis
 <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code>
@@ -303,11 +302,11 @@ The \c rename command has no command-specific options.
 
 <hr>
 @section util_salvage wt salvage
-Recover data from a corrupted file.
+Recover data from a corrupted table.
 
 The \c salvage command salvages the specified data source, discarding any
-data that cannot be recovered.  Underlying files are re-written in
-place, overwriting the original file contents.
+data that cannot be recovered.  Underlying files are re-written in place,
+overwriting the original file contents.
 
 @subsection util_salvage_synopsis Synopsis
 <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code>
@@ -316,9 +315,9 @@ place, overwriting the original file contents.
 The following are command-specific options for the \c salvage command:
 
 @par <code>-F</code>
-By default, salvage will refuse to salvage files that fail basic tests
-(for example, files that don't appear to be in a WiredTiger format).
-The \c -F option forces the salvage of the file, regardless.
+By default, salvage will refuse to salvage tables that fail basic tests
+(for example, tables that don't appear to be in a WiredTiger format).
+The \c -F option forces the salvage of the table, regardless.
 
 <hr>
 @section util_stat wt stat
@@ -339,11 +338,11 @@ Include only "fast" statistics in the output (equivalent to passing
 
 <hr>
 @section util_upgrade wt upgrade
-Upgrade a table or file.
+Upgrade a table.
 
-The \c upgrade command upgrades the specified table or file, exiting
-success if the data source is up-to-date, and failure if the data source
-cannot be upgraded.
+The \c upgrade command upgrades the specified table, exiting success if
+the data source is up-to-date, and failure if the data source cannot be
+upgraded.
 
 @subsection util_upgrade_synopsis Synopsis
 <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code>
@@ -353,11 +352,10 @@ The \c upgrade command has no command-specific options.
 
 <hr>
 @section util_verify wt verify
-Check the structural integrity of a table or file.
+Check the structural integrity of a table.
 
-The \c verify command verifies the specified table or file, exiting
-success if the data source is correct, and failure if the data source is
-corrupted.
+The \c verify command verifies the specified table, exiting success if
+the data source is correct, and failure if the data source is corrupted.
 
 @subsection util_verify_synopsis Synopsis
 <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code>
@@ -367,7 +365,7 @@ The \c verify command has no command-specific options.
 
 <hr>
 @section util_write wt write
-Write records to a table or file.
+Write records to a table.
 
 The \c write command stores records into the specified data source.
 The data source must be configured with string or record number keys and
diff --git a/src/docs/data-sources.dox b/src/docs/data-sources.dox
index d09d1cbc1b8..7f1879e0ffe 100644
--- a/src/docs/data-sources.dox
+++ b/src/docs/data-sources.dox
@@ -38,7 +38,7 @@ cursor types that give access to data managed by WiredTiger:
 	key=<code>string</code>\, value=<code>string</code>\,<br>
 	see @ref metadata for details}
 @row{<tt>statistics:[\<data source URI\>]</tt>,
-	database or data source statistics cursor,
+	database, data source or join statistics cursor,
 	key=<code>int id</code>\,<br>
 	value=<code>(string description\,
 	string value\, uint64_t value)</code>\,<br>
@@ -106,7 +106,9 @@ WiredTiger database as well as statistics for individual data sources.
 The statistics are at two levels: per-database and per-individual data
 source.  Database-wide statistics are retrieved with the \c "statistics:"
 URI; individual data source statistics are available by specifying
-\c "statistics:<data source URI>".
+\c "statistics:<data source URI>".  Additionally, statistics about a
+join cursor can be retrieved by specifying \c "statistics:join" and
+supplying the join cursor as an argument in the WT_SESSION::open_cursor call.
 
 The statistic key is an integer from the list of keys in
 @ref_single statistics_keys "Statistics Keys".  Statistics cursors return
@@ -127,7 +129,11 @@ The following is an example of printing statistics about a table:
 
 @snippet ex_stat.c statistics table function
 
-Both examples can use a common display routine that iterates through the
+The following is an example of printing statistics about a join cursor:
+
+@snippet ex_stat.c statistics join cursor function
+
+These three examples can use a common display routine that iterates through the
 statistics until the cursor returns the end of the list.
 
 @snippet ex_stat.c statistics display function
diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox
index 453da34c51a..0a29e351e4e 100644
--- a/src/docs/statistics.dox
+++ b/src/docs/statistics.dox
@@ -79,6 +79,15 @@ or logged:
 
 @snippet ex_all.c Statistics clear configuration
 
+The following example opens a statistics cursor on an open join cursor:
+
+@snippet ex_schema.c Statistics cursor join cursor
+
+The statistics gathered will be organized by reference cursors participating
+in the join (see WT_SESSION::join); the URI of each reference cursor appears
+as a prefix in the description field returned as a value by the statistics
+cursor.
+
 @section statistics_log Statistics logging
 
 WiredTiger will optionally log database statistics into a file when the
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index df0a22ba0fe..8b3d61e4c19 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -2,6 +2,14 @@
 
 @section version_271 Upgrading to Version 2.7.1
 <dl>
+<dt>LSM metadata</dt>
+<dd>
+There is a change to the format of LSM metadata in this release to fix bugs
+in dump / load of tables of type LSM.  Tables created with the old LSM metadata
+format will be upgraded automatically, but once updated to the new version
+<b>are no longer compatible with older releases of WiredTiger</b>.
+</dd>
+
 <dt>Column-store bulk-load cursors</dt>
 <dd>
 Historically, bulk-load of a column-store object ignored any key set in the
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 641864a8baa..ca98b1bd62a 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -18,13 +18,12 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
 	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_REF *next_ref, *ref;
-	bool evict_reset;
 
 	/*
 	 * We need exclusive access to the file -- disable ordinary eviction
 	 * and drain any blocks already queued.
 	 */
-	WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
+	WT_RET(__wt_evict_file_exclusive_on(session));
 
 	/* Make sure the oldest transaction ID is up-to-date. */
 	__wt_txn_update_oldest(session, true);
@@ -98,8 +97,7 @@ err:		/* On error, clear any left-over tree walk. */
 			    session, next_ref, WT_READ_NO_EVICT));
 	}
 
-	if (evict_reset)
-		__wt_evict_file_exclusive_off(session);
+	__wt_evict_file_exclusive_off(session);
 
 	return (ret);
 }
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 884c08a02df..50a00787f35 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -159,7 +159,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session)
 		    bytes_max / WT_MEGABYTE));
 	}
 
-	return (__wt_cond_signal(session, cache->evict_cond));
+	return (__wt_cond_auto_signal(session, cache->evict_cond));
 }
 
 /*
@@ -175,8 +175,8 @@ __evict_server(void *arg)
 	WT_SESSION_IMPL *session;
 #ifdef HAVE_DIAGNOSTIC
 	struct timespec now, stuck_ts;
-	uint64_t pages_evicted = 0;
 #endif
+	uint64_t pages_evicted = 0;
 	u_int spins;
 
 	session = arg;
@@ -219,11 +219,11 @@ __evict_server(void *arg)
 
 			/* Next time we wake up, reverse the sweep direction. */
 			cache->flags ^= WT_CACHE_WALK_REVERSE;
-#ifdef HAVE_DIAGNOSTIC
 			pages_evicted = 0;
 		} else if (pages_evicted != cache->pages_evict) {
-			WT_ERR(__wt_epoch(session, &stuck_ts));
 			pages_evicted = cache->pages_evict;
+#ifdef HAVE_DIAGNOSTIC
+			WT_ERR(__wt_epoch(session, &stuck_ts));
 		} else {
 			/* After being stuck for 5 minutes, give up. */
 			WT_ERR(__wt_epoch(session, &now));
@@ -238,7 +238,8 @@ __evict_server(void *arg)
 
 		WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
 		/* Don't rely on signals: check periodically. */
-		WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
+		WT_ERR(__wt_cond_auto_wait(
+		    session, cache->evict_cond, pages_evicted != 0));
 		WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking"));
 	}
 
@@ -720,12 +721,32 @@ __evict_clear_walks(WT_SESSION_IMPL *session)
 }
 
 /*
- * __evict_request_walk_clear --
+ * __evict_clear_all_walks --
+ *	Clear the eviction walk points for all files a session is waiting on.
+ */
+static int
+__evict_clear_all_walks(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+
+	TAILQ_FOREACH(dhandle, &conn->dhqh, q)
+		if (WT_PREFIX_MATCH(dhandle->name, "file:"))
+			WT_WITH_DHANDLE(session,
+			    dhandle, WT_TRET(__evict_clear_walk(session)));
+	return (ret);
+}
+
+/*
+ * __evict_request_clear_walk --
  *	Request that the eviction server clear the tree's current eviction
  *	point.
  */
 static int
-__evict_request_walk_clear(WT_SESSION_IMPL *session)
+__evict_request_clear_walk(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
 	WT_CACHE *cache;
@@ -753,32 +774,12 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session)
 }
 
 /*
- * __evict_clear_all_walks --
- *	Clear the eviction walk points for all files a session is waiting on.
- */
-static int
-__evict_clear_all_walks(WT_SESSION_IMPL *session)
-{
-	WT_CONNECTION_IMPL *conn;
-	WT_DATA_HANDLE *dhandle;
-	WT_DECL_RET;
-
-	conn = S2C(session);
-
-	TAILQ_FOREACH(dhandle, &conn->dhqh, q)
-		if (WT_PREFIX_MATCH(dhandle->name, "file:"))
-			WT_WITH_DHANDLE(session,
-			    dhandle, WT_TRET(__evict_clear_walk(session)));
-	return (ret);
-}
-
-/*
  * __wt_evict_file_exclusive_on --
  *	Get exclusive eviction access to a file and discard any of the file's
  *	blocks queued for eviction.
  */
 int
-__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
+__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
 	WT_CACHE *cache;
@@ -786,40 +787,39 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
 	WT_EVICT_ENTRY *evict;
 	u_int i, elem;
 
-	*evict_resetp = false;
-
 	btree = S2BT(session);
 	cache = S2C(session)->cache;
 
-	/* If the file was never evictable, there's no work to do. */
-	if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
-		return (0);
-
 	/*
-	 * Hold the walk lock to set the "no eviction" flag: no new pages from
-	 * the file will be queued for eviction after this point.
+	 * Hold the walk lock to set the no-eviction flag.
+	 *
+	 * The no-eviction flag can be set permanently, in which case we never
+	 * increment the no-eviction count.
 	 */
 	__wt_spin_lock(session, &cache->evict_walk_lock);
-	if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
-		F_SET(btree, WT_BTREE_NO_EVICTION);
-		*evict_resetp = true;
+	if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+		if (btree->evict_disabled != 0)
+			++btree->evict_disabled;
+		__wt_spin_unlock(session, &cache->evict_walk_lock);
+		return (0);
 	}
-	__wt_spin_unlock(session, &cache->evict_walk_lock);
+	++btree->evict_disabled;
 
-	/* If some other operation has disabled eviction, we're done. */
-	if (!*evict_resetp)
-		return (0);
+	/*
+	 * Ensure no new pages from the file will be queued for eviction after
+	 * this point.
+	 */
+	F_SET(btree, WT_BTREE_NO_EVICTION);
+	WT_FULL_BARRIER();
 
 	/* Clear any existing LRU eviction walk for the file. */
-	WT_ERR(__evict_request_walk_clear(session));
-
-	/* Hold the evict lock to remove any queued pages from this file. */
-	__wt_spin_lock(session, &cache->evict_lock);
+	WT_ERR(__evict_request_clear_walk(session));
 
 	/*
 	 * The eviction candidate list might reference pages from the file,
-	 * clear it.
+	 * clear it. Hold the evict lock to remove queued pages from a file.
 	 */
+	__wt_spin_lock(session, &cache->evict_lock);
 	elem = cache->evict_max;
 	for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
 		if (evict->btree == btree)
@@ -833,10 +833,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
 	while (btree->evict_busy > 0)
 		__wt_yield();
 
-	return (0);
-
-err:	F_CLR(btree, WT_BTREE_NO_EVICTION);
-	*evict_resetp = false;
+	if (0) {
+err:		--btree->evict_disabled;
+		F_CLR(btree, WT_BTREE_NO_EVICTION);
+	}
+	__wt_spin_unlock(session, &cache->evict_walk_lock);
 	return (ret);
 }
 
@@ -848,8 +849,10 @@ void
 __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
+	WT_CACHE *cache;
 
 	btree = S2BT(session);
+	cache = S2C(session)->cache;
 
 	/*
 	 * We have seen subtle bugs with multiple threads racing to turn
@@ -857,10 +860,17 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
 	 */
 	WT_DIAGNOSTIC_YIELD;
 
-	WT_ASSERT(session, btree->evict_ref == NULL &&
-	    F_ISSET(btree, WT_BTREE_NO_EVICTION));
+	WT_ASSERT(session,
+	    btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION));
 
-	F_CLR(btree, WT_BTREE_NO_EVICTION);
+	/*
+	 * The no-eviction flag can be set permanently, in which case we never
+	 * increment the no-eviction count.
+	 */
+	__wt_spin_lock(session, &cache->evict_walk_lock);
+	if (btree->evict_disabled > 0 && --btree->evict_disabled == 0)
+		F_CLR(btree, WT_BTREE_NO_EVICTION);
+	__wt_spin_unlock(session, &cache->evict_walk_lock);
 }
 
 /*
@@ -890,7 +900,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
 {
 	WT_CACHE *cache;
 	WT_DECL_RET;
-	uint64_t cutoff;
+	uint64_t cutoff, read_gen_oldest;
 	uint32_t candidates, entries;
 
 	cache = S2C(session)->cache;
@@ -931,34 +941,62 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
 		return (0);
 	}
 
-	WT_ASSERT(session, cache->evict_queue[0].ref != NULL);
-
-	/* Track the oldest read generation we have in the queue. */
-	cache->read_gen_oldest = cache->evict_queue[0].ref->page->read_gen;
-
+	/* Decide how many of the candidates we're going to try and evict. */
 	if (FLD_ISSET(cache->state,
-	    WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
+	    WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) {
 		/*
 		 * Take all candidates if we only gathered pages with an oldest
 		 * read generation set.
 		 */
 		cache->evict_candidates = entries;
-	else {
-		/* Find the bottom 25% of read generations. */
-		cutoff = (3 * __evict_read_gen(&cache->evict_queue[0]) +
-		    __evict_read_gen(&cache->evict_queue[entries - 1])) / 4;
+	} else {
 		/*
-		 * Don't take less than 10% or more than 50% of entries,
-		 * regardless.  That said, if there is only one entry, which is
-		 * normal when populating an empty file, don't exclude it.
+		 * Find the oldest read generation we have in the queue, used
+		 * to set the initial value for pages read into the system.
+		 * The queue is sorted, find the first "normal" generation.
 		 */
-		for (candidates = 1 + entries / 10;
-		    candidates < entries / 2;
-		    candidates++)
-			if (__evict_read_gen(
-			    &cache->evict_queue[candidates]) > cutoff)
+		read_gen_oldest = WT_READGEN_OLDEST;
+		for (candidates = 0; candidates < entries; ++candidates) {
+			read_gen_oldest =
+			    __evict_read_gen(&cache->evict_queue[candidates]);
+			if (read_gen_oldest != WT_READGEN_OLDEST)
 				break;
-		cache->evict_candidates = candidates;
+		}
+
+		/*
+		 * Take all candidates if we only gathered pages with an oldest
+		 * read generation set.
+		 *
+		 * We normally never take more than 50% of the entries; if 50%
+		 * of the entries were at the oldest read generation, take them.
+		 */
+		if (read_gen_oldest == WT_READGEN_OLDEST)
+			cache->evict_candidates = entries;
+		else if (candidates >= entries / 2)
+			cache->evict_candidates = candidates;
+		else {
+			/* Save the calculated oldest generation. */
+			cache->read_gen_oldest = read_gen_oldest;
+
+			/* Find the bottom 25% of read generations. */
+			cutoff =
+			    (3 * read_gen_oldest + __evict_read_gen(
+			    &cache->evict_queue[entries - 1])) / 4;
+
+			/*
+			 * Don't take less than 10% or more than 50% of entries,
+			 * regardless. That said, if there is only one entry,
+			 * which is normal when populating an empty file, don't
+			 * exclude it.
+			 */
+			for (candidates = 1 + entries / 10;
+			    candidates < entries / 2;
+			    candidates++)
+				if (__evict_read_gen(
+				    &cache->evict_queue[candidates]) > cutoff)
+					break;
+			cache->evict_candidates = candidates;
+		}
 	}
 
 	cache->evict_current = cache->evict_queue;
@@ -1127,23 +1165,27 @@ retry:	while (slot < max_entries && ret == 0) {
 		__wt_spin_unlock(session, &conn->dhandle_lock);
 		dhandle_locked = false;
 
-		__wt_spin_lock(session, &cache->evict_walk_lock);
-
 		/*
-		 * Re-check the "no eviction" flag -- it is used to enforce
-		 * exclusive access when a handle is being closed.
+		 * Re-check the "no eviction" flag, used to enforce exclusive
+		 * access when a handle is being closed. If not set, remember
+		 * the file to visit first, next loop.
+		 *
+		 * Only try to acquire the lock and simply continue if we fail;
+		 * the lock is held while the thread turning off eviction clears
+		 * the tree's current eviction point, and part of the process is
+		 * waiting on this thread to acknowledge that action.
 		 */
-		if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
-			/* Remember the file to visit first, next loop. */
-			cache->evict_file_next = dhandle;
-
-			WT_WITH_DHANDLE(session, dhandle,
-			    ret = __evict_walk_file(session, &slot));
-			WT_ASSERT(session, session->split_gen == 0);
+		if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) &&
+		    !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
+			if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+				cache->evict_file_next = dhandle;
+				WT_WITH_DHANDLE(session, dhandle,
+				    ret = __evict_walk_file(session, &slot));
+				WT_ASSERT(session, session->split_gen == 0);
+			}
+			__wt_spin_unlock(session, &cache->evict_walk_lock);
 		}
 
-		__wt_spin_unlock(session, &cache->evict_walk_lock);
-
 		/*
 		 * If we didn't find any candidates in the file, skip it next
 		 * time.
@@ -1286,6 +1328,18 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 		if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
 			continue;
 
+		/*
+		 * It's possible (but unlikely) to visit a page without a read
+		 * generation, if we race with the read instantiating the page.
+		 * Ignore those pages, but set the page's read generation here
+		 * to ensure a bug doesn't somehow leave a page without a read
+		 * generation.
+		 */
+		if (page->read_gen == WT_READGEN_NOTSET) {
+			__wt_cache_read_gen_new(session, page);
+			continue;
+		}
+
 		/* Pages we no longer need (clean or dirty), are found money. */
 		if (__wt_page_is_empty(page) ||
 		    F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
@@ -1311,13 +1365,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 		    internal_pages >= (int)(evict - start) / 2)
 			continue;
 
-		/*
-		 * If this page has never been considered for eviction, set its
-		 * read generation to somewhere in the middle of the LRU list.
-		 */
-		if (page->read_gen == WT_READGEN_NOTSET)
-			page->read_gen = __wt_cache_read_gen_new(session);
-
 fast:		/* If the page can't be evicted, give up. */
 		if (!__wt_page_can_evict(session, ref, NULL))
 			continue;
@@ -1477,7 +1524,6 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
 {
 	WT_BTREE *btree;
 	WT_DECL_RET;
-	WT_PAGE *page;
 	WT_REF *ref;
 
 	WT_RET(__evict_get_ref(session, is_server, &btree, &ref));
@@ -1506,9 +1552,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
 	 * the page and some other thread may have evicted it by the time we
 	 * look at it.
 	 */
-	page = ref->page;
-	if (page->read_gen != WT_READGEN_OLDEST)
-		page->read_gen = __wt_cache_read_gen_bump(session);
+	__wt_cache_read_gen_bump(session, ref->page);
 
 	WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false));
 
diff --git a/src/include/btmem.h b/src/include/btmem.h
index ee495c52fc8..7cdf2bef43a 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -598,9 +598,14 @@ struct __wt_page {
 	 * read generation is incremented by the eviction server each time it
 	 * becomes active.  To avoid incrementing a page's read generation too
 	 * frequently, it is set to a future point.
+	 *
+	 * Because low read generation values have special meaning, and there
+	 * are places where we manipulate the value, use an initial value well
+	 * outside of the special range.
 	 */
 #define	WT_READGEN_NOTSET	0
 #define	WT_READGEN_OLDEST	1
+#define	WT_READGEN_START_VALUE	100
 #define	WT_READGEN_STEP		100
 	uint64_t read_gen;
 
diff --git a/src/include/btree.h b/src/include/btree.h
index 703de0f2fc6..fd921677751 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -129,10 +129,11 @@ struct __wt_btree {
 	uint64_t rec_max_txn;		/* Maximum txn seen (clean trees) */
 	uint64_t write_gen;		/* Write generation */
 
-	WT_REF  *evict_ref;		/* Eviction thread's location */
-	uint64_t evict_priority;	/* Relative priority of cached pages */
-	u_int    evict_walk_period;	/* Skip this many LRU walks */
-	u_int    evict_walk_skips;	/* Number of walks skipped */
+	WT_REF	   *evict_ref;		/* Eviction thread's location */
+	uint64_t    evict_priority;	/* Relative priority of cached pages */
+	u_int	    evict_walk_period;	/* Skip this many LRU walks */
+	u_int	    evict_walk_skips;	/* Number of walks skipped */
+	u_int	    evict_disabled;	/* Eviction disabled count */
 	volatile uint32_t evict_busy;	/* Count of threads in eviction */
 
 	enum {
diff --git a/src/include/cache.h b/src/include/cache.h
index a3961d6043e..9184a2fe6ed 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -75,9 +75,9 @@ struct __wt_cache {
 	/*
 	 * Read information.
 	 */
-	uint64_t   read_gen;		/* Page read generation (LRU) */
-	uint64_t   read_gen_oldest;	/* The oldest read generation that
-					   eviction knows about */
+	uint64_t read_gen;		/* Current page read generation */
+	uint64_t read_gen_oldest;	/* Oldest read generation the eviction
+					 * server saw in its last queue load */
 
 	/*
 	 * Eviction thread information.
diff --git a/src/include/cache.i b/src/include/cache.i
index ee13eee84c5..8cf7555e716 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -28,34 +28,43 @@ __wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
 
 /*
  * __wt_cache_read_gen_bump --
- *      Get the read generation to keep a page in memory.
+ *      Update the page's read generation.
  */
-static inline uint64_t
-__wt_cache_read_gen_bump(WT_SESSION_IMPL *session)
+static inline void
+__wt_cache_read_gen_bump(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
+	/* Ignore pages set for forcible eviction. */
+	if (page->read_gen == WT_READGEN_OLDEST)
+		return;
+
+	/* Ignore pages already in the future. */
+	if (page->read_gen > __wt_cache_read_gen(session))
+		return;
+
 	/*
-	 * We return read-generations from the future (where "the future" is
-	 * measured by increments of the global read generation).  The reason
-	 * is because when acquiring a new hazard pointer for a page, we can
-	 * check its read generation, and if the read generation isn't less
-	 * than the current global generation, we don't bother updating the
-	 * page.  In other words, the goal is to avoid some number of updates
-	 * immediately after each update we have to make.
+	 * We set read-generations in the future (where "the future" is measured
+	 * by increments of the global read generation).  The reason is because
+	 * when acquiring a new hazard pointer for a page, we can check its read
+	 * generation, and if the read generation isn't less than the current
+	 * global generation, we don't bother updating the page.  In other
+	 * words, the goal is to avoid some number of updates immediately after
+	 * each update we have to make.
 	 */
-	return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+	page->read_gen = __wt_cache_read_gen(session) + WT_READGEN_STEP;
 }
 
 /*
  * __wt_cache_read_gen_new --
  *      Get the read generation for a new page in memory.
  */
-static inline uint64_t
-__wt_cache_read_gen_new(WT_SESSION_IMPL *session)
+static inline void
+__wt_cache_read_gen_new(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_CACHE *cache;
 
 	cache = S2C(session)->cache;
-	return (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2;
+	page->read_gen =
+	    (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2;
 }
 
 /*
@@ -119,12 +128,11 @@ __wt_session_can_wait(WT_SESSION_IMPL *session)
 		return (0);
 
 	/*
-	 * LSM sets the no-eviction flag when holding the LSM tree lock,
-	 * in that case, or when holding the schema lock, we don't want to
-	 * highjack the thread for eviction.
+	 * LSM sets the no-eviction flag when holding the LSM tree lock, in that
+	 * case, or when holding the schema lock, we don't want to highjack the
+	 * thread for eviction.
 	 */
-	if (F_ISSET(session,
-	    WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA))
+	if (F_ISSET(session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA))
 		return (0);
 
 	return (1);
@@ -224,11 +232,11 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp)
 		return (0);
 
 	/*
-	 * Threads operating on trees that cannot be evicted are ignored,
-	 * mostly because they're not contributing to the problem.
+	 * Threads operating on cache-resident trees are ignored because they're
+	 * not contributing to the problem.
 	 */
 	btree = S2BT_SAFE(session);
-	if (btree != NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION))
+	if (btree != NULL && F_ISSET(btree, WT_BTREE_IN_MEMORY))
 		return (0);
 
 	/* Check if eviction is needed. */
diff --git a/src/include/config.h b/src/include/config.h
index e63db0e76cf..48a255134af 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -85,13 +85,15 @@ struct __wt_config_parser_impl {
 #define	WT_CONFIG_ENTRY_WT_SESSION_upgrade		33
 #define	WT_CONFIG_ENTRY_WT_SESSION_verify		34
 #define	WT_CONFIG_ENTRY_colgroup_meta			35
-#define	WT_CONFIG_ENTRY_file_meta			36
-#define	WT_CONFIG_ENTRY_index_meta			37
-#define	WT_CONFIG_ENTRY_table_meta			38
-#define	WT_CONFIG_ENTRY_wiredtiger_open			39
-#define	WT_CONFIG_ENTRY_wiredtiger_open_all		40
-#define	WT_CONFIG_ENTRY_wiredtiger_open_basecfg		41
-#define	WT_CONFIG_ENTRY_wiredtiger_open_usercfg		42
+#define	WT_CONFIG_ENTRY_file_config			36
+#define	WT_CONFIG_ENTRY_file_meta			37
+#define	WT_CONFIG_ENTRY_index_meta			38
+#define	WT_CONFIG_ENTRY_lsm_meta			39
+#define	WT_CONFIG_ENTRY_table_meta			40
+#define	WT_CONFIG_ENTRY_wiredtiger_open			41
+#define	WT_CONFIG_ENTRY_wiredtiger_open_all		42
+#define	WT_CONFIG_ENTRY_wiredtiger_open_basecfg		43
+#define	WT_CONFIG_ENTRY_wiredtiger_open_usercfg		44
 /*
  * configuration section: END
  * DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 48db8b9ec23..4b35daf106e 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -213,10 +213,11 @@ struct __wt_cursor_btree {
 #define	WT_CBT_NO_TXN   	0x10	/* Non-transactional cursor
 					   (e.g. on a checkpoint) */
 #define	WT_CBT_SEARCH_SMALLEST	0x20	/* Row-store: small-key insert list */
+#define	WT_CBT_VAR_ONPAGE_MATCH	0x40	/* Var-store: on-page recno match */
 
 #define	WT_CBT_POSITION_MASK		/* Flags associated with position */ \
 	(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
-	WT_CBT_SEARCH_SMALLEST)
+	WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH)
 
 	uint8_t flags;
 };
@@ -287,8 +288,10 @@ struct __wt_cursor_join_iter {
 	WT_SESSION_IMPL		*session;
 	WT_CURSOR_JOIN		*cjoin;
 	WT_CURSOR_JOIN_ENTRY	*entry;
-	WT_CURSOR		*cursor;
-	WT_ITEM			*curkey;
+	WT_CURSOR		*cursor;	/* has null projection */
+	WT_CURSOR		*main;		/* main table with projection */
+	WT_ITEM			*curkey;	/* primary key */
+	WT_ITEM			 idxkey;
 	bool			 positioned;
 	bool			 isequal;	/* advancing means we're done */
 };
@@ -303,6 +306,7 @@ struct __wt_cursor_join_endpoint {
 #define	WT_CURJOIN_END_GT	0x04		/* include values >  cursor */
 #define	WT_CURJOIN_END_GE	(WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)
 #define	WT_CURJOIN_END_LE	(WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ)
+#define	WT_CURJOIN_END_OWN_CURSOR 0x08		/* must close cursor */
 	uint8_t			 flags;		/* range for this endpoint */
 };
 #define	WT_CURJOIN_END_RANGE(endp)					\
diff --git a/src/include/extern.h b/src/include/extern.h
index 55b0b8cd7ff..48c52d4a109 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -168,7 +168,7 @@ extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
 extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
 extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
 extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
-extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op);
+extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op);
 extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok);
@@ -297,6 +297,7 @@ extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const c
 extern ssize_t __wt_json_strlen(const char *src, size_t srclen);
 extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen);
 extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_schema_create_final( WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret);
 extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
 extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
 extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst);
@@ -341,7 +342,7 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
 extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
 extern int __wt_evict_create(WT_SESSION_IMPL *session);
 extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
-extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp);
+extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session);
 extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
 extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full);
 extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v);
@@ -364,7 +365,7 @@ extern int __wt_log_open(WT_SESSION_IMPL *session);
 extern int __wt_log_close(WT_SESSION_IMPL *session);
 extern int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep);
 extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie);
-extern int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry);
+extern int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work);
 extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
 extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
 extern int __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags);
@@ -485,7 +486,9 @@ extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **va
 extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value);
 extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
 extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
+extern int __wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp);
 extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_realloc_noclear(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
 extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
 extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp);
 extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg);
@@ -558,6 +561,17 @@ extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char
 extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...);
 extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...);
 extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf);
+extern int __wt_ext_pack_start(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp);
+extern int __wt_ext_unpack_start(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp);
+extern int __wt_ext_pack_close(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, size_t *usedp);
+extern int __wt_ext_pack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item);
+extern int __wt_ext_pack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t i);
+extern int __wt_ext_pack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char *s);
+extern int __wt_ext_pack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t u);
+extern int __wt_ext_unpack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item);
+extern int __wt_ext_unpack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t *ip);
+extern int __wt_ext_unpack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char **sp);
+extern int __wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t *up);
 extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell);
 extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
 extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size);
@@ -576,7 +590,6 @@ extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
 extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted);
 extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
 extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted);
-extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret);
 extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep);
 extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf);
 extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf);
@@ -637,6 +650,11 @@ extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *ch
 extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
 extern uint32_t __wt_cksum(const void *chunk, size_t len);
 extern void __wt_cksum_init(void);
+extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp);
+extern int __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
+extern int __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled);
+extern int __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress);
+extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
 extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out);
 extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out);
 extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep);
@@ -736,7 +754,7 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
 extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_txn_global_destroy(WT_SESSION_IMPL *session);
 extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len);
-extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]);
diff --git a/src/include/gcc.h b/src/include/gcc.h
index 6ccc0de3c03..ce6afdd6e9c 100644
--- a/src/include/gcc.h
+++ b/src/include/gcc.h
@@ -6,6 +6,7 @@
  * See the file LICENSE for redistribution information.
  */
 
+#define	WT_PTRDIFFT_FMT	"td"			/* ptrdiff_t format string */
 #define	WT_SIZET_FMT	"zu"			/* size_t format string */
 
 /* Add GCC-specific attributes to types and function declarations. */
diff --git a/src/include/lint.h b/src/include/lint.h
index f8b17022968..1b64186cbab 100644
--- a/src/include/lint.h
+++ b/src/include/lint.h
@@ -6,6 +6,7 @@
  * See the file LICENSE for redistribution information.
  */
 
+#define	WT_PTRDIFFT_FMT	"td"			/* ptrdiff_t format string */
 #define	WT_SIZET_FMT	"zu"			/* size_t format string */
 
 #define	WT_COMPILER_TYPE_ALIGN(x)
diff --git a/src/include/lsm.h b/src/include/lsm.h
index 7cb3ccc895d..444073087df 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -179,7 +179,7 @@ struct __wt_lsm_tree {
 	int collator_owned;
 
 	uint32_t refcnt;		/* Number of users of the tree */
-	uint8_t exclusive;		/* Tree is locked exclusively */
+	WT_SESSION_IMPL *excl_session;	/* Session has exclusive lock */
 
 #define	LSM_TREE_MAX_QUEUE	100
 	uint32_t queue_ref;
@@ -215,7 +215,7 @@ struct __wt_lsm_tree {
 	size_t chunk_alloc;		/* Space allocated for chunks */
 	uint32_t nchunks;		/* Number of active chunks */
 	uint32_t last;			/* Last allocated ID */
-	int modified;			/* Have there been updates? */
+	bool modified;			/* Have there been updates? */
 
 	WT_LSM_CHUNK **old_chunks;	/* Array of old LSM chunks */
 	size_t old_alloc;		/* Space allocated for old chunks */
@@ -242,13 +242,18 @@ struct __wt_lsm_tree {
 	int64_t lsm_lookup_no_bloom;
 	int64_t lsm_merge_throttle;
 
-#define	WT_LSM_TREE_ACTIVE		0x01	/* Workers are active */
-#define	WT_LSM_TREE_AGGRESSIVE_TIMER	0x02	/* Timer for merge aggression */
-#define	WT_LSM_TREE_COMPACTING		0x04	/* Tree being compacted */
-#define	WT_LSM_TREE_MERGES		0x08	/* Tree should run merges */
-#define	WT_LSM_TREE_NEED_SWITCH		0x10	/* New chunk needs creating */
-#define	WT_LSM_TREE_OPEN		0x20	/* The tree is open */
-#define	WT_LSM_TREE_THROTTLE		0x40	/* Throttle updates */
+	/*
+	 * The tree is open for business. This used to be a flag, but the flag
+	 * was susceptible to races.
+	 */
+	bool active;
+
+#define	WT_LSM_TREE_AGGRESSIVE_TIMER	0x01	/* Timer for merge aggression */
+#define	WT_LSM_TREE_COMPACTING		0x02	/* Tree being compacted */
+#define	WT_LSM_TREE_MERGES		0x04	/* Tree should run merges */
+#define	WT_LSM_TREE_NEED_SWITCH		0x08	/* New chunk needs creating */
+#define	WT_LSM_TREE_OPEN		0x10	/* The tree is open */
+#define	WT_LSM_TREE_THROTTLE		0x20	/* Throttle updates */
 	uint32_t flags;
 };
 
diff --git a/src/include/misc.h b/src/include/misc.h
index 4d3ca758dc7..07d52c61eac 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -198,7 +198,7 @@
 
 /* Check if a string matches a prefix. */
 #define	WT_PREFIX_MATCH(str, pfx)					\
-	(((const char *)str)[0] == ((const char *)pfx)[0] &&		\
+	(((const char *)(str))[0] == ((const char *)(pfx))[0] &&	\
 	    strncmp((str), (pfx), strlen(pfx)) == 0)
 
 /* Check if a string matches a prefix, and move past it. */
diff --git a/src/include/msvc.h b/src/include/msvc.h
index 99260a44875..d5be5bd8c60 100644
--- a/src/include/msvc.h
+++ b/src/include/msvc.h
@@ -13,6 +13,7 @@
 
 #define	inline __inline
 
+#define	WT_PTRDIFFT_FMT	"Id"			/* ptrdiff_t format string */
 #define	WT_SIZET_FMT	"Iu"			/* size_t format string */
 
 /*
diff --git a/src/include/mutex.h b/src/include/mutex.h
index f798bfb3ece..04679884930 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -20,6 +20,13 @@ struct __wt_condvar {
 
 	int waiters;			/* Numbers of waiters, or
 					   -1 if signalled with no waiters. */
+	/*
+	 * The following fields are only used for automatically adjusting
+	 * condition variables. They could be in a separate structure.
+	 */
+	uint64_t	min_wait;	/* Minimum wait duration */
+	uint64_t	max_wait;	/* Maximum wait duration */
+	uint64_t	prev_wait;	/* Wait duration used last time */
 };
 
 /*
diff --git a/src/include/packing.i b/src/include/packing.i
index 784a55ef2ae..35b2ddc43db 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -677,8 +677,8 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session,
 
 	if (fmt[0] != '\0' && fmt[1] == '\0') {
 		pv.type = fmt[0];
-		if ((ret = __unpack_read(session, &pv, &p, size)) == 0)
-			WT_UNPACK_PUT(session, pv, ap);
+		WT_RET(__unpack_read(session, &pv, &p, size));
+		WT_UNPACK_PUT(session, pv, ap);
 		return (0);
 	}
 
diff --git a/src/include/stat.h b/src/include/stat.h
index 8bc6c37b53e..f9170dc1a79 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -299,6 +299,8 @@ struct __wt_connection_stats {
 	int64_t cache_bytes_dirty;
 	int64_t cache_pages_dirty;
 	int64_t cache_eviction_clean;
+	int64_t cond_auto_wait_reset;
+	int64_t cond_auto_wait;
 	int64_t file_open;
 	int64_t memory_allocation;
 	int64_t memory_free;
@@ -337,6 +339,8 @@ struct __wt_connection_stats {
 	int64_t log_bytes_written;
 	int64_t log_zero_fills;
 	int64_t log_flush;
+	int64_t log_force_write;
+	int64_t log_force_write_skip;
 	int64_t log_compress_writes;
 	int64_t log_compress_write_fails;
 	int64_t log_compress_small;
@@ -344,6 +348,7 @@ struct __wt_connection_stats {
 	int64_t log_scans;
 	int64_t log_scan_rereads;
 	int64_t log_write_lsn;
+	int64_t log_write_lsn_skip;
 	int64_t log_sync;
 	int64_t log_sync_dir;
 	int64_t log_writes;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 0c314e0705f..1e263f22880 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -874,7 +874,7 @@ struct __wt_session {
 	 *  updates).  See @ref data_sources for more information.
 	 *  <br>
 	 *  @copydoc doc_cursor_types
-	 * @param to_dup a cursor to duplicate
+	 * @param to_dup a cursor to duplicate or gather statistics on
 	 * @configstart{WT_SESSION.open_cursor, see dist/api_data.py}
 	 * @config{append, append the value as a new record\, creating a new
 	 * record number key; valid only for cursors with record number keys., a
@@ -3850,187 +3850,197 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
 #define	WT_STAT_CONN_CACHE_PAGES_DIRTY			1070
 /*! cache: unmodified pages evicted */
 #define	WT_STAT_CONN_CACHE_EVICTION_CLEAN		1071
+/*! connection: auto adjusting condition resets */
+#define	WT_STAT_CONN_COND_AUTO_WAIT_RESET		1072
+/*! connection: auto adjusting condition wait calls */
+#define	WT_STAT_CONN_COND_AUTO_WAIT			1073
 /*! connection: files currently open */
-#define	WT_STAT_CONN_FILE_OPEN				1072
+#define	WT_STAT_CONN_FILE_OPEN				1074
 /*! connection: memory allocations */
-#define	WT_STAT_CONN_MEMORY_ALLOCATION			1073
+#define	WT_STAT_CONN_MEMORY_ALLOCATION			1075
 /*! connection: memory frees */
-#define	WT_STAT_CONN_MEMORY_FREE			1074
+#define	WT_STAT_CONN_MEMORY_FREE			1076
 /*! connection: memory re-allocations */
-#define	WT_STAT_CONN_MEMORY_GROW			1075
+#define	WT_STAT_CONN_MEMORY_GROW			1077
 /*! connection: pthread mutex condition wait calls */
-#define	WT_STAT_CONN_COND_WAIT				1076
+#define	WT_STAT_CONN_COND_WAIT				1078
 /*! connection: pthread mutex shared lock read-lock calls */
-#define	WT_STAT_CONN_RWLOCK_READ			1077
+#define	WT_STAT_CONN_RWLOCK_READ			1079
 /*! connection: pthread mutex shared lock write-lock calls */
-#define	WT_STAT_CONN_RWLOCK_WRITE			1078
+#define	WT_STAT_CONN_RWLOCK_WRITE			1080
 /*! connection: total read I/Os */
-#define	WT_STAT_CONN_READ_IO				1079
+#define	WT_STAT_CONN_READ_IO				1081
 /*! connection: total write I/Os */
-#define	WT_STAT_CONN_WRITE_IO				1080
+#define	WT_STAT_CONN_WRITE_IO				1082
 /*! cursor: cursor create calls */
-#define	WT_STAT_CONN_CURSOR_CREATE			1081
+#define	WT_STAT_CONN_CURSOR_CREATE			1083
 /*! cursor: cursor insert calls */
-#define	WT_STAT_CONN_CURSOR_INSERT			1082
+#define	WT_STAT_CONN_CURSOR_INSERT			1084
 /*! cursor: cursor next calls */
-#define	WT_STAT_CONN_CURSOR_NEXT			1083
+#define	WT_STAT_CONN_CURSOR_NEXT			1085
 /*! cursor: cursor prev calls */
-#define	WT_STAT_CONN_CURSOR_PREV			1084
+#define	WT_STAT_CONN_CURSOR_PREV			1086
 /*! cursor: cursor remove calls */
-#define	WT_STAT_CONN_CURSOR_REMOVE			1085
+#define	WT_STAT_CONN_CURSOR_REMOVE			1087
 /*! cursor: cursor reset calls */
-#define	WT_STAT_CONN_CURSOR_RESET			1086
+#define	WT_STAT_CONN_CURSOR_RESET			1088
 /*! cursor: cursor restarted searches */
-#define	WT_STAT_CONN_CURSOR_RESTART			1087
+#define	WT_STAT_CONN_CURSOR_RESTART			1089
 /*! cursor: cursor search calls */
-#define	WT_STAT_CONN_CURSOR_SEARCH			1088
+#define	WT_STAT_CONN_CURSOR_SEARCH			1090
 /*! cursor: cursor search near calls */
-#define	WT_STAT_CONN_CURSOR_SEARCH_NEAR			1089
+#define	WT_STAT_CONN_CURSOR_SEARCH_NEAR			1091
 /*! cursor: cursor update calls */
-#define	WT_STAT_CONN_CURSOR_UPDATE			1090
+#define	WT_STAT_CONN_CURSOR_UPDATE			1092
 /*! cursor: truncate calls */
-#define	WT_STAT_CONN_CURSOR_TRUNCATE			1091
+#define	WT_STAT_CONN_CURSOR_TRUNCATE			1093
 /*! data-handle: connection data handles currently active */
-#define	WT_STAT_CONN_DH_CONN_HANDLE_COUNT		1092
+#define	WT_STAT_CONN_DH_CONN_HANDLE_COUNT		1094
 /*! data-handle: connection sweep candidate became referenced */
-#define	WT_STAT_CONN_DH_SWEEP_REF			1093
+#define	WT_STAT_CONN_DH_SWEEP_REF			1095
 /*! data-handle: connection sweep dhandles closed */
-#define	WT_STAT_CONN_DH_SWEEP_CLOSE			1094
+#define	WT_STAT_CONN_DH_SWEEP_CLOSE			1096
 /*! data-handle: connection sweep dhandles removed from hash list */
-#define	WT_STAT_CONN_DH_SWEEP_REMOVE			1095
+#define	WT_STAT_CONN_DH_SWEEP_REMOVE			1097
 /*! data-handle: connection sweep time-of-death sets */
-#define	WT_STAT_CONN_DH_SWEEP_TOD			1096
+#define	WT_STAT_CONN_DH_SWEEP_TOD			1098
 /*! data-handle: connection sweeps */
-#define	WT_STAT_CONN_DH_SWEEPS				1097
+#define	WT_STAT_CONN_DH_SWEEPS				1099
 /*! data-handle: session dhandles swept */
-#define	WT_STAT_CONN_DH_SESSION_HANDLES			1098
+#define	WT_STAT_CONN_DH_SESSION_HANDLES			1100
 /*! data-handle: session sweep attempts */
-#define	WT_STAT_CONN_DH_SESSION_SWEEPS			1099
+#define	WT_STAT_CONN_DH_SESSION_SWEEPS			1101
 /*! log: busy returns attempting to switch slots */
-#define	WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY		1100
+#define	WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY		1102
 /*! log: consolidated slot closures */
-#define	WT_STAT_CONN_LOG_SLOT_CLOSES			1101
+#define	WT_STAT_CONN_LOG_SLOT_CLOSES			1103
 /*! log: consolidated slot join races */
-#define	WT_STAT_CONN_LOG_SLOT_RACES			1102
+#define	WT_STAT_CONN_LOG_SLOT_RACES			1104
 /*! log: consolidated slot join transitions */
-#define	WT_STAT_CONN_LOG_SLOT_TRANSITIONS		1103
+#define	WT_STAT_CONN_LOG_SLOT_TRANSITIONS		1105
 /*! log: consolidated slot joins */
-#define	WT_STAT_CONN_LOG_SLOT_JOINS			1104
+#define	WT_STAT_CONN_LOG_SLOT_JOINS			1106
 /*! log: consolidated slot unbuffered writes */
-#define	WT_STAT_CONN_LOG_SLOT_UNBUFFERED		1105
+#define	WT_STAT_CONN_LOG_SLOT_UNBUFFERED		1107
 /*! log: log bytes of payload data */
-#define	WT_STAT_CONN_LOG_BYTES_PAYLOAD			1106
+#define	WT_STAT_CONN_LOG_BYTES_PAYLOAD			1108
 /*! log: log bytes written */
-#define	WT_STAT_CONN_LOG_BYTES_WRITTEN			1107
+#define	WT_STAT_CONN_LOG_BYTES_WRITTEN			1109
 /*! log: log files manually zero-filled */
-#define	WT_STAT_CONN_LOG_ZERO_FILLS			1108
+#define	WT_STAT_CONN_LOG_ZERO_FILLS			1110
 /*! log: log flush operations */
-#define	WT_STAT_CONN_LOG_FLUSH				1109
+#define	WT_STAT_CONN_LOG_FLUSH				1111
+/*! log: log force write operations */
+#define	WT_STAT_CONN_LOG_FORCE_WRITE			1112
+/*! log: log force write operations skipped */
+#define	WT_STAT_CONN_LOG_FORCE_WRITE_SKIP		1113
 /*! log: log records compressed */
-#define	WT_STAT_CONN_LOG_COMPRESS_WRITES		1110
+#define	WT_STAT_CONN_LOG_COMPRESS_WRITES		1114
 /*! log: log records not compressed */
-#define	WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS		1111
+#define	WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS		1115
 /*! log: log records too small to compress */
-#define	WT_STAT_CONN_LOG_COMPRESS_SMALL			1112
+#define	WT_STAT_CONN_LOG_COMPRESS_SMALL			1116
 /*! log: log release advances write LSN */
-#define	WT_STAT_CONN_LOG_RELEASE_WRITE_LSN		1113
+#define	WT_STAT_CONN_LOG_RELEASE_WRITE_LSN		1117
 /*! log: log scan operations */
-#define	WT_STAT_CONN_LOG_SCANS				1114
+#define	WT_STAT_CONN_LOG_SCANS				1118
 /*! log: log scan records requiring two reads */
-#define	WT_STAT_CONN_LOG_SCAN_REREADS			1115
+#define	WT_STAT_CONN_LOG_SCAN_REREADS			1119
 /*! log: log server thread advances write LSN */
-#define	WT_STAT_CONN_LOG_WRITE_LSN			1116
+#define	WT_STAT_CONN_LOG_WRITE_LSN			1120
+/*! log: log server thread write LSN walk skipped */
+#define	WT_STAT_CONN_LOG_WRITE_LSN_SKIP			1121
 /*! log: log sync operations */
-#define	WT_STAT_CONN_LOG_SYNC				1117
+#define	WT_STAT_CONN_LOG_SYNC				1122
 /*! log: log sync_dir operations */
-#define	WT_STAT_CONN_LOG_SYNC_DIR			1118
+#define	WT_STAT_CONN_LOG_SYNC_DIR			1123
 /*! log: log write operations */
-#define	WT_STAT_CONN_LOG_WRITES				1119
+#define	WT_STAT_CONN_LOG_WRITES				1124
 /*! log: logging bytes consolidated */
-#define	WT_STAT_CONN_LOG_SLOT_CONSOLIDATED		1120
+#define	WT_STAT_CONN_LOG_SLOT_CONSOLIDATED		1125
 /*! log: maximum log file size */
-#define	WT_STAT_CONN_LOG_MAX_FILESIZE			1121
+#define	WT_STAT_CONN_LOG_MAX_FILESIZE			1126
 /*! log: number of pre-allocated log files to create */
-#define	WT_STAT_CONN_LOG_PREALLOC_MAX			1122
+#define	WT_STAT_CONN_LOG_PREALLOC_MAX			1127
 /*! log: pre-allocated log files not ready and missed */
-#define	WT_STAT_CONN_LOG_PREALLOC_MISSED		1123
+#define	WT_STAT_CONN_LOG_PREALLOC_MISSED		1128
 /*! log: pre-allocated log files prepared */
-#define	WT_STAT_CONN_LOG_PREALLOC_FILES			1124
+#define	WT_STAT_CONN_LOG_PREALLOC_FILES			1129
 /*! log: pre-allocated log files used */
-#define	WT_STAT_CONN_LOG_PREALLOC_USED			1125
+#define	WT_STAT_CONN_LOG_PREALLOC_USED			1130
 /*! log: records processed by log scan */
-#define	WT_STAT_CONN_LOG_SCAN_RECORDS			1126
+#define	WT_STAT_CONN_LOG_SCAN_RECORDS			1131
 /*! log: total in-memory size of compressed records */
-#define	WT_STAT_CONN_LOG_COMPRESS_MEM			1127
+#define	WT_STAT_CONN_LOG_COMPRESS_MEM			1132
 /*! log: total log buffer size */
-#define	WT_STAT_CONN_LOG_BUFFER_SIZE			1128
+#define	WT_STAT_CONN_LOG_BUFFER_SIZE			1133
 /*! log: total size of compressed records */
-#define	WT_STAT_CONN_LOG_COMPRESS_LEN			1129
+#define	WT_STAT_CONN_LOG_COMPRESS_LEN			1134
 /*! log: written slots coalesced */
-#define	WT_STAT_CONN_LOG_SLOT_COALESCED			1130
+#define	WT_STAT_CONN_LOG_SLOT_COALESCED			1135
 /*! log: yields waiting for previous log file close */
-#define	WT_STAT_CONN_LOG_CLOSE_YIELDS			1131
+#define	WT_STAT_CONN_LOG_CLOSE_YIELDS			1136
 /*! reconciliation: fast-path pages deleted */
-#define	WT_STAT_CONN_REC_PAGE_DELETE_FAST		1132
+#define	WT_STAT_CONN_REC_PAGE_DELETE_FAST		1137
 /*! reconciliation: page reconciliation calls */
-#define	WT_STAT_CONN_REC_PAGES				1133
+#define	WT_STAT_CONN_REC_PAGES				1138
 /*! reconciliation: page reconciliation calls for eviction */
-#define	WT_STAT_CONN_REC_PAGES_EVICTION			1134
+#define	WT_STAT_CONN_REC_PAGES_EVICTION			1139
 /*! reconciliation: pages deleted */
-#define	WT_STAT_CONN_REC_PAGE_DELETE			1135
+#define	WT_STAT_CONN_REC_PAGE_DELETE			1140
 /*! reconciliation: split bytes currently awaiting free */
-#define	WT_STAT_CONN_REC_SPLIT_STASHED_BYTES		1136
+#define	WT_STAT_CONN_REC_SPLIT_STASHED_BYTES		1141
 /*! reconciliation: split objects currently awaiting free */
-#define	WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS		1137
+#define	WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS		1142
 /*! session: open cursor count */
-#define	WT_STAT_CONN_SESSION_CURSOR_OPEN		1138
+#define	WT_STAT_CONN_SESSION_CURSOR_OPEN		1143
 /*! session: open session count */
-#define	WT_STAT_CONN_SESSION_OPEN			1139
+#define	WT_STAT_CONN_SESSION_OPEN			1144
 /*! thread-yield: page acquire busy blocked */
-#define	WT_STAT_CONN_PAGE_BUSY_BLOCKED			1140
+#define	WT_STAT_CONN_PAGE_BUSY_BLOCKED			1145
 /*! thread-yield: page acquire eviction blocked */
-#define	WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED	1141
+#define	WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED	1146
 /*! thread-yield: page acquire locked blocked */
-#define	WT_STAT_CONN_PAGE_LOCKED_BLOCKED		1142
+#define	WT_STAT_CONN_PAGE_LOCKED_BLOCKED		1147
 /*! thread-yield: page acquire read blocked */
-#define	WT_STAT_CONN_PAGE_READ_BLOCKED			1143
+#define	WT_STAT_CONN_PAGE_READ_BLOCKED			1148
 /*! thread-yield: page acquire time sleeping (usecs) */
-#define	WT_STAT_CONN_PAGE_SLEEP				1144
+#define	WT_STAT_CONN_PAGE_SLEEP				1149
 /*! transaction: number of named snapshots created */
-#define	WT_STAT_CONN_TXN_SNAPSHOTS_CREATED		1145
+#define	WT_STAT_CONN_TXN_SNAPSHOTS_CREATED		1150
 /*! transaction: number of named snapshots dropped */
-#define	WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED		1146
+#define	WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED		1151
 /*! transaction: transaction begins */
-#define	WT_STAT_CONN_TXN_BEGIN				1147
+#define	WT_STAT_CONN_TXN_BEGIN				1152
 /*! transaction: transaction checkpoint currently running */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_RUNNING		1148
+#define	WT_STAT_CONN_TXN_CHECKPOINT_RUNNING		1153
 /*! transaction: transaction checkpoint generation */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_GENERATION		1149
+#define	WT_STAT_CONN_TXN_CHECKPOINT_GENERATION		1154
 /*! transaction: transaction checkpoint max time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX		1150
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX		1155
 /*! transaction: transaction checkpoint min time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN		1151
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN		1156
 /*! transaction: transaction checkpoint most recent time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT		1152
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT		1157
 /*! transaction: transaction checkpoint total time (msecs) */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL		1153
+#define	WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL		1158
 /*! transaction: transaction checkpoints */
-#define	WT_STAT_CONN_TXN_CHECKPOINT			1154
+#define	WT_STAT_CONN_TXN_CHECKPOINT			1159
 /*! transaction: transaction failures due to cache overflow */
-#define	WT_STAT_CONN_TXN_FAIL_CACHE			1155
+#define	WT_STAT_CONN_TXN_FAIL_CACHE			1160
 /*! transaction: transaction range of IDs currently pinned */
-#define	WT_STAT_CONN_TXN_PINNED_RANGE			1156
+#define	WT_STAT_CONN_TXN_PINNED_RANGE			1161
 /*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define	WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE	1157
+#define	WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE	1162
 /*! transaction: transaction range of IDs currently pinned by named
  * snapshots */
-#define	WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE		1158
+#define	WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE		1163
 /*! transaction: transaction sync calls */
-#define	WT_STAT_CONN_TXN_SYNC				1159
+#define	WT_STAT_CONN_TXN_SYNC				1164
 /*! transaction: transactions committed */
-#define	WT_STAT_CONN_TXN_COMMIT				1160
+#define	WT_STAT_CONN_TXN_COMMIT				1165
 /*! transaction: transactions rolled back */
-#define	WT_STAT_CONN_TXN_ROLLBACK			1161
+#define	WT_STAT_CONN_TXN_ROLLBACK			1166
 
 /*!
  * @}
diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h
index 0db876b56f3..7d97d97dcf5 100644
--- a/src/include/wiredtiger_ext.h
+++ b/src/include/wiredtiger_ext.h
@@ -268,8 +268,9 @@ struct __wt_extension_api {
 	    WT_SESSION *session, const char *key, const char *value);
 
 	/*!
-	 * Pack a structure into a buffer.
-	 * See ::wiredtiger_struct_pack for details.
+	 * Pack a structure into a buffer. Deprecated in favor of the
+	 * stream-based pack and unpack API. See WT_EXTENSION_API::pack_start
+	 * for details.
 	 *
 	 * @param wt_api the extension handle
 	 * @param session the session handle
@@ -282,8 +283,8 @@ struct __wt_extension_api {
 	    void *buffer, size_t size, const char *format, ...);
 
 	/*!
-	 * Calculate the size required to pack a structure.
-	 * See ::wiredtiger_struct_size for details.
+	 * Calculate the size required to pack a structure. Deprecated in
+	 * favor of the stream-based pack and unpack API.
 	 *
 	 * @param wt_api the extension handle
 	 * @param session the session handle
@@ -296,8 +297,9 @@ struct __wt_extension_api {
 	    size_t *sizep, const char *format, ...);
 
 	/*!
-	 * Unpack a structure from a buffer.
-	 * See ::wiredtiger_struct_unpack for details.
+	 * Unpack a structure from a buffer. Deprecated in favor of the
+	 * stream-based pack and unpack API. See
+	 * WT_EXTENSION_API::unpack_start for details.
 	 *
 	 * @param wt_api the extension handle
 	 * @param session the session handle
@@ -309,6 +311,130 @@ struct __wt_extension_api {
 	int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
 	    const void *buffer, size_t size, const char *format, ...);
 
+	/*
+	 * Streaming pack/unpack API.
+	 */
+	/*!
+	 * Start a packing operation into a buffer.
+	 * See ::wiredtiger_pack_start for details.
+	 *
+	 * @param session the session handle
+	 * @param format the data format, see @ref packing
+	 * @param buffer a pointer to memory to hold the packed data
+	 * @param size the size of the buffer
+	 * @param[out] psp the new packing stream handle
+	 * @errors
+	 */
+	int (*pack_start)(WT_EXTENSION_API *wt_api,
+	    WT_SESSION *session, const char *format,
+	    void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+	/*!
+	 * Start an unpacking operation from a buffer.
+	 * See ::wiredtiger_unpack_start for details.
+	 *
+	 * @param session the session handle
+	 * @param format the data format, see @ref packing
+	 * @param buffer a pointer to memory holding the packed data
+	 * @param size the size of the buffer
+	 * @param[out] psp the new packing stream handle
+	 * @errors
+	 */
+	int (*unpack_start)(WT_EXTENSION_API *wt_api,
+	    WT_SESSION *session, const char *format,
+	    const void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+	/*!
+	 * Close a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param[out] usedp the number of bytes in the buffer used by the
+	 * stream
+	 * @errors
+	 */
+	int (*pack_close)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, size_t *usedp);
+
+	/*!
+	 * Pack an item into a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param item an item to pack
+	 * @errors
+	 */
+	int (*pack_item)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, WT_ITEM *item);
+
+	/*!
+	 * Pack a signed integer into a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param i a signed integer to pack
+	 * @errors
+	 */
+	int (*pack_int)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, int64_t i);
+
+	/*!
+	 * Pack a string into a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param s a string to pack
+	 * @errors
+	 */
+	int (*pack_str)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, const char *s);
+
+	/*!
+	 * Pack an unsigned integer into a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param u an unsigned integer to pack
+	 * @errors
+	 */
+	int (*pack_uint)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, uint64_t u);
+
+	/*!
+	 * Unpack an item from a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param[out] item the unpacked item
+	 * @errors
+	 */
+	int (*unpack_item)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, WT_ITEM *item);
+
+	/*!
+	 * Unpack a signed integer from a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param[out] ip the unpacked signed integer
+	 * @errors
+	 */
+	int (*unpack_int)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, int64_t *ip);
+
+	/*!
+	 * Unpack a string from a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param[out] sp the unpacked string
+	 * @errors
+	 */
+	int (*unpack_str)(WT_EXTENSION_API *wt_api,
+	    WT_PACK_STREAM *ps, const char **sp);
+
+	/*!
+	 * Unpack an unsigned integer from a packing stream.
+	 *
+	 * @param ps the packing stream handle
+	 * @param[out] up the unpacked unsigned integer
+	 * @errors
+	 */
+	int (*unpack_uint)(WT_EXTENSION_API *wt_api,
+	     WT_PACK_STREAM *ps, uint64_t *up);
+
 	/*!
 	 * Return the current transaction ID.
 	 *
diff --git a/src/log/log.c b/src/log/log.c
index 03145d8408c..e41073299a8 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -29,7 +29,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
 	log = conn->log;
 	log->ckpt_lsn = *ckp_lsn;
 	if (conn->log_cond != NULL)
-		WT_RET(__wt_cond_signal(session, conn->log_cond));
+		WT_RET(__wt_cond_auto_signal(session, conn->log_cond));
 	return (0);
 }
 
@@ -46,7 +46,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
 
 	conn = S2C(session);
 	log = conn->log;
-	WT_RET(__wt_log_force_write(session, 1));
+	WT_RET(__wt_log_force_write(session, 1, NULL));
 	WT_RET(__wt_log_wrlsn(session, NULL));
 	if (start)
 		*lsn = log->write_start_lsn;
@@ -118,9 +118,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
 	 */
 	if (log->sync_dir_lsn.l.file < min_lsn->l.file) {
 		WT_ERR(__wt_verbose(session, WT_VERB_LOG,
-		    "log_force_sync: sync directory %s to LSN %d/%lu",
-		    log->log_dir_fh->name,
-		    min_lsn->l.file, min_lsn->l.offset));
+		    "log_force_sync: sync directory %s to LSN %" PRIu32
+		    "/%" PRIu32,
+		    log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset));
 		WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh));
 		log->sync_dir_lsn = *min_lsn;
 		WT_STAT_FAST_CONN_INCR(session, log_sync_dir);
@@ -130,7 +130,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
 	 */
 	if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
 		WT_ERR(__wt_verbose(session, WT_VERB_LOG,
-		    "log_force_sync: sync %s to LSN %d/%lu",
+		    "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32,
 		    log->log_fh->name, min_lsn->l.file, min_lsn->l.offset));
 		WT_ERR(__wt_fsync(session, log->log_fh));
 		log->sync_lsn = *min_lsn;
@@ -273,7 +273,7 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
 	 * These may be files needed by backup.  Force the current slot
 	 * to get written to the file.
 	 */
-	WT_RET(__wt_log_force_write(session, 1));
+	WT_RET(__wt_log_force_write(session, 1, NULL));
 	WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
 
 	/* Filter out any files that are below the checkpoint LSN. */
@@ -697,7 +697,7 @@ __log_openfile(WT_SESSION_IMPL *session,
 			WT_ERR_MSG(session, WT_ERROR,
 			    "unsupported WiredTiger file version: this build "
 			    " only supports major/minor versions up to %d/%d, "
-			    " and the file is version %d/%d",
+			    " and the file is version %" PRIu16 "/%" PRIu16,
 			    WT_LOG_MAJOR_VERSION, WT_LOG_MINOR_VERSION,
 			    desc->majorv, desc->minorv);
 	}
@@ -824,7 +824,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
 		if (create_log) {
 			WT_STAT_FAST_CONN_INCR(session, log_prealloc_missed);
 			if (conn->log_cond != NULL)
-				WT_RET(__wt_cond_signal(
+				WT_RET(__wt_cond_auto_signal(
 				    session, conn->log_cond));
 		}
 	}
@@ -1129,7 +1129,8 @@ __wt_log_open(WT_SESSION_IMPL *session)
 	}
 	log->fileid = lastlog;
 	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
-	    "log_open: first log %d last log %d", firstlog, lastlog));
+	    "log_open: first log %" PRIu32 " last log %" PRIu32,
+	    firstlog, lastlog));
 	if (firstlog == UINT32_MAX) {
 		WT_ASSERT(session, logcount == 0);
 		WT_INIT_LSN(&log->first_lsn);
@@ -1251,10 +1252,8 @@ __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, bool *hole)
 		}
 	}
 
-err:	if (buf != NULL)
-		__wt_free(session, buf);
-	if (zerobuf != NULL)
-		__wt_free(session, zerobuf);
+err:	__wt_free(session, buf);
+	__wt_free(session, zerobuf);
 	return (ret);
 }
 
@@ -1338,7 +1337,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
 		 */
 		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
 			__wt_spin_unlock(session, &log->log_slot_lock);
-		WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
+		WT_ERR(__wt_cond_auto_signal(session, conn->log_wrlsn_cond));
 		if (++yield_count < WT_THOUSAND)
 			__wt_yield();
 		else
@@ -1395,7 +1394,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
 		    (log->sync_dir_lsn.l.file < sync_lsn.l.file)) {
 			WT_ASSERT(session, log->log_dir_fh != NULL);
 			WT_ERR(__wt_verbose(session, WT_VERB_LOG,
-			    "log_release: sync directory %s to LSN %u/%lu",
+			    "log_release: sync directory %s to LSN %" PRIu32
+			    "/%" PRIu32,
 			    log->log_dir_fh->name,
 			    sync_lsn.l.file, sync_lsn.l.offset));
 			WT_ERR(__wt_directory_sync_fh(
@@ -1410,7 +1410,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
 		if (F_ISSET(slot, WT_SLOT_SYNC) &&
 		    __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
 			WT_ERR(__wt_verbose(session, WT_VERB_LOG,
-			    "log_release: sync log %s to LSN %u/%lu",
+			    "log_release: sync log %s to LSN %" PRIu32
+			    "/%" PRIu32,
 			    log->log_fh->name,
 			    sync_lsn.l.file, sync_lsn.l.offset));
 			WT_STAT_FAST_CONN_INCR(session, log_sync);
@@ -1477,7 +1478,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
 
 	if (LF_ISSET(WT_LOGSCAN_RECOVER))
 		WT_RET(__wt_verbose(session, WT_VERB_LOG,
-		    "__wt_log_scan truncating to %u/%u",
+		    "__wt_log_scan truncating to %" PRIu32 "/%" PRIu32,
 		    log->trunc_lsn.l.file, log->trunc_lsn.l.offset));
 
 	if (log != NULL) {
@@ -1758,14 +1759,25 @@ err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
  *	Wrapper function that takes the lock.
  */
 int
-__wt_log_force_write(WT_SESSION_IMPL *session, bool retry)
+__wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work)
 {
 	WT_LOG *log;
 	WT_MYSLOT myslot;
+	uint32_t joined;
 
 	log = S2C(session)->log;
 	memset(&myslot, 0, sizeof(myslot));
+	WT_STAT_FAST_CONN_INCR(session, log_force_write);
+	if (did_work != NULL)
+		*did_work = true;
 	myslot.slot = log->active_slot;
+	joined = WT_LOG_SLOT_JOINED(log->active_slot->slot_state);
+	if (joined == 0) {
+		WT_STAT_FAST_CONN_INCR(session, log_force_write_skip);
+		if (did_work != NULL)
+			*did_work = false;
+		return (0);
+	}
 	return (__wt_log_slot_switch(session, &myslot, retry, true));
 }
 
@@ -1998,10 +2010,10 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
 		 * XXX I've seen times when conditions are NULL.
 		 */
 		if (conn->log_cond != NULL) {
-			WT_ERR(__wt_cond_signal(session, conn->log_cond));
+			WT_ERR(__wt_cond_auto_signal(session, conn->log_cond));
 			__wt_yield();
 		} else
-			WT_ERR(__wt_log_force_write(session, 1));
+			WT_ERR(__wt_log_force_write(session, 1, NULL));
 	}
 	if (LF_ISSET(WT_LOG_FLUSH)) {
 		/* Wait for our writes to reach the OS */
@@ -2128,7 +2140,7 @@ __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags)
 		WT_RET(__wt_log_flush_lsn(session, &lsn, false));
 
 	WT_RET(__wt_verbose(session, WT_VERB_LOG,
-	    "log_flush: flags %d LSN %u/%lu",
+	    "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32,
 	    flags, lsn.l.file, lsn.l.offset));
 	/*
 	 * If the user wants write-no-sync, there is nothing more to do.
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 2844516e78f..570d1c9ce48 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -253,7 +253,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session)
 		/*
 		 * If we didn't find any free slots signal the worker thread.
 		 */
-		(void)__wt_cond_signal(session, conn->log_wrlsn_cond);
+		(void)__wt_cond_auto_signal(session, conn->log_wrlsn_cond);
 		__wt_yield();
 	}
 	/* NOTREACHED */
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 0197b6481f4..e023b2b407e 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -1556,7 +1556,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
 	WT_ERR(ret);
 
 	/* Make sure we have exclusive access if and only if we want it */
-	WT_ASSERT(session, !bulk || lsm_tree->exclusive);
+	WT_ASSERT(session, !bulk || lsm_tree->excl_session != NULL);
 
 	WT_ERR(__wt_calloc_one(session, &clsm));
 
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index cf581475d2c..943a5894ab3 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -390,7 +390,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
 		F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST);
 		dhandle_locked = true;
 		TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
-			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+			if (!lsm_tree->active)
 				continue;
 			WT_ERR(__wt_epoch(session, &now));
 			pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
@@ -433,8 +433,10 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
 				    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
 				WT_ERR(__wt_verbose(session,
 				    WT_VERB_LSM_MANAGER,
-				    "MGR %s: queue %d mod %d nchunks %d"
-				    " flags 0x%x aggressive %d pushms %" PRIu64
+				    "MGR %s: queue %" PRIu32 " mod %d "
+				    "nchunks %" PRIu32
+				    " flags %#" PRIx32 " aggressive %" PRIu32
+				    " pushms %" PRIu64
 				    " fillms %" PRIu64,
 				    lsm_tree->name, lsm_tree->queue_ref,
 				    lsm_tree->modified, lsm_tree->nchunks,
@@ -648,7 +650,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
 	 * is checked.
 	 */
 	(void)__wt_atomic_add32(&lsm_tree->queue_ref, 1);
-	if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+	if (!lsm_tree->active) {
 		(void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
 		return (0);
 	}
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index 29325066da7..6d907284546 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -60,10 +60,11 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 {
 	struct timespec now;
 	uint64_t msec_since_last_merge, msec_to_create_merge;
-	u_int new_aggressive;
+	uint32_t new_aggressive;
 
 	new_aggressive = 0;
 
+	WT_ASSERT(session, lsm_tree->merge_min != 0);
 	/*
 	 * If the tree is open read-only or we are compacting, be very
 	 * aggressive. Otherwise, we can spend a long time waiting for merges
@@ -124,8 +125,9 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 
 	if (new_aggressive > lsm_tree->merge_aggressiveness) {
 		WT_RET(__wt_verbose(session, WT_VERB_LSM,
-		    "LSM merge %s got aggressive (old %u new %u), "
-		    "merge_min %d, %u / %" PRIu64,
+		    "LSM merge %s got aggressive "
+		    "(old %" PRIu32 " new %" PRIu32 "), "
+		    "merge_min %u, %" PRIu64 " / %" PRIu64,
 		    lsm_tree->name, lsm_tree->merge_aggressiveness,
 		    new_aggressive, lsm_tree->merge_min,
 		    msec_since_last_merge, lsm_tree->chunk_fill_ms));
@@ -410,7 +412,8 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
 		    start_chunk, end_chunk, dest_id, record_count, generation));
 		for (verb = start_chunk; verb <= end_chunk; verb++)
 			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
-			    "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
+			    "Merging %s: Chunk[%u] id %" PRIu32
+			    ", gen: %" PRIu32
 			    ", size: %" PRIu64 ", records: %" PRIu64,
 			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
 			    lsm_tree->chunk[verb]->generation,
@@ -460,7 +463,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
 #define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
 	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
 		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
-			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+			if (!lsm_tree->active)
 				WT_ERR(EINTR);
 
 			WT_STAT_FAST_CONN_INCRV(session,
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index d76b2a48aa7..e19e2cd0126 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -9,17 +9,17 @@
 #include "wt_internal.h"
 
 /*
- * __wt_lsm_meta_read --
- *	Read the metadata for an LSM tree.
+ * __lsm_meta_read_v0 --
+ *	Read v0 of LSM metadata.
  */
-int
-__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+static int
+__lsm_meta_read_v0(
+    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *lsmconf)
 {
 	WT_CONFIG cparser, lparser;
 	WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata;
 	WT_DECL_RET;
 	WT_LSM_CHUNK *chunk;
-	char *lsmconfig;
 	u_int nchunks;
 
 	chunk = NULL;			/* -Wconditional-uninitialized */
@@ -28,8 +28,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 	if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
 		F_SET(lsm_tree, WT_LSM_TREE_MERGES);
 
-	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
-	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
+	WT_ERR(__wt_config_init(session, &cparser, lsmconf));
 	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
 		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
 			__wt_free(session, lsm_tree->key_format);
@@ -48,7 +47,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 			 * from the file configuration.
 			 */
 			WT_ERR(__wt_config_getones(
-			    session, lsmconfig, "file_config", &fileconf));
+			    session, lsmconf, "file_config", &fileconf));
 			WT_CLEAR(metadata);
 			WT_ERR_NOTFOUND_OK(__wt_config_subgets(
 			    session, &fileconf, "app_metadata", &metadata));
@@ -160,16 +159,292 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 		 */
 	}
 	WT_ERR_NOTFOUND_OK(ret);
+err:	return (ret);
+}
+
+/*
+ * __lsm_meta_read_v1 --
+ *	Read v1 of LSM metadata.
+ */
+static int
+__lsm_meta_read_v1(
+    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *lsmconf)
+{
+	WT_CONFIG lparser;
+	WT_CONFIG_ITEM cv, lk, lv, metadata;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	const char *file_cfg[] = {
+	    WT_CONFIG_BASE(session, file_config), NULL, NULL, NULL };
+	char *fileconf;
+	u_int nchunks;
+
+	chunk = NULL;			/* -Wconditional-uninitialized */
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "key_format", &cv));
+	WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->key_format));
+	WT_ERR(__wt_config_getones(session, lsmconf, "value_format", &cv));
+	WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->value_format));
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "collator", &cv));
+	if (cv.len != 0 && !WT_STRING_MATCH("none", cv.str, cv.len)) {
+		/* Extract the application-supplied metadata (if any). */
+		WT_CLEAR(metadata);
+		WT_ERR_NOTFOUND_OK(__wt_config_getones(
+		    session, lsmconf, "app_metadata", &metadata));
+		WT_ERR(__wt_collator_config(session, lsm_tree->name,
+		    &cv, &metadata,
+		    &lsm_tree->collator, &lsm_tree->collator_owned));
+		WT_ERR(__wt_strndup(session,
+		    cv.str, cv.len, &lsm_tree->collator_name));
+	}
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.auto_throttle", &cv));
+	if (cv.val)
+		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+	else
+		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom", &cv));
+	FLD_SET(lsm_tree->bloom,
+	    (cv.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom_oldest", &cv));
+	if (cv.val != 0)
+		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);
+
+	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
+		WT_ERR_MSG(session, EINVAL,
+		    "Bloom filters can only be created on newest and oldest "
+		    "chunks if bloom filters are enabled");
+
+	WT_ERR(__wt_config_getones(
+	    session, lsmconf, "lsm.bloom_bit_count", &cv));
+	lsm_tree->bloom_bit_count = (uint32_t)cv.val;
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom_config", &cv));
+	/* Don't include the brackets. */
+	if (cv.type == WT_CONFIG_ITEM_STRUCT) {
+		cv.str++;
+		cv.len -= 2;
+	}
+	WT_ERR(__wt_config_check(session,
+	   WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len));
+	WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->bloom_config));
+	WT_ERR(__wt_config_getones(
+	    session, lsmconf, "lsm.bloom_hash_count", &cv));
+	lsm_tree->bloom_hash_count = (uint32_t)cv.val;
+
+	WT_ERR(__wt_config_getones(
+	    session, lsmconf, "lsm.chunk_count_limit", &cv));
+	lsm_tree->chunk_count_limit = (uint32_t)cv.val;
+	if (cv.val == 0)
+		F_SET(lsm_tree, WT_LSM_TREE_MERGES);
+	else
+		F_CLR(lsm_tree, WT_LSM_TREE_MERGES);
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.chunk_max", &cv));
+	lsm_tree->chunk_max = (uint64_t)cv.val;
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.chunk_size", &cv));
+	lsm_tree->chunk_size = (uint64_t)cv.val;
+
+	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
+		WT_ERR_MSG(session, EINVAL,
+		    "Chunk size (chunk_size) must be smaller than or equal to "
+		    "the maximum chunk size (chunk_max)");
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.merge_max", &cv));
+	lsm_tree->merge_max = (uint32_t)cv.val;
+	WT_ERR(__wt_config_getones(session, lsmconf, "lsm.merge_min", &cv));
+	lsm_tree->merge_min = (uint32_t)cv.val;
+
+	if (lsm_tree->merge_min > lsm_tree->merge_max)
+		WT_ERR_MSG(session, EINVAL,
+		    "LSM merge_min must be less than or equal to merge_max");
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "last", &cv));
+	lsm_tree->last = (u_int)cv.val;
+	WT_ERR(__wt_config_getones(session, lsmconf, "chunks", &cv));
+	WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+	for (nchunks = 0; (ret =
+	    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+		if (WT_STRING_MATCH("id", lk.str, lk.len)) {
+			WT_ERR(__wt_realloc_def(session,
+			    &lsm_tree->chunk_alloc,
+			    nchunks + 1, &lsm_tree->chunk));
+			WT_ERR(__wt_calloc_one(session, &chunk));
+			lsm_tree->chunk[nchunks++] = chunk;
+			chunk->id = (uint32_t)lv.val;
+			WT_ERR(__wt_lsm_tree_chunk_name(session,
+			    lsm_tree, chunk->id, &chunk->uri));
+			F_SET(chunk,
+			    WT_LSM_CHUNK_ONDISK |
+			    WT_LSM_CHUNK_STABLE);
+		} else if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+			WT_ERR(__wt_lsm_tree_bloom_name(
+			    session, lsm_tree, chunk->id, &chunk->bloom_uri));
+			F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+			continue;
+		} else if (WT_STRING_MATCH("chunk_size", lk.str, lk.len)) {
+			chunk->size = (uint64_t)lv.val;
+			continue;
+		} else if (WT_STRING_MATCH("count", lk.str, lk.len)) {
+			chunk->count = (uint64_t)lv.val;
+			continue;
+		} else if (WT_STRING_MATCH("generation", lk.str, lk.len)) {
+			chunk->generation = (uint32_t)lv.val;
+			continue;
+		}
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+	lsm_tree->nchunks = nchunks;
+
+	WT_ERR(__wt_config_getones(session, lsmconf, "old_chunks", &cv));
+	WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+	for (nchunks = 0; (ret =
+	    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+		if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+			WT_ERR(__wt_strndup(session,
+			    lv.str, lv.len, &chunk->bloom_uri));
+			F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+			continue;
+		}
+		WT_ERR(__wt_realloc_def(session,
+		    &lsm_tree->old_alloc, nchunks + 1,
+		    &lsm_tree->old_chunks));
+		WT_ERR(__wt_calloc_one(session, &chunk));
+		lsm_tree->old_chunks[nchunks++] = chunk;
+		WT_ERR(__wt_strndup(session,
+		    lk.str, lk.len, &chunk->uri));
+		F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+	lsm_tree->nold_chunks = nchunks;
+
+	/*
+	 * Set up the config for each chunk.
+	 *
+	 * Make the memory_page_max double the chunk size, so application
+	 * threads don't immediately try to force evict the chunk when the
+	 * worker thread clears the NO_EVICTION flag.
+	 */
+	file_cfg[1] = lsmconf;
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	WT_ERR(__wt_buf_fmt(session, buf,
+	    "key_format=u,value_format=u,memory_page_max=%" PRIu64,
+	    2 * lsm_tree->chunk_max));
+	file_cfg[2] = buf->data;
+	WT_ERR(__wt_config_collapse(session, file_cfg, &fileconf));
+	lsm_tree->file_config = fileconf;
+
+	/*
+	 * Ignore any other values: the metadata entry might have been
+	 * created by a future release, with unknown options.
+	 */
+err:	__wt_scr_free(session, &buf);
+	return (ret);
+}
+
+/*
+ * __lsm_meta_upgrade_v1 --
+ *	Upgrade to v1 of LSM metadata.
+ */
+static int
+__lsm_meta_upgrade_v1(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	const char *new_cfg[] = {
+	    WT_CONFIG_BASE(session, lsm_meta), NULL, NULL, NULL };
+
+	/* Include the custom config that used to be embedded in file_config. */
+	new_cfg[1] = lsm_tree->file_config;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	WT_ERR(__wt_buf_fmt(session, buf,
+	    "key_format=%s,value_format=%s",
+	    lsm_tree->key_format, lsm_tree->value_format));
+
+	WT_ERR(__wt_buf_catfmt(session, buf, ",collator=%s",
+	    lsm_tree->collator_name != NULL ?  lsm_tree->collator_name : ""));
+
+	WT_ERR(__wt_buf_catfmt(session, buf, ",lsm=("));
+
+	WT_ERR(__wt_buf_catfmt(session, buf, "auto_throttle=%d",
+	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE)));
+
+	WT_ERR(__wt_buf_catfmt(session, buf, ",bloom=%d",
+	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED)));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_oldest=%d",
+	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_bit_count=%" PRIu32,
+	    lsm_tree->bloom_bit_count));
+	if (lsm_tree->bloom_config != NULL &&
+	    strlen(lsm_tree->bloom_config) > 0)
+		WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_config=(%s)",
+		    lsm_tree->bloom_config));
+	else
+		WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_config="));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_hash_count=%" PRIu32,
+	    lsm_tree->bloom_hash_count));
+
+	WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_count_limit=%" PRIu32,
+	    lsm_tree->chunk_count_limit));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_max=%" PRIu64,
+	    lsm_tree->chunk_max));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",merge_max=%" PRIu32,
+	    lsm_tree->merge_max));
+	WT_ERR(__wt_buf_catfmt(session, buf, ",merge_min=%" PRIu32,
+	    lsm_tree->merge_min));
+
+	WT_ERR(__wt_buf_catfmt(session, buf, ")"));
+
+	new_cfg[2] = buf->data;
+	WT_ERR(__wt_config_merge(session, new_cfg, NULL, &lsm_tree->config));
+
+err:	__wt_scr_free(session, &buf);
+	return (ret);
+}
+/*
+ * __wt_lsm_meta_read --
+ *	Read the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	char *lsmconf;
+	bool upgrade;
+
+	/* LSM trees inherit the merge setting from the connection. */
+	if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
+		F_SET(lsm_tree, WT_LSM_TREE_MERGES);
+
+	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconf));
 
+	upgrade = false;
+	ret = __wt_config_getones(session, lsmconf, "file_config", &cval);
+	if (ret == 0) {
+		ret = __lsm_meta_read_v0(session, lsm_tree, lsmconf);
+		__wt_free(session, lsmconf);
+		WT_RET(ret);
+		upgrade = true;
+	} else if (ret == WT_NOTFOUND) {
+		lsm_tree->config = lsmconf;
+		ret = 0;
+		WT_RET(__lsm_meta_read_v1(session, lsm_tree, lsmconf));
+	}
 	/*
-	 * If the default merge_min was not overridden, calculate it now.  We
-	 * do this here so that trees created before merge_min was added get a
-	 * sane value.
+	 * If the default merge_min was not overridden, calculate it now.
 	 */
 	if (lsm_tree->merge_min < 2)
 		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);
-
-err:	__wt_free(session, lsmconfig);
+	/*
+	 * If needed, upgrade the configuration.  We need to do this after
+	 * we have fixed the merge_min value.
+	 */
+	if (upgrade)
+		WT_RET(__lsm_meta_upgrade_v1(session, lsm_tree));
 	return (ret);
 }
 
@@ -184,32 +459,15 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 	WT_DECL_RET;
 	WT_LSM_CHUNK *chunk;
 	u_int i;
+	const char *new_cfg[] = { NULL, NULL, NULL };
+	char *new_metadata;
 	bool first;
 
+	new_metadata = NULL;
+
 	WT_RET(__wt_scr_alloc(session, 0, &buf));
-	WT_ERR(__wt_buf_fmt(session, buf,
-	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
-	    lsm_tree->key_format, lsm_tree->value_format,
-	    lsm_tree->bloom_config, lsm_tree->file_config));
-	if (lsm_tree->collator_name != NULL)
-		WT_ERR(__wt_buf_catfmt(
-		    session, buf, ",collator=%s", lsm_tree->collator_name));
 	WT_ERR(__wt_buf_catfmt(session, buf,
-	    ",last=%" PRIu32
-	    ",chunk_count_limit=%" PRIu32
-	    ",chunk_max=%" PRIu64
-	    ",chunk_size=%" PRIu64
-	    ",auto_throttle=%" PRIu32
-	    ",merge_max=%" PRIu32
-	    ",merge_min=%" PRIu32
-	    ",bloom=%" PRIu32
-	    ",bloom_bit_count=%" PRIu32
-	    ",bloom_hash_count=%" PRIu32,
-	    lsm_tree->last, lsm_tree->chunk_count_limit,
-	    lsm_tree->chunk_max, lsm_tree->chunk_size,
-	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
-	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
-	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
+	    ",last=%" PRIu32, lsm_tree->last));
 	WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
 	for (i = 0; i < lsm_tree->nchunks; i++) {
 		chunk = lsm_tree->chunk[i];
@@ -243,9 +501,15 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 			    session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
 	}
 	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
-	ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+
+	/* Update the existing configuration with the new values. */
+	new_cfg[0] = lsm_tree->config;
+	new_cfg[1] = buf->data;
+	WT_ERR(__wt_config_collapse(session, new_cfg, &new_metadata));
+	ret = __wt_metadata_update(session, lsm_tree->name, new_metadata);
 	WT_ERR(ret);
 
 err:	__wt_scr_free(session, &buf);
+	__wt_free(session, new_metadata);
 	return (ret);
 }
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 7c188bf3dc7..cb1ddf22f84 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -27,6 +27,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
 
 	WT_UNUSED(final);	/* Only used in diagnostic builds */
 
+	WT_ASSERT(session, !lsm_tree->active);
 	/*
 	 * The work unit queue should be empty, but it's worth checking
 	 * since work units use a different locking scheme to regular tree
@@ -85,19 +86,27 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
  *	Close an LSM tree structure.
  */
 static int
-__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
 {
 	WT_DECL_RET;
 	int i;
 
-	/* Stop any active merges. */
-	F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+	/*
+	 * Stop any new work units being added. The barrier is necessary
+	 * because we rely on the state change being visible before checking
+	 * the tree queue state.
+	 */
+	lsm_tree->active = false;
+	WT_READ_BARRIER();
 
 	/*
-	 * Wait for all LSM operations and work units that were in flight to
-	 * finish.
+	 * Wait for all LSM operations to drain. If WiredTiger is shutting
+	 * down also wait for the tree reference count to drop to one (the
+	 * loop tests refcnt > 1), otherwise we know a user is holding a
+	 * reference to the tree, so exclusive access is not available.
 	 */
-	for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) {
+	for (i = 0;
+	    lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1); ++i) {
 		/*
 		 * Remove any work units from the manager queues. Do this step
 		 * repeatedly in case a work unit was in the process of being
@@ -114,11 +123,14 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 		if (i % WT_THOUSAND == 0) {
 			WT_WITHOUT_LOCKS(session, ret =
 			    __wt_lsm_manager_clear_tree(session, lsm_tree));
-			WT_RET(ret);
+			WT_ERR(ret);
 		}
 		__wt_yield();
 	}
 	return (0);
+
+err:	lsm_tree->active = true;
+	return (ret);
 }
 
 /*
@@ -142,7 +154,7 @@ __wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
 		 * is unconditional.
 		 */
 		(void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
-		WT_TRET(__lsm_tree_close(session, lsm_tree));
+		WT_TRET(__lsm_tree_close(session, lsm_tree, true));
 		WT_TRET(__lsm_tree_discard(session, lsm_tree, true));
 	}
 
@@ -157,9 +169,12 @@ static int
 __lsm_tree_set_name(WT_SESSION_IMPL *session,
     WT_LSM_TREE *lsm_tree, const char *uri)
 {
-	if (lsm_tree->name != NULL)
-		__wt_free(session, lsm_tree->name);
-	WT_RET(__wt_strdup(session, uri, &lsm_tree->name));
+	void *p;
+
+	WT_RET(__wt_strdup(session, uri, &p));
+
+	__wt_free(session, lsm_tree->name);
+	lsm_tree->name = p;
 	lsm_tree->filename = lsm_tree->name + strlen("lsm:");
 	return (0);
 }
@@ -306,15 +321,15 @@ int
 __wt_lsm_tree_create(WT_SESSION_IMPL *session,
     const char *uri, bool exclusive, const char *config)
 {
-	WT_CONFIG_ITEM cval;
-	WT_DECL_ITEM(buf);
 	WT_DECL_RET;
 	WT_LSM_TREE *lsm_tree;
 	const char *cfg[] =
-	    { WT_CONFIG_BASE(session, WT_SESSION_create), config, NULL };
-	char *tmpconfig;
+	    { WT_CONFIG_BASE(session, lsm_meta), config, NULL };
+	const char *metadata;
 
-	/* If the tree is open, it already exists. */
+	metadata = NULL;
+
+	/* If the tree can be opened, it already exists. */
 	WT_WITH_HANDLE_LIST_LOCK(session,
 	    ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
 	if (ret == 0) {
@@ -323,128 +338,9 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
 	}
 	WT_RET_NOTFOUND_OK(ret);
 
-	/*
-	 * If the tree has metadata, it already exists.
-	 *
-	 * !!!
-	 * Use a local variable: we don't care what the existing configuration
-	 * is, but we don't want to overwrite the real config.
-	 */
-	if (__wt_metadata_search(session, uri, &tmpconfig) == 0) {
-		__wt_free(session, tmpconfig);
-		return (exclusive ? EEXIST : 0);
-	}
-	WT_RET_NOTFOUND_OK(ret);
-
-	/* In-memory configurations don't make sense for LSM. */
-	if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
-		WT_RET_MSG(session, EINVAL,
-		    "LSM trees not supported by in-memory configurations");
-
-	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
-	if (WT_STRING_MATCH("r", cval.str, cval.len))
-		WT_RET_MSG(session, EINVAL,
-		    "LSM trees cannot be configured as column stores");
-
-	WT_RET(__wt_calloc_one(session, &lsm_tree));
-
-	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
-
-	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
-	WT_ERR(__wt_strndup(
-	    session, cval.str, cval.len, &lsm_tree->key_format));
-	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
-	WT_ERR(__wt_strndup(
-	    session, cval.str, cval.len, &lsm_tree->value_format));
-
-	WT_ERR(__wt_config_gets_none(session, cfg, "collator", &cval));
-	WT_ERR(__wt_strndup(
-	    session, cval.str, cval.len, &lsm_tree->collator_name));
-
-	WT_ERR(__wt_config_gets(session, cfg, "cache_resident", &cval));
-	if (cval.val != 0)
-		WT_ERR_MSG(session, EINVAL,
-		    "The cache_resident flag is not compatible with LSM");
-
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
-	if (cval.val)
-		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
-	else
-		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
-	FLD_SET(lsm_tree->bloom,
-	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
-	if (cval.val != 0)
-		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);
-
-	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
-	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
-		WT_ERR_MSG(session, EINVAL,
-		    "Bloom filters can only be created on newest and oldest "
-		    "chunks if bloom filters are enabled");
-
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
-	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
-		cval.str++;
-		cval.len -= 2;
-	}
-	WT_ERR(__wt_config_check(session,
-	   WT_CONFIG_REF(session, WT_SESSION_create), cval.str, cval.len));
-	WT_ERR(__wt_strndup(
-	    session, cval.str, cval.len, &lsm_tree->bloom_config));
-
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
-	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
-	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_count_limit", &cval));
-	lsm_tree->chunk_count_limit = (uint32_t)cval.val;
-	if (cval.val == 0)
-		F_SET(lsm_tree, WT_LSM_TREE_MERGES);
-	else
-		F_CLR(lsm_tree, WT_LSM_TREE_MERGES);
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
-	lsm_tree->chunk_max = (uint64_t)cval.val;
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
-	lsm_tree->chunk_size = (uint64_t)cval.val;
-	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
-		WT_ERR_MSG(session, EINVAL,
-		    "Chunk size (chunk_size) must be smaller than or equal to "
-		    "the maximum chunk size (chunk_max)");
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
-	lsm_tree->merge_max = (uint32_t)cval.val;
-	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
-	lsm_tree->merge_min = (uint32_t)cval.val;
-	if (lsm_tree->merge_min > lsm_tree->merge_max)
-		WT_ERR_MSG(session, EINVAL,
-		    "LSM merge_min must be less than or equal to merge_max");
-
 	if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
-		/*
-		 * Set up the config for each chunk.
-		 *
-		 * Make the memory_page_max double the chunk size, so
-		 * application threads don't immediately try to force evict
-		 * the chunk when the worker thread clears the NO_EVICTION flag.
-		 */
-		WT_ERR(__wt_scr_alloc(session, 0, &buf));
-		WT_ERR(__wt_buf_fmt(session, buf,
-		    "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
-		    config, 2 * lsm_tree->chunk_max));
-		WT_ERR(__wt_strndup(
-		    session, buf->data, buf->size, &lsm_tree->file_config));
-
-		/* Create the first chunk and flush the metadata. */
-		WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
-
-		/* Discard our partially populated handle. */
-		ret = __lsm_tree_discard(session, lsm_tree, false);
-		lsm_tree = NULL;
-	} else {
-		F_CLR(lsm_tree, WT_LSM_TREE_MERGES);
-		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OFF);
-		FLD_CLR(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);
+		WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
+		WT_ERR(__wt_metadata_insert(session, uri, metadata));
 	}
 
 	/*
@@ -452,16 +348,12 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
 	 * error: the returned handle is NULL on error, and the metadata
 	 * tracking macros handle cleaning up on failure.
 	 */
-	if (ret == 0)
-		WT_WITH_HANDLE_LIST_LOCK(session,
-		    ret = __lsm_tree_open(session, uri, true, &lsm_tree));
+	WT_WITH_HANDLE_LIST_LOCK(session,
+	    ret = __lsm_tree_open(session, uri, true, &lsm_tree));
 	if (ret == 0)
 		__wt_lsm_tree_release(session, lsm_tree);
 
-	if (0) {
-err:		WT_TRET(__lsm_tree_discard(session, lsm_tree, false));
-	}
-	__wt_scr_free(session, &buf);
+err:	__wt_free(session, metadata);
 	return (ret);
 }
 
@@ -483,27 +375,26 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
 	/* See if the tree is already open. */
 	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
 		if (strcmp(uri, lsm_tree->name) == 0) {
-			/*
-			 * Short circuit if the handle is already held
-			 * exclusively or exclusive access is requested and
-			 * there are references held.
-			 */
-			if ((exclusive && lsm_tree->refcnt > 0) ||
-			    lsm_tree->exclusive)
-				return (EBUSY);
-
 			if (exclusive) {
 				/*
 				 * Make sure we win the race to switch on the
 				 * exclusive flag.
 				 */
-				if (!__wt_atomic_cas8(
-				    &lsm_tree->exclusive, 0, 1))
+				if (!__wt_atomic_cas_ptr(
+				    &lsm_tree->excl_session, NULL, session))
 					return (EBUSY);
-				/* Make sure there are no readers */
-				if (!__wt_atomic_cas32(
-				    &lsm_tree->refcnt, 0, 1)) {
-					lsm_tree->exclusive = 0;
+
+				/*
+				 * Drain the work queue before checking for
+				 * open cursors - otherwise we can generate
+				 * spurious busy returns.
+				 */
+				(void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
+				if (__lsm_tree_close(
+				    session, lsm_tree, false) != 0 ||
+				    lsm_tree->refcnt != 1) {
+					__wt_lsm_tree_release(
+					    session, lsm_tree);
 					return (EBUSY);
 				}
 			} else {
@@ -513,11 +404,11 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
 				 * We got a reference, check if an exclusive
 				 * lock beat us to it.
 				 */
-				if (lsm_tree->exclusive) {
+				if (lsm_tree->excl_session != NULL) {
 					WT_ASSERT(session,
 					    lsm_tree->refcnt > 0);
-					(void)__wt_atomic_sub32(
-					    &lsm_tree->refcnt, 1);
+					__wt_lsm_tree_release(
+					    session, lsm_tree);
 					return (EBUSY);
 				}
 			}
@@ -609,7 +500,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
 	 * with getting handles exclusive.
 	 */
 	lsm_tree->refcnt = 1;
-	lsm_tree->exclusive = exclusive ? 1 : 0;
+	lsm_tree->excl_session = exclusive ? session : NULL;
 	lsm_tree->queue_ref = 0;
 
 	/* Set a flush timestamp as a baseline. */
@@ -617,7 +508,9 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
 
 	/* Now the tree is setup, make it visible to others. */
 	TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
-	F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);
+	if (!exclusive)
+		lsm_tree->active = true;
+	F_SET(lsm_tree, WT_LSM_TREE_OPEN);
 
 	*treep = lsm_tree;
 
@@ -644,7 +537,7 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session,
 		ret = __lsm_tree_open(session, uri, exclusive, treep);
 
 	WT_ASSERT(session, ret != 0 ||
-	    (exclusive ? 1 : 0)  == (*treep)->exclusive);
+	     (*treep)->excl_session == (exclusive ? session : NULL));
 	return (ret);
 }
 
@@ -656,8 +549,11 @@ void
 __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 {
 	WT_ASSERT(session, lsm_tree->refcnt > 0);
-	if (lsm_tree->exclusive)
-		lsm_tree->exclusive = 0;
+	if (lsm_tree->excl_session == session) {
+		/* We cleared the active flag when getting exclusive access. */
+		lsm_tree->active = true;
+		lsm_tree->excl_session = NULL;
+	}
 	(void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
 }
 
@@ -874,7 +770,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
 	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
 	++lsm_tree->dsk_gen;
 
-	lsm_tree->modified = 1;
+	lsm_tree->modified = true;
 
 	/*
 	 * Set the switch transaction in the previous chunk unless this is
@@ -970,9 +866,7 @@ __wt_lsm_tree_drop(
 	WT_WITH_HANDLE_LIST_LOCK(session,
 	    ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
 	WT_RET(ret);
-
-	/* Shut down the LSM worker. */
-	WT_ERR(__lsm_tree_close(session, lsm_tree));
+	WT_ASSERT(session, !lsm_tree->active);
 
 	/* Prevent any new opens. */
 	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
@@ -1001,6 +895,7 @@ __wt_lsm_tree_drop(
 	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
 	ret = __wt_metadata_remove(session, name);
 
+	WT_ASSERT(session, !lsm_tree->active);
 err:	if (locked)
 		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
 	WT_WITH_HANDLE_LIST_LOCK(session,
@@ -1033,9 +928,6 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
 	    ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree));
 	WT_RET(ret);
 
-	/* Shut down the LSM worker. */
-	WT_ERR(__lsm_tree_close(session, lsm_tree));
-
 	/* Prevent any new opens. */
 	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
 	locked = true;
@@ -1073,8 +965,8 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
 
 err:	if (locked)
 		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
-	if (old != NULL)
-		__wt_free(session, old);
+	__wt_free(session, old);
+
 	/*
 	 * Discard this LSM tree structure. The first operation on the renamed
 	 * tree will create a new one.
@@ -1108,9 +1000,6 @@ __wt_lsm_tree_truncate(
 	    ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
 	WT_RET(ret);
 
-	/* Shut down the LSM worker. */
-	WT_ERR(__lsm_tree_close(session, lsm_tree));
-
 	/* Prevent any new opens. */
 	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
 	locked = true;
@@ -1314,8 +1203,8 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
 	if (chunk != NULL) {
 		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
 		    "Compact force flush %s flags 0x%" PRIx32
-		    " chunk %u flags 0x%"
-		    PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
+		    " chunk %" PRIu32 " flags 0x%" PRIx32,
+		    name, lsm_tree->flags, chunk->id, chunk->flags));
 		flushing = true;
 		/*
 		 * Make sure the in-memory chunk gets flushed do not push a
@@ -1337,7 +1226,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
 	}
 
 	/* Wait for the work unit queues to drain. */
-	while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+	while (lsm_tree->active) {
 		/*
 		 * The flush flag is cleared when the chunk has been flushed.
 		 * Continue to push forced flushes until the chunk is on disk.
@@ -1348,7 +1237,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
 			if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
 				WT_ERR(__wt_verbose(session,
 				    WT_VERB_LSM,
-				    "Compact flush done %s chunk %u.  "
+				    "Compact flush done %s chunk %" PRIu32 ". "
 				    "Start compacting progress %" PRIu64,
 				    name, chunk->id,
 				    lsm_tree->merge_progressing));
@@ -1359,7 +1248,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
 				progress = lsm_tree->merge_progressing;
 			} else {
 				WT_ERR(__wt_verbose(session, WT_VERB_LSM,
-				    "Compact flush retry %s chunk %u",
+				    "Compact flush retry %s chunk %" PRIu32,
 				    name, chunk->id));
 				WT_ERR(__wt_lsm_manager_push_entry(session,
 				    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
@@ -1419,7 +1308,6 @@ err:
 
 	__wt_lsm_tree_release(session, lsm_tree);
 	return (ret);
-
 }
 
 /*
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index 7723818f607..87771e2cb6c 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -29,7 +29,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session,
 	cookie->nchunks = 0;
 
 	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
-	if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+	if (!lsm_tree->active)
 		return (__wt_lsm_tree_readunlock(session, lsm_tree));
 
 	/* Take a copy of the current state of the LSM tree. */
@@ -72,14 +72,14 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
 {
 	WT_DECL_RET;
 	WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk;
-	u_int i;
+	uint32_t i;
 
 	*chunkp = NULL;
 	chunk = evict_chunk = flush_chunk = NULL;
 
 	WT_ASSERT(session, lsm_tree->queue_ref > 0);
 	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
-	if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0)
+	if (!lsm_tree->active || lsm_tree->nchunks == 0)
 		return (__wt_lsm_tree_readunlock(session, lsm_tree));
 
 	/* Search for a chunk to evict and/or a chunk to flush. */
@@ -118,7 +118,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
 
 	if (chunk != NULL) {
 		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
-		    "Flush%s: return chunk %u of %u: %s",
+		    "Flush%s: return chunk %" PRIu32 " of %" PRIu32 ": %s",
 		    force ? " w/ force" : "",
 		    i, lsm_tree->nchunks, chunk->uri));
 
@@ -322,7 +322,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
 		 */
 		saved_isolation = session->txn.isolation;
 		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
-		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
+		ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES);
 		session->txn.isolation = saved_isolation;
 		WT_TRET(__wt_session_release_btree(session));
 	}
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index 7562cb1cae3..0874da8db13 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -20,7 +20,7 @@ int
 __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args)
 {
 	WT_RET(__wt_verbose(session, WT_VERB_LSM_MANAGER,
-	    "Start LSM worker %d type 0x%x", args->id, args->type));
+	    "Start LSM worker %u type %#" PRIx32, args->id, args->type));
 	return (__wt_thread_create(session, &args->tid, __lsm_worker, args));
 }
 
@@ -59,9 +59,8 @@ __lsm_worker_general_op(
 		 */
 		if (chunk != NULL) {
 			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
-			    "Flush%s chunk %d %s",
-			    force ? " w/ force" : "",
-			    chunk->id, chunk->uri));
+			    "Flush%s chunk %" PRIu32 " %s",
+			    force ? " w/ force" : "", chunk->id, chunk->uri));
 			ret = __wt_lsm_checkpoint_chunk(
 			    session, entry->lsm_tree, chunk);
 			WT_ASSERT(session, chunk->refcnt > 0);
@@ -140,7 +139,7 @@ __lsm_worker(void *arg)
 			if (ret == WT_NOTFOUND) {
 				F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING);
 				ret = 0;
-			} else if (ret == EBUSY)
+			} else if (ret == EBUSY || ret == EINTR)
 				ret = 0;
 
 			/* Paranoia: clear session state. */
@@ -164,7 +163,7 @@ __lsm_worker(void *arg)
 	if (ret != 0) {
 err:		__wt_lsm_manager_free_work_unit(session, entry);
 		WT_PANIC_MSG(session, ret,
-		    "Error in LSM worker thread %d", cookie->id);
+		    "Error in LSM worker thread %u", cookie->id);
 	}
 	return (WT_THREAD_RET_VALUE);
 }
diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c
index df4cd2cb4d6..0a864432daf 100644
--- a/src/meta/meta_ckpt.c
+++ b/src/meta/meta_ckpt.c
@@ -212,8 +212,7 @@ __ckpt_last_name(
 		if (found && a.val < found)
 			continue;
 
-		if (*namep != NULL)
-			__wt_free(session, *namep);
+		__wt_free(session, *namep);
 		WT_ERR(__wt_strndup(session, k.str, k.len, namep));
 		found = a.val;
 	}
@@ -221,7 +220,7 @@ __ckpt_last_name(
 		ret = WT_NOTFOUND;
 
 	if (0) {
-err:		__wt_free(session, namep);
+err:		__wt_free(session, *namep);
 	}
 	return (ret);
 }
diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c
index 61cc009c983..e5f2727b5b6 100644
--- a/src/meta/meta_table.c
+++ b/src/meta/meta_table.c
@@ -67,18 +67,16 @@ __wt_metadata_cursor_open(
 	btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree;
 
 	/* 
-	 * Set special flags for the metadata file: eviction (the metadata file
-	 * is in-memory and never evicted), logging (the metadata file is always
-	 * logged if possible).
+	 * Special settings for metadata: skew eviction so metadata almost
+	 * always stays in cache and make sure metadata is logged if possible.
 	 *
-	 * Test flags before setting them so updates can't race in subsequent
-	 * opens (the first update is safe because it's single-threaded from
+	 * Test before setting so updates can't race in subsequent opens (the
+	 * first update is safe because it's single-threaded from
 	 * wiredtiger_open).
 	 */
-	if (!F_ISSET(btree, WT_BTREE_IN_MEMORY))
-		F_SET(btree, WT_BTREE_IN_MEMORY);
-	if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
-		F_SET(btree, WT_BTREE_NO_EVICTION);
+	if (btree->evict_priority == 0)
+		WT_WITH_BTREE(session, btree,
+		    __wt_evict_priority_set(session, WT_EVICT_INT_SKEW));
 	if (F_ISSET(btree, WT_BTREE_NO_LOGGING))
 		F_CLR(btree, WT_BTREE_NO_LOGGING);
 
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
index 3876f9a1afe..cfc7b80450e 100644
--- a/src/os_posix/os_alloc.c
+++ b/src/os_posix/os_alloc.c
@@ -18,22 +18,13 @@
 #include <gperftools/tcmalloc.h>
 
 #define	calloc			tc_calloc
+#define	malloc			tc_malloc
 #define	realloc 		tc_realloc
 #define	posix_memalign 		tc_posix_memalign
 #define	free 			tc_free
 #endif
 
 /*
- * There's no malloc interface, WiredTiger never calls malloc.
- *
- * The problem is an application might allocate memory, write secret stuff in
- * it, free the memory, then WiredTiger allocates the memory and uses it for a
- * file page or log record, then writes it to disk, without having overwritten
- * it fully.  That results in the secret stuff being protected by WiredTiger's
- * permission mechanisms, potentially inappropriate for the secret stuff.
- */
-
-/*
  * __wt_calloc --
  *	ANSI calloc function.
  */
@@ -67,12 +58,46 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
 }
 
 /*
- * __wt_realloc --
- *	ANSI realloc function.
+ * __wt_malloc --
+ *	ANSI malloc function.
  */
 int
-__wt_realloc(WT_SESSION_IMPL *session,
-    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+__wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp)
+{
+	void *p;
+
+	/*
+	 * Defensive: if our caller doesn't handle errors correctly, ensure a
+	 * free won't fail.
+	 */
+	*(void **)retp = NULL;
+
+	/*
+	 * !!!
+	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
+	 */
+	WT_ASSERT(session, bytes_to_allocate != 0);
+
+	if (session != NULL)
+		WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+	if ((p = malloc(bytes_to_allocate)) == NULL)
+		WT_RET_MSG(session, __wt_errno(),
+		    "memory allocation of %" WT_SIZET_FMT " bytes failed",
+		    bytes_to_allocate);
+
+	*(void **)retp = p;
+	return (0);
+}
+
+/*
+ * __realloc_func --
+ *	ANSI realloc function.
+ */
+static int
+__realloc_func(WT_SESSION_IMPL *session,
+    size_t *bytes_allocated_ret, size_t bytes_to_allocate, bool clear_memory,
+    void *retp)
 {
 	void *p;
 	size_t bytes_allocated;
@@ -107,15 +132,12 @@ __wt_realloc(WT_SESSION_IMPL *session,
 		    bytes_to_allocate);
 
 	/*
-	 * Clear the allocated memory -- an application might: allocate memory,
-	 * write secret stuff into it, free the memory, then we re-allocate the
-	 * memory and use it for a file page or log record, and then write it to
-	 * disk.  That would result in the secret stuff being protected by the
-	 * WiredTiger permission mechanisms, potentially inappropriate for the
-	 * secret stuff.
+	 * Clear the newly allocated memory if requested: parts of WiredTiger
+	 * depend on allocated memory being cleared.
 	 */
-	memset((uint8_t *)
-	    p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);
+	if (clear_memory)
+		memset((uint8_t *)p + bytes_allocated,
+		    0, bytes_to_allocate - bytes_allocated);
 
 	/* Update caller's bytes allocated value. */
 	if (bytes_allocated_ret != NULL)
@@ -126,9 +148,33 @@ __wt_realloc(WT_SESSION_IMPL *session,
 }
 
 /*
+ * __wt_realloc --
+ *	WiredTiger's realloc API.
+ */
+int
+__wt_realloc(WT_SESSION_IMPL *session,
+    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+	return (__realloc_func(
+	    session, bytes_allocated_ret, bytes_to_allocate, true, retp));
+}
+
+/*
+ * __wt_realloc_noclear --
+ *	WiredTiger's realloc API, not clearing allocated memory.
+ */
+int
+__wt_realloc_noclear(WT_SESSION_IMPL *session,
+    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+	return (__realloc_func(
+	    session, bytes_allocated_ret, bytes_to_allocate, false, retp));
+}
+
+/*
  * __wt_realloc_aligned --
  *	ANSI realloc function that aligns to buffer boundaries, configured with
- *	the "buffer_alignment" key to wiredtiger_open.
+ * the "buffer_alignment" key to wiredtiger_open.
  */
 int
 __wt_realloc_aligned(WT_SESSION_IMPL *session,
@@ -184,10 +230,6 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session,
 		__wt_free(session, p);
 		p = newp;
 
-		/* Clear the allocated memory (see above). */
-		memset((uint8_t *)p + bytes_allocated, 0,
-		    bytes_to_allocate - bytes_allocated);
-
 		/* Update caller's bytes allocated value. */
 		if (bytes_allocated_ret != NULL)
 			*bytes_allocated_ret = bytes_to_allocate;
@@ -200,11 +242,11 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session,
 	 * If there is no posix_memalign function, or no alignment configured,
 	 * fall back to realloc.
 	 *
-	 * Windows note: Visual C CRT memalign does not match Posix behavior
-	 * and would also double each allocation so it is bad for memory use
+	 * Windows note: Visual C CRT memalign does not match POSIX behavior
+	 * and would also double each allocation so it is bad for memory use.
 	 */
-	return (__wt_realloc(
-	    session, bytes_allocated_ret, bytes_to_allocate, retp));
+	return (__realloc_func(
+	    session, bytes_allocated_ret, bytes_to_allocate, false, retp));
 }
 
 /*
@@ -221,13 +263,14 @@ __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
 		return (0);
 	}
 
-	WT_RET(__wt_calloc(session, len + 1, 1, &p));
+	WT_RET(__wt_malloc(session, len + 1, &p));
 
 	/*
 	 * Don't change this to strncpy, we rely on this function to duplicate
 	 * "strings" that contain nul bytes.
 	 */
 	memcpy(p, str, len);
+	((uint8_t *)p)[len] = '\0';
 
 	*(void **)retp = p;
 	return (0);
diff --git a/src/os_posix/os_stdio.c b/src/os_posix/os_stdio.c
index 7ab107eda1e..65a0f40a659 100644
--- a/src/os_posix/os_stdio.c
+++ b/src/os_posix/os_stdio.c
@@ -46,8 +46,7 @@ __wt_fopen(WT_SESSION_IMPL *session,
 	if (*fpp == NULL)
 		ret = __wt_errno();
 
-	if (pathbuf != NULL)
-		__wt_free(session, pathbuf);
+	__wt_free(session, pathbuf);
 
 	if (ret == 0)
 		return (0);
diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c
index 98da5b405c3..1393eb9a9c1 100644
--- a/src/packing/pack_stream.c
+++ b/src/packing/pack_stream.c
@@ -65,8 +65,7 @@ wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp)
 	if (usedp != NULL)
 		*usedp = WT_PTRDIFF(ps->p, ps->start);
 
-	if (ps != NULL)
-		__wt_free(ps->pack.session, ps);
+	__wt_free(ps->pack.session, ps);
 
 	return (0);
 }
@@ -327,3 +326,139 @@ wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
 	}
 	return (0);
 }
+
+/*
+ * __wt_ext_pack_start --
+ *	WT_EXTENSION.pack_start method.
+ */
+int
+__wt_ext_pack_start(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, const char *format,
+    void *buffer, size_t size, WT_PACK_STREAM **psp)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+	if (wt_session == NULL)
+		wt_session = (WT_SESSION *)conn->default_session;
+	return (wiredtiger_pack_start(wt_session, format, buffer, size, psp));
+}
+
+/*
+ * __wt_ext_unpack_start --
+ *	WT_EXTENSION.unpack_start
+ */
+int
+__wt_ext_unpack_start(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, const char *format,
+    const void *buffer, size_t size, WT_PACK_STREAM **psp)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+	if (wt_session == NULL)
+		wt_session = (WT_SESSION *)conn->default_session;
+	return (wiredtiger_unpack_start(wt_session, format, buffer, size, psp));
+}
+
+/*
+ * __wt_ext_pack_close --
+ *	WT_EXTENSION.pack_close
+ */
+int
+__wt_ext_pack_close(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, size_t *usedp)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_pack_close(ps, usedp));
+}
+
+/*
+ * __wt_ext_pack_item --
+ *	WT_EXTENSION.pack_item
+ */
+int
+__wt_ext_pack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_pack_item(ps, item));
+}
+
+/*
+ * __wt_ext_pack_int --
+ *	WT_EXTENSION.pack_int
+ */
+int
+__wt_ext_pack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t i)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_pack_int(ps, i));
+}
+
+/*
+ * __wt_ext_pack_str --
+ *	WT_EXTENSION.pack_str
+ */
+int
+__wt_ext_pack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char *s)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_pack_str(ps, s));
+}
+
+/*
+ * __wt_ext_pack_uint --
+ *	WT_EXTENSION.pack_uint
+ */
+int
+__wt_ext_pack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t u)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_pack_uint(ps, u));
+}
+
+/*
+ * __wt_ext_unpack_item --
+ *	WT_EXTENSION.unpack_item
+ */
+int
+__wt_ext_unpack_item(WT_EXTENSION_API *wt_api,
+    WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_unpack_item(ps, item));
+}
+
+/*
+ * __wt_ext_unpack_int --
+ *	WT_EXTENSION.unpack_int
+ */
+int
+__wt_ext_unpack_int(WT_EXTENSION_API *wt_api,
+    WT_PACK_STREAM *ps, int64_t *ip)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_unpack_int(ps, ip));
+}
+
+/*
+ * __wt_ext_unpack_str --
+ *	WT_EXTENSION.unpack_str
+ */
+int
+__wt_ext_unpack_str(WT_EXTENSION_API *wt_api,
+    WT_PACK_STREAM *ps, const char **sp)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_unpack_str(ps, sp));
+}
+
+/*
+ * __wt_ext_unpack_uint --
+ *	WT_EXTENSION.unpack_uint
+ */
+int
+__wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t *up)
+{
+	WT_UNUSED(wt_api);
+	return (wiredtiger_unpack_uint(ps, up));
+}
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index f245ff5d921..a69f335c9b3 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1960,12 +1960,21 @@ __rec_split_init(WT_SESSION_IMPL *session,
 	WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size));
 
 	/*
-	 * Clear the disk page's header and block-manager space, set the page
-	 * type (the type doesn't change, and setting it later would require
-	 * additional code in a few different places).
+	 * Clear the disk page header to ensure all of it is initialized, even
+	 * the unused fields.
+	 *
+	 * In the case of fixed-length column-store, clear the entire buffer:
+	 * fixed-length column-store sets bits in bytes, where the bytes are
+	 * assumed to initially be 0.
+	 */
+	memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
+	    corrected_page_size : WT_PAGE_HEADER_SIZE);
+
+	/*
+	 * Set the page type (the type doesn't change, and setting it later
+	 * would require additional code in a few different places).
 	 */
 	dsk = r->disk_image.mem;
-	memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
 	dsk->type = page->type;
 
 	/*
@@ -3026,13 +3035,13 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
 	 * The data isn't laid out on a page boundary or nul padded; copy it to
 	 * a clean, aligned, padded buffer before writing it.
 	 *
-	 * Allocate a scratch buffer to hold the new disk image.  Copy the
-	 * WT_PAGE_HEADER header onto the scratch buffer, most of the header
-	 * information remains unchanged between the pages.
+	 * Allocate a scratch buffer to hold the new disk image. Copy the disk
+	 * page's header and block-manager space into the scratch buffer, most
+	 * of the header information remains unchanged between the pages.
 	 */
 	WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp));
 	dsk = tmp->mem;
-	memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_SIZE);
+	memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree));
 
 	/*
 	 * For each split chunk we've created, update the disk image and copy
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index 9b3b76b62de..756f1fdcc6c 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -9,22 +9,6 @@
 #include "wt_internal.h"
 
 /*
- * __wt_schema_create_strip --
- *	Discard any configuration information from a schema entry that is not
- * applicable to an session.create call, here for the wt dump command utility,
- * which only wants to dump the schema information needed for load.
- */
-int
-__wt_schema_create_strip(WT_SESSION_IMPL *session,
-    const char *v1, const char *v2, char **value_ret)
-{
-	const char *cfg[] =
-	    { WT_CONFIG_BASE(session, WT_SESSION_create), v1, v2, NULL };
-
-	return (__wt_config_collapse(session, cfg, value_ret));
-}
-
-/*
  * __wt_direct_io_size_check --
  *	Return a size from the configuration, complaining if it's insufficient
  * for direct I/O.
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index 49318f80959..e7ce4e42498 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -109,8 +109,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
 
 err:	__wt_scr_free(session, &buf);
 	__wt_schema_destroy_colgroup(session, &colgroup);
-	if (cgconfig != NULL)
-		__wt_free(session, cgconfig);
+	__wt_free(session, cgconfig);
 	return (ret);
 }
 
diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c
index 612a2d2d192..12a1aa9c22f 100644
--- a/src/schema/schema_plan.c
+++ b/src/schema/schema_plan.c
@@ -212,7 +212,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
 				WT_ASSERT(session, !value_only ||
 				    coltype == WT_PROJ_VALUE);
 				WT_RET(__wt_buf_catfmt(
-				    session, plan, "%d%c", cg, coltype));
+				    session, plan, "%u%c", cg, coltype));
 
 				/*
 				 * Set the current column group and column
@@ -226,7 +226,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
 			if (current_col < col) {
 				if (col - current_col > 1)
 					WT_RET(__wt_buf_catfmt(session,
-					    plan, "%d", col - current_col));
+					    plan, "%u", col - current_col));
 				WT_RET(__wt_buf_catfmt(session,
 				    plan, "%c", WT_PROJ_SKIP));
 			}
@@ -375,8 +375,8 @@ __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table,
 			pv.type = 'u';
 
 		if (pv.havesize)
-			WT_RET(__wt_buf_catfmt(
-			    session, format, "%d%c", (int)pv.size, pv.type));
+			WT_RET(__wt_buf_catfmt(session,
+			    format, "%" PRIu32 "%c", pv.size, pv.type));
 		else
 			WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
 	} while (have_next);
@@ -399,8 +399,8 @@ __wt_struct_truncate(WT_SESSION_IMPL *session,
 	while (ncols-- > 0) {
 		WT_RET(__pack_next(&pack, &pv));
 		if (pv.havesize)
-			WT_RET(__wt_buf_catfmt(
-			    session, format, "%d%c", (int)pv.size, pv.type));
+			WT_RET(__wt_buf_catfmt(session,
+			    format, "%" PRIu32 "%c", pv.size, pv.type));
 		else
 			WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
 	}
diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c
index e7752b60ca4..d9a798b6ed8 100644
--- a/src/schema/schema_truncate.c
+++ b/src/schema/schema_truncate.c
@@ -131,22 +131,19 @@ int
 __wt_schema_range_truncate(
     WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop)
 {
-	WT_CURSOR *cursor;
 	WT_DATA_SOURCE *dsrc;
 	WT_DECL_RET;
 	const char *uri;
 
-	cursor = (start != NULL) ? start : stop;
-	uri = cursor->internal_uri;
+	uri = start->internal_uri;
 
 	if (WT_PREFIX_MATCH(uri, "file:")) {
-		if (start != NULL)
-			WT_CURSOR_NEEDKEY(start);
+		WT_CURSOR_NEEDKEY(start);
 		if (stop != NULL)
 			WT_CURSOR_NEEDKEY(stop);
-		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree,
+		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)start)->btree,
 		    ret = __wt_btcur_range_truncate(
-			(WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop));
+		    (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop));
 	} else if (WT_PREFIX_MATCH(uri, "table:"))
 		ret = __wt_table_range_truncate(
 		    (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop);
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index e60a7107786..52be76bb7a5 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -126,7 +126,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
 			   dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
 		else if (file_func == __wt_checkpoint)
 			;
-		else if (file_func == __wt_checkpoint_list)
+		else if (file_func == __wt_checkpoint_get_handles)
 			;
 		else if (file_func == __wt_checkpoint_sync)
 			;
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 2414229681b..bb496494234 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -40,7 +40,8 @@ __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers)
 		/* Stop when there are no positioned cursors. */
 		if (session->ncursors == 0)
 			break;
-		WT_TRET(cursor->reset(cursor));
+		if (!F_ISSET(cursor, WT_CURSTD_JOINED))
+			WT_TRET(cursor->reset(cursor));
 		/* Optionally, free the cursor buffers */
 		if (free_buffers) {
 			__wt_buf_free(session, &cursor->key);
@@ -492,10 +493,13 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config)
 		/*
 		 * We can't disallow type entirely, a configuration string might
 		 * innocently include it, for example, a dump/load pair.  If the
-		 * URI type prefix and the type are the same, let it go.
+		 * underlying type is "file", it's OK ("file" is the underlying
+		 * type for every type); if the URI type prefix and the type are
+		 * the same, let it go.
 		 */
 		if ((ret =
 		    __wt_config_getones(session, config, "type", &cval)) == 0 &&
+		    !WT_STRING_MATCH("file", cval.str, cval.len) &&
 		    (strncmp(uri, cval.str, cval.len) != 0 ||
 		    uri[cval.len] != ':'))
 			WT_ERR_MSG(session, EINVAL,
@@ -1597,7 +1601,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
 	if (i == conn->session_size)
 		WT_ERR_MSG(session, ENOMEM,
 		    "only configured to support %" PRIu32 " sessions"
-		    " (including %d additional internal sessions)",
+		    " (including %" PRIu32 " additional internal sessions)",
 		    conn->session_size, WT_EXTRA_INTERNAL_SESSIONS);
 
 	/*
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index 242d9ac5cc4..ddf4d3dfa33 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -577,7 +577,7 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
 	 * files, since changes to the underlying file are visible to the in
 	 * memory pages.
 	 */
-	WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+	WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD));
 
 	/*
 	 * We lock checkpoint handles that we are overwriting, so the handle
diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c
new file mode 100644
index 00000000000..ec95622f333
--- /dev/null
+++ b/src/support/cond_auto.c
@@ -0,0 +1,136 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This is an implementation of condition variables that automatically adjust
+ * the wait time depending on whether the wake is resulting in useful work.
+ */
+
+/*
+ * __wt_cond_auto_alloc --
+ *	Allocate and initialize an automatically adjusting condition variable.
+ */
+int
+__wt_cond_auto_alloc(
+    WT_SESSION_IMPL *session, const char *name,
+    bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp)
+{
+	WT_CONDVAR *cond;
+
+	WT_RET(__wt_cond_alloc(session, name, is_signalled, condp));
+	cond = *condp;
+
+	cond->min_wait = min;
+	cond->max_wait = max;
+	cond->prev_wait = min;
+
+	return (0);
+}
+
+/*
+ * __wt_cond_auto_signal --
+ *	Signal a condition variable.
+ */
+int
+__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+
+	WT_ASSERT(session, cond->min_wait != 0);
+	return (__wt_cond_signal(session, cond));
+}
+
+/*
+ * __wt_cond_auto_wait_signal --
+ *	Wait on a condition variable, optionally timing out.  If we are
+ *	signalled before the timeout period expires, let the caller know.
+ *	TODO: Can this version of the API be removed, now that we have the
+ *	auto adjusting condition variables?
+ */
+int
+__wt_cond_auto_wait_signal(
+    WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled)
+{
+	uint64_t delta;
+
+	/*
+	 * Catch cases where this function is called with a condition variable
+	 * that was initialized non-auto.
+	 */
+	WT_ASSERT(session, cond->min_wait != 0);
+
+	WT_STAT_FAST_CONN_INCR(session, cond_auto_wait);
+	if (progress)
+		cond->prev_wait = cond->min_wait;
+	else {
+		delta = WT_MAX(1, (cond->max_wait - cond->min_wait) / 10);
+		cond->prev_wait = WT_MIN(
+		    cond->max_wait, cond->prev_wait + delta);
+	}
+
+	WT_RET(__wt_cond_wait_signal(
+	    session, cond, cond->prev_wait, signalled));
+
+	if (progress || *signalled)
+		WT_STAT_FAST_CONN_INCR(session, cond_auto_wait_reset);
+	if (*signalled)
+		cond->prev_wait = cond->min_wait;
+
+	return (0);
+}
+
+/*
+ * __wt_cond_auto_wait --
+ *	Wait on a condition variable, optionally timing out.  Unlike
+ *	__wt_cond_auto_wait_signal, the caller isn't told if it was signalled.
+ */
+int
+__wt_cond_auto_wait(
+    WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress)
+{
+	bool signalled;
+
+	/*
+	 * Call the signal version so the wait period is reset if the
+	 * condition is woken explicitly.
+	 */
+	WT_RET(__wt_cond_auto_wait_signal(session, cond, progress, &signalled));
+
+	return (0);
+}
+
+/*
+ * __wt_cond_auto_destroy --
+ *	Destroy a condition variable.
+ */
+int
+__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+	return (__wt_cond_destroy(session, condp));
+}
diff --git a/src/support/huffman.c b/src/support/huffman.c
index edd0bc9f648..1e1aaeab5b5 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -492,11 +492,12 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
 	uint8_t symbol;
 	uint32_t weighted_length;
 
-	printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: "
-	    "codes %u# * %uB  + code2symbol %u# * %uB\n",
+	printf("leaf depth %" PRIu16 "..%" PRIu16
+	    ", memory use: codes %u# * %" WT_SIZET_FMT
+	    "B + code2symbol %u# * %" WT_SIZET_FMT "B\n",
 	    huffman->min_depth, huffman->max_depth,
-	    huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE),
-	    1U << huffman->max_depth, (u_int)sizeof(uint16_t));
+	    huffman->numSymbols, sizeof(WT_HUFFMAN_CODE),
+	    1U << huffman->max_depth, sizeof(uint16_t));
 
 	/*
 	 * measure quality of computed Huffman codes, for different max bit
diff --git a/src/support/power8/crc32.S b/src/support/power8/crc32.S
index 3ef2928aaa1..c0b81143f07 100644
--- a/src/support/power8/crc32.S
+++ b/src/support/power8/crc32.S
@@ -65,14 +65,13 @@
 #define off96		r30
 #define off112		r31
 
-#define const1		v25
-#define const2		v26
+#define const1		v24
+#define const2		v25
 
-#define byteswap	v27
-#define	mask_32bit	v28
-#define	mask_64bit	v29
-#define zeroes		v30
-#define ones		v31
+#define byteswap	v26
+#define	mask_32bit	v27
+#define	mask_64bit	v28
+#define zeroes		v29
 
 #ifdef BYTESWAP_DATA
 #define VPERM(A, B, C, D) vperm	A, B, C, D
@@ -90,31 +89,6 @@ FUNC_START(__crc32_vpmsum)
 	std	r26,-48(r1)
 	std	r25,-56(r1)
 
-	li 	r31, -256
-	stvx 	v20, r31, r1
-	li 	r31, -240
-	stvx 	v21, r31, r1
-	li 	r31, -224
-	stvx 	v22, r31, r1
-	li 	r31, -208
-	stvx 	v23, r31, r1
-	li 	r31, -192
-	stvx 	v24, r31, r1
-	li 	r31, -176
-	stvx 	v25, r31, r1
-	li 	r31, -160
-	stvx 	v26, r31, r1
-	li 	r31, -144
-	stvx 	v27, r31, r1
-	li 	r31, -128
-	stvx 	v28, r31, r1
-	li 	r31, -112
-	stvx 	v29, r31, r1
-	li 	r31, -96
-	stvx 	v30, r31, r1
-	li 	r31, -80
-	stvx 	v31, r31, r1
-
 	li	off16,16
 	li	off32,32
 	li	off48,48
@@ -124,13 +98,28 @@ FUNC_START(__crc32_vpmsum)
 	li	off112,112
 	li	r0,0
 
+	/* Enough room for saving 10 non-volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
 	mr	r10,r3
 
 	vxor	zeroes,zeroes,zeroes
-	vspltisw ones,-1
+	vspltisw v0,-1
 
-	vsldoi	mask_32bit,zeroes,ones,4
-	vsldoi	mask_64bit,zeroes,ones,8
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
 
 	/* Get the initial value into v8 */
 	vxor	v8,v8,v8
@@ -596,30 +585,20 @@ FUNC_START(__crc32_vpmsum)
 	/* Get it into r3 */
 	MFVRD(r3, v0)
 
-	li 	r31, -256
-	lvx 	v20, r31, r1
-	li 	r31, -240
-	lvx 	v21, r31, r1
-	li 	r31, -224
-	lvx 	v22, r31, r1
-	li 	r31, -208
-	lvx 	v23, r31, r1
-	li 	r31, -192
-	lvx 	v24, r31, r1
-	li 	r31, -176
-	lvx 	v25, r31, r1
-	li 	r31, -160
-	lvx 	v26, r31, r1
-	li 	r31, -144
-	lvx 	v27, r31, r1
-	li 	r31, -128
-	lvx 	v28, r31, r1
-	li 	r31, -112
-	lvx 	v29, r31, r1
-	li 	r31, -96
-	lvx 	v30, r31, r1
-	li 	r31, -80
-	lvx 	v31, r31, r1
+.Lout:
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
 
 	ld	r31,-8(r1)
 	ld	r30,-16(r1)
@@ -786,6 +765,7 @@ FUNC_START(__crc32_vpmsum)
 
 .Lzero:
 	mr	r3,r10
-	blr
+	b	.Lout
+
 FUNC_END(__crc32_vpmsum)
 #endif
diff --git a/src/support/scratch.c b/src/support/scratch.c
index 94020ba2621..aea98dc49ef 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -45,7 +45,7 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
 			WT_RET(__wt_realloc_aligned(
 			    session, &buf->memsize, size, &buf->mem));
 		else
-			WT_RET(__wt_realloc(
+			WT_RET(__wt_realloc_noclear(
 			    session, &buf->memsize, size, &buf->mem));
 	}
 
diff --git a/src/support/stat.c b/src/support/stat.c
index 0df38bfe6b0..2a826eda962 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -581,6 +581,8 @@ static const char * const __stats_connection_desc[] = {
 	"cache: tracked dirty bytes in the cache",
 	"cache: tracked dirty pages in the cache",
 	"cache: unmodified pages evicted",
+	"connection: auto adjusting condition resets",
+	"connection: auto adjusting condition wait calls",
 	"connection: files currently open",
 	"connection: memory allocations",
 	"connection: memory frees",
@@ -619,6 +621,8 @@ static const char * const __stats_connection_desc[] = {
 	"log: log bytes written",
 	"log: log files manually zero-filled",
 	"log: log flush operations",
+	"log: log force write operations",
+	"log: log force write operations skipped",
 	"log: log records compressed",
 	"log: log records not compressed",
 	"log: log records too small to compress",
@@ -626,6 +630,7 @@ static const char * const __stats_connection_desc[] = {
 	"log: log scan operations",
 	"log: log scan records requiring two reads",
 	"log: log server thread advances write LSN",
+	"log: log server thread write LSN walk skipped",
 	"log: log sync operations",
 	"log: log sync_dir operations",
 	"log: log write operations",
@@ -773,6 +778,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
 		/* not clearing cache_bytes_dirty */
 		/* not clearing cache_pages_dirty */
 	stats->cache_eviction_clean = 0;
+	stats->cond_auto_wait_reset = 0;
+	stats->cond_auto_wait = 0;
 		/* not clearing file_open */
 	stats->memory_allocation = 0;
 	stats->memory_free = 0;
@@ -811,6 +818,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
 	stats->log_bytes_written = 0;
 	stats->log_zero_fills = 0;
 	stats->log_flush = 0;
+	stats->log_force_write = 0;
+	stats->log_force_write_skip = 0;
 	stats->log_compress_writes = 0;
 	stats->log_compress_write_fails = 0;
 	stats->log_compress_small = 0;
@@ -818,6 +827,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
 	stats->log_scans = 0;
 	stats->log_scan_rereads = 0;
 	stats->log_write_lsn = 0;
+	stats->log_write_lsn_skip = 0;
 	stats->log_sync = 0;
 	stats->log_sync_dir = 0;
 	stats->log_writes = 0;
@@ -974,6 +984,8 @@ __wt_stat_connection_aggregate(
 	to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty);
 	to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty);
 	to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
+	to->cond_auto_wait_reset += WT_STAT_READ(from, cond_auto_wait_reset);
+	to->cond_auto_wait += WT_STAT_READ(from, cond_auto_wait);
 	to->file_open += WT_STAT_READ(from, file_open);
 	to->memory_allocation += WT_STAT_READ(from, memory_allocation);
 	to->memory_free += WT_STAT_READ(from, memory_free);
@@ -1012,6 +1024,8 @@ __wt_stat_connection_aggregate(
 	to->log_bytes_written += WT_STAT_READ(from, log_bytes_written);
 	to->log_zero_fills += WT_STAT_READ(from, log_zero_fills);
 	to->log_flush += WT_STAT_READ(from, log_flush);
+	to->log_force_write += WT_STAT_READ(from, log_force_write);
+	to->log_force_write_skip += WT_STAT_READ(from, log_force_write_skip);
 	to->log_compress_writes += WT_STAT_READ(from, log_compress_writes);
 	to->log_compress_write_fails +=
 	    WT_STAT_READ(from, log_compress_write_fails);
@@ -1021,6 +1035,7 @@ __wt_stat_connection_aggregate(
 	to->log_scans += WT_STAT_READ(from, log_scans);
 	to->log_scan_rereads += WT_STAT_READ(from, log_scan_rereads);
 	to->log_write_lsn += WT_STAT_READ(from, log_write_lsn);
+	to->log_write_lsn_skip += WT_STAT_READ(from, log_write_lsn_skip);
 	to->log_sync += WT_STAT_READ(from, log_sync);
 	to->log_sync_dir += WT_STAT_READ(from, log_sync_dir);
 	to->log_writes += WT_STAT_READ(from, log_writes);
diff --git a/src/txn/txn.c b/src/txn/txn.c
index e8fd8c0c119..7a768a8fe20 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -344,7 +344,7 @@ retry:
 		    current_id - oldest_id > 10000 && oldest_session != NULL) {
 			(void)__wt_verbose(session, WT_VERB_TRANSACTION,
 			    "old snapshot %" PRIu64
-			    " pinned in session %d [%s]"
+			    " pinned in session %" PRIu32 " [%s]"
 			    " with snap_min %" PRIu64 "\n",
 			    oldest_id, oldest_session->id,
 			    oldest_session->lastop,
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 4bb8ccdc6f0..1eebc9e9d04 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -8,6 +8,10 @@
 
 #include "wt_internal.h"
 
+static int __checkpoint_lock_tree(
+    WT_SESSION_IMPL *, bool, bool, const char *[]);
+static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
+
 /*
  * __wt_checkpoint_name_ok --
  *	Complain if the checkpoint name isn't acceptable.
@@ -224,11 +228,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
 }
 
 /*
- * __wt_checkpoint_list --
+ * __wt_checkpoint_get_handles --
  *	Get a list of handles to flush.
  */
 int
-__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
 {
 	WT_DECL_RET;
 	const char *name;
@@ -254,6 +258,13 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
 	if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0)
 		return (ret == EBUSY ? 0 : ret);
 
+	WT_SAVE_DHANDLE(session,
+	    ret = __checkpoint_lock_tree(session, true, true, cfg));
+	if (ret != 0) {
+		WT_TRET(__wt_session_release_btree(session));
+		return (ret);
+	}
+
 	session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
 	return (0);
 }
@@ -267,7 +278,7 @@ __checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
 {
 	WT_UNUSED(cfg);
 
-	return (__wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
+	return (__wt_cache_op(session, WT_SYNC_WRITE_LEAVES));
 }
 
 /*
@@ -371,15 +382,20 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 	/* Configure logging only if doing a full checkpoint. */
 	logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED);
 
+	/* Keep track of handles acquired for locking. */
+	WT_ERR(__wt_meta_track_on(session));
+	tracking = true;
+
 	/*
 	 * Get a list of handles we want to flush; this may pull closed objects
 	 * into the session cache, but we're going to do that eventually anyway.
 	 */
+	WT_ASSERT(session, session->ckpt_handle_next == 0);
 	WT_WITH_SCHEMA_LOCK(session, ret,
 	    WT_WITH_TABLE_LOCK(session, ret,
 		WT_WITH_HANDLE_LIST_LOCK(session,
 		    ret = __checkpoint_apply_all(
-		    session, cfg, __wt_checkpoint_list, NULL))));
+		    session, cfg, __wt_checkpoint_get_handles, NULL))));
 	WT_ERR(ret);
 
 	/*
@@ -408,12 +424,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 	 * completion. Do it after flushing the pages to give the
 	 * asynchronous flush as much time as possible before we wait.
 	 */
-	if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
-		WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
-
-	/* Start the checkpoint for real. */
-	WT_ERR(__wt_meta_track_on(session));
-	tracking = true;
+	WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
 
 	/* Tell logging that we are about to start a database checkpoint. */
 	if (full && logging)
@@ -427,6 +438,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 		WT_ERR(__wt_epoch(session, &start));
 
 	/*
+	 * Start the checkpoint for real.
+	 *
 	 * Bump the global checkpoint generation, used to figure out whether
 	 * checkpoint has visited a tree.  There is no need for this to be
 	 * atomic: it is only written while holding the checkpoint lock.
@@ -490,7 +503,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 		WT_ERR(__wt_txn_checkpoint_log(
 		    session, full, WT_TXN_LOG_CKPT_START, NULL));
 
-	WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint));
+	WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper));
 
 	/*
 	 * Clear the dhandle so the visibility check doesn't get confused about
@@ -509,8 +522,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 	 * Checkpoints have to hit disk (it would be reasonable to configure for
 	 * lazy checkpoints, but we don't support them yet).
 	 */
-	if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
-		WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
+	WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
 
 	WT_ERR(__checkpoint_verbose_track(session,
 	    "sync completed", &verb_timer));
@@ -754,14 +766,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
 }
 
 /*
- * __checkpoint_worker --
- *	Checkpoint a tree.
+ * __checkpoint_lock_tree --
+ *	Acquire the locks required to checkpoint a tree.
  */
 static int
-__checkpoint_worker(WT_SESSION_IMPL *session,
-    const char *cfg[], bool is_checkpoint, bool need_tracking)
+__checkpoint_lock_tree(WT_SESSION_IMPL *session,
+    bool is_checkpoint, bool need_tracking, const char *cfg[])
 {
-	WT_BM *bm;
 	WT_BTREE *btree;
 	WT_CKPT *ckpt, *ckptbase;
 	WT_CONFIG dropconf;
@@ -769,19 +780,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
 	WT_CONNECTION_IMPL *conn;
 	WT_DATA_HANDLE *dhandle;
 	WT_DECL_RET;
-	WT_LSN ckptlsn;
-	int deleted, was_modified;
-	bool fake_ckpt, force, hot_backup_locked;
-	const char *name;
 	char *name_alloc;
+	const char *name;
+	bool hot_backup_locked;
 
 	btree = S2BT(session);
-	bm = btree->bm;
 	conn = S2C(session);
 	ckpt = ckptbase = NULL;
 	dhandle = session->dhandle;
-	was_modified = btree->modified;
-	fake_ckpt = hot_backup_locked = false;
+	hot_backup_locked = false;
 	name_alloc = NULL;
 
 	/*
@@ -800,15 +807,6 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
 	WT_ASSERT(session, !need_tracking ||
 	    WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session));
 
-	/*
-	 * Set the checkpoint LSN to the maximum LSN so that if logging is
-	 * disabled, recovery will never roll old changes forward over the
-	 * non-logged changes in this checkpoint.  If logging is enabled, a
-	 * real checkpoint LSN will be assigned later for this checkpoint and
-	 * overwrite this.
-	 */
-	WT_MAX_LSN(&ckptlsn);
-
 	/* Get the list of checkpoints for this file. */
 	WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
 
@@ -859,74 +857,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
 	/* Drop checkpoints with the same name as the one we're taking. */
 	__drop(ckptbase, name, strlen(name));
 
-	/*
-	 * Check for clean objects not requiring a checkpoint.
-	 *
-	 * If we're closing a handle, and the object is clean, we can skip the
-	 * checkpoint, whatever checkpoints we have are sufficient.  (We might
-	 * not have any checkpoints if the object was never modified, and that's
-	 * OK: the object creation code doesn't mark the tree modified so we can
-	 * skip newly created trees here.)
-	 *
-	 * If the application repeatedly checkpoints an object (imagine hourly
-	 * checkpoints using the same explicit or internal name), there's no
-	 * reason to repeat the checkpoint for clean objects.  The test is if
-	 * the only checkpoint we're deleting is the last one in the list and
-	 * it has the same name as the checkpoint we're about to take, skip the
-	 * work.  (We can't skip checkpoints that delete more than the last
-	 * checkpoint because deleting those checkpoints might free up space in
-	 * the file.)  This means an application toggling between two (or more)
-	 * checkpoint names will repeatedly take empty checkpoints, but that's
-	 * not likely enough to make detection worthwhile.
-	 *
-	 * Checkpoint read-only objects otherwise: the application must be able
-	 * to open the checkpoint in a cursor after taking any checkpoint, which
-	 * means it must exist.
-	 */
-	force = false;
-	F_CLR(btree, WT_BTREE_SKIP_CKPT);
-	if (!btree->modified && cfg != NULL) {
-		ret = __wt_config_gets(session, cfg, "force", &cval);
-		if (ret != 0 && ret != WT_NOTFOUND)
-			WT_ERR(ret);
-		if (ret == 0 && cval.val != 0)
-			force = true;
-	}
-	if (!btree->modified && !force) {
-		if (!is_checkpoint)
-			goto nockpt;
-
-		deleted = 0;
-		WT_CKPT_FOREACH(ckptbase, ckpt)
-			if (F_ISSET(ckpt, WT_CKPT_DELETE))
-				++deleted;
-		/*
-		 * Complicated test: if the last checkpoint in the object has
-		 * the same name as the checkpoint we're taking (correcting for
-		 * internal checkpoint names with their generational suffix
-		 * numbers), we can skip the checkpoint, there's nothing to do.
-		 * The exception is if we're deleting two or more checkpoints:
-		 * then we may save space.
-		 */
-		if (ckpt > ckptbase &&
-		    (strcmp(name, (ckpt - 1)->name) == 0 ||
-		    (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
-		    WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) &&
-		    deleted < 2) {
-nockpt:			F_SET(btree, WT_BTREE_SKIP_CKPT);
-			WT_PUBLISH(btree->checkpoint_gen,
-			    S2C(session)->txn_global.checkpoint_gen);
-			WT_STAT_FAST_DATA_SET(session,
-			    btree_checkpoint_generation,
-			    btree->checkpoint_gen);
-			goto done;
-		}
-	}
-
 	/* Add a new checkpoint entry at the end of the list. */
 	WT_CKPT_FOREACH(ckptbase, ckpt)
 		;
 	WT_ERR(__wt_strdup(session, name, &ckpt->name));
+	/*
+	 * We are now done with the local use of the name.  Free the local
+	 * allocation, if needed.
+	 */
+	__wt_free(session, name_alloc);
 	F_SET(ckpt, WT_CKPT_ADD);
 
 	/*
@@ -1007,32 +946,128 @@ nockpt:			F_SET(btree, WT_BTREE_SKIP_CKPT);
 	 * copy instead of forcing checkpoints on clean objects to associate
 	 * names with checkpoints.
 	 */
-	if (is_checkpoint)
-		switch (F_MASK(btree, WT_BTREE_SPECIAL_FLAGS)) {
-		case 0:
-			break;
-		case WT_BTREE_BULK:
-			/*
-			 * The only checkpoints a bulk-loaded file should have
-			 * are fake ones we created without the underlying block
-			 * manager.  I'm leaving this code here because it's a
-			 * cheap test and a nasty race.
-			 */
-			WT_CKPT_FOREACH(ckptbase, ckpt)
-				if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE))
-					WT_ERR_MSG(session, ret,
-					    "block-manager checkpoint found "
-					    "for a bulk-loaded file");
-			fake_ckpt = true;
-			goto fake;
-		case WT_BTREE_REBALANCE:
-		case WT_BTREE_SALVAGE:
-		case WT_BTREE_UPGRADE:
-		case WT_BTREE_VERIFY:
-			WT_ERR_MSG(session, EINVAL,
-			    "checkpoints are blocked during rebalance, "
-			    "salvage, upgrade or verify operations");
+	WT_ASSERT(session,
+	    !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));
+
+	hot_backup_locked = false;
+	WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
+
+	WT_ASSERT(session, btree->ckpt == NULL);
+	btree->ckpt = ckptbase;
+
+	return (0);
+
+err:	if (hot_backup_locked)
+		WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
+
+	__wt_meta_ckptlist_free(session, ckptbase);
+	__wt_free(session, name_alloc);
+
+	return (ret);
+}
+
+/*
+ * __checkpoint_tree --
+ *	Checkpoint a single tree.
+ *	Assumes all necessary locks have been acquired by the caller.
+ */
+static int
+__checkpoint_tree(
+    WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[])
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_CKPT *ckpt, *ckptbase;
+	WT_CONFIG_ITEM cval;
+	WT_CONNECTION_IMPL *conn;
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+	WT_LSN ckptlsn;
+	const char *name;
+	int deleted, was_modified;
+	bool fake_ckpt, force;
+
+	btree = S2BT(session);
+	bm = btree->bm;
+	ckptbase = btree->ckpt;
+	conn = S2C(session);
+	dhandle = session->dhandle;
+	fake_ckpt = false;
+	was_modified = btree->modified;
+
+	/*
+	 * Set the checkpoint LSN to the maximum LSN so that if logging is
+	 * disabled, recovery will never roll old changes forward over the
+	 * non-logged changes in this checkpoint.  If logging is enabled, a
+	 * real checkpoint LSN will be assigned for this checkpoint and
+	 * overwrite this.
+	 */
+	WT_MAX_LSN(&ckptlsn);
+
+	/*
+	 * Check for clean objects not requiring a checkpoint.
+	 *
+	 * If we're closing a handle, and the object is clean, we can skip the
+	 * checkpoint, whatever checkpoints we have are sufficient.  (We might
+	 * not have any checkpoints if the object was never modified, and that's
+	 * OK: the object creation code doesn't mark the tree modified so we can
+	 * skip newly created trees here.)
+	 *
+	 * If the application repeatedly checkpoints an object (imagine hourly
+	 * checkpoints using the same explicit or internal name), there's no
+	 * reason to repeat the checkpoint for clean objects.  The test is if
+	 * the only checkpoint we're deleting is the last one in the list and
+	 * it has the same name as the checkpoint we're about to take, skip the
+	 * work.  (We can't skip checkpoints that delete more than the last
+	 * checkpoint because deleting those checkpoints might free up space in
+	 * the file.)  This means an application toggling between two (or more)
+	 * checkpoint names will repeatedly take empty checkpoints, but that's
+	 * not likely enough to make detection worthwhile.
+	 *
+	 * Checkpoint read-only objects otherwise: the application must be able
+	 * to open the checkpoint in a cursor after taking any checkpoint, which
+	 * means it must exist.
+	 */
+	force = false;
+	F_CLR(btree, WT_BTREE_SKIP_CKPT);
+	if (!btree->modified && cfg != NULL) {
+		ret = __wt_config_gets(session, cfg, "force", &cval);
+		if (ret != 0 && ret != WT_NOTFOUND)
+			WT_ERR(ret);
+		if (ret == 0 && cval.val != 0)
+			force = true;
+	}
+	if (!btree->modified && !force) {
+		if (!is_checkpoint)
+			goto nockpt;
+
+		deleted = 0;
+		WT_CKPT_FOREACH(ckptbase, ckpt)
+			if (F_ISSET(ckpt, WT_CKPT_DELETE))
+				++deleted;
+		/*
+		 * Complicated test: if the tree is clean and the last two
+		 * checkpoints have the same name (correcting for internal
+		 * checkpoint names with their generational suffix numbers), we
+		 * can skip the checkpoint, there's nothing to do.  The
+		 * exception is if we're deleting two or more checkpoints: then
+		 * we may save space.
+		 */
+		name = (ckpt - 1)->name;
+		if (ckpt > ckptbase + 1 && deleted < 2 &&
+		    (strcmp(name, (ckpt - 2)->name) == 0 ||
+		    (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+		    WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) {
+nockpt:			F_SET(btree, WT_BTREE_SKIP_CKPT);
+			WT_PUBLISH(btree->checkpoint_gen,
+			    S2C(session)->txn_global.checkpoint_gen);
+			WT_STAT_FAST_DATA_SET(session,
+			    btree_checkpoint_generation,
+			    btree->checkpoint_gen);
+			ret = 0;
+			goto err;
 		}
+	}
 
 	/*
 	 * If an object has never been used (in other words, if it could become
@@ -1086,9 +1121,9 @@ nockpt:			F_SET(btree, WT_BTREE_SKIP_CKPT);
 
 	/* Flush the file from the cache, creating the checkpoint. */
 	if (is_checkpoint)
-		WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT));
+		WT_ERR(__wt_cache_op(session, WT_SYNC_CHECKPOINT));
 	else
-		WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE));
+		WT_ERR(__wt_cache_op(session, WT_SYNC_CLOSE));
 
 	/*
 	 * All blocks being written have been written; set the object's write
@@ -1120,9 +1155,8 @@ fake:	/*
 	 * sync the file here or we could roll forward the metadata in
 	 * recovery and open a checkpoint that isn't yet durable.
 	 */
-	if (F_ISSET(conn, WT_CONN_CKPT_SYNC) &&
-	    (WT_IS_METADATA(session, dhandle) ||
-	    !F_ISSET(&session->txn, WT_TXN_RUNNING)))
+	if (WT_IS_METADATA(session, dhandle) ||
+	    !F_ISSET(&session->txn, WT_TXN_RUNNING))
 		WT_ERR(__wt_checkpoint_sync(session, NULL));
 
 	WT_ERR(__wt_meta_ckptlist_set(
@@ -1147,7 +1181,6 @@ fake:	/*
 		WT_ERR(__wt_txn_checkpoint_log(
 		    session, false, WT_TXN_LOG_CKPT_STOP, NULL));
 
-done:
 err:	/*
 	 * If the checkpoint didn't complete successfully, make sure the
 	 * tree is marked dirty.
@@ -1155,30 +1188,42 @@ err:	/*
 	if (ret != 0 && !btree->modified && was_modified)
 		btree->modified = 1;
 
-	if (hot_backup_locked)
-		WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
-
 	__wt_meta_ckptlist_free(session, ckptbase);
-	__wt_free(session, name_alloc);
+	btree->ckpt = NULL;
 
 	return (ret);
 }
 
 /*
+ * __checkpoint_tree_helper --
+ *	Checkpoint a tree (suitable for use in *_apply functions).
+ */
+static int
+__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	return (__checkpoint_tree(session, true, cfg));
+}
+
+/*
  * __wt_checkpoint --
  *	Checkpoint a file.
  */
 int
 __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 {
+	WT_DECL_RET;
+
 	/* Should not be called with a checkpoint handle. */
 	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
 
-	/* Should be holding the schema lock. */
+	/* We must hold the metadata lock if checkpointing the metadata. */
 	WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) ||
 	    F_ISSET(session, WT_SESSION_LOCKED_METADATA));
 
-	return (__checkpoint_worker(session, cfg, true, true));
+	WT_SAVE_DHANDLE(session,
+	    ret = __checkpoint_lock_tree(session, true, true, cfg));
+	WT_RET(ret);
+	return (__checkpoint_tree(session, true, cfg));
 }
 
 /*
@@ -1197,8 +1242,9 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
 	/* Should not be called with a checkpoint handle. */
 	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
 
-	/* Should have an underlying block manager reference. */
-	WT_ASSERT(session, bm != NULL);
+	/* Unnecessary if checkpoint_sync has been configured "off". */
+	if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
+		return (0);
 
 	return (bm->sync(bm, session, false));
 }
@@ -1227,7 +1273,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
 	if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
 		F_SET(session->dhandle, WT_DHANDLE_DEAD);
 	if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
-		return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+		return (__wt_cache_op(session, WT_SYNC_DISCARD));
 
 	/*
 	 * If closing an unmodified file, check that no update is required
@@ -1236,7 +1282,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
 	if (!btree->modified && !bulk) {
 		__wt_txn_update_oldest(session, true);
 		return (__wt_txn_visible_all(session, btree->rec_max_txn) ?
-		    __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY);
+		    __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY);
 	}
 
 	/*
@@ -1250,10 +1296,14 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
 	if (need_tracking)
 		WT_RET(__wt_meta_track_on(session));
 
-	WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking));
+	WT_SAVE_DHANDLE(session,
+	    ret = __checkpoint_lock_tree(session, false, need_tracking, NULL));
+	WT_ASSERT(session, ret == 0);
+	if (ret == 0)
+		ret = __checkpoint_tree(session, false, NULL);
 
 	if (need_tracking)
-		WT_RET(__wt_meta_track_off(session, true, ret != 0));
+		WT_TRET(__wt_meta_track_off(session, true, ret != 0));
 
 	return (ret);
 }
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index f41691bbc3b..1ea4dba1152 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -88,11 +88,11 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
  * Helper to a cursor if this operation is to be applied during recovery.
  */
 #define	GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp)		\
-	WT_ERR(__recovery_cursor(					\
-	    (session), (r), (lsnp), (fileid), false, (cp)));		\
-	WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY,		\
-	    "%s op %d to file %d at LSN %u/%u",				\
-	    (cursor == NULL) ? "Skipping" : "Applying",			\
+	WT_ERR(__recovery_cursor(session, r, lsnp, fileid, false, cp));	\
+	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,			\
+	    "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32	\
+	    "/%" PRIu32,						\
+	    cursor == NULL ? "Skipping" : "Applying",			\
 	    optype, fileid, lsnp->l.file, lsnp->l.offset));		\
 	if (cursor == NULL)						\
 		break
@@ -334,7 +334,7 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
 	r->files[fileid].ckpt_lsn = lsn;
 
 	WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY,
-	    "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu32 ")",
+	    "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")",
 	    uri, fileid, lsn.l.file, lsn.l.offset));
 
 	return (0);
@@ -496,7 +496,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
 	 */
 	r.metadata_only = false;
 	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
-	    "Main recovery loop: starting at %u/%u",
+	    "Main recovery loop: starting at %" PRIu32 "/%" PRIu32,
 	    r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset));
 	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
 	/*
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index ca761a52d8a..aedd9168fbd 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -22,10 +22,10 @@ static int dump_prefix(WT_SESSION *, bool);
 static int dump_record(WT_CURSOR *, bool, bool);
 static int dump_suffix(WT_SESSION *);
 static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *);
-static int dump_table_config_type(
+static int dump_table_config_complex(
     WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, const char *);
 static int dup_json_string(const char *, char **);
-static int print_config(WT_SESSION *, const char *, const char *, const char *);
+static int print_config(WT_SESSION *, const char *, char *[]);
 static int usage(void);
 
 int
@@ -150,9 +150,9 @@ dump_config(WT_SESSION *session, const char *uri, bool hex)
 
 	/* Open a metadata cursor. */
 	if ((ret = session->open_cursor(
-	    session, "metadata:create", NULL, NULL, &cursor)) != 0) {
+	    session, "metadata:", NULL, NULL, &cursor)) != 0) {
 		fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname,
-		    "metadata:create", session->strerror(session, ret));
+		    "metadata:", session->strerror(session, ret));
 		return (1);
 	}
 	/*
@@ -352,12 +352,23 @@ match:		if ((ret = cursor->get_key(cursor, &key)) != 0)
 static int
 dump_json_table_config(WT_SESSION *session, const char *uri)
 {
+	WT_CONFIG_ITEM cval;
 	WT_CURSOR *cursor;
 	WT_DECL_RET;
+	size_t len;
 	int tret;
-	char *value;
+	const char *name, *value;
+	char *p;
+
+	p = NULL;
+
+	/* Get the table name. */
+	if ((name = strchr(uri, ':')) == NULL) {
+		fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+		return (1);
+	}
+	++name;
 
-	/* Dump the config. */
 	/* Open a metadata cursor. */
 	if ((ret = session->open_cursor(
 	    session, "metadata:create", NULL, NULL, &cursor)) != 0) {
@@ -368,12 +379,41 @@ dump_json_table_config(WT_SESSION *session, const char *uri)
 	}
 
 	/*
-	 * Search for the object itself, to make sure it
-	 * exists, and get its config string. This where we
-	 * find out a table object doesn't exist, use a simple
-	 * error message.
+	 * Search for the object itself, just to make sure it exists, we don't
+	 * want to output a header if the user entered the wrong name. This is
+	 * where we find out a table doesn't exist, use a simple error message.
+	 *
+	 * Workaround for WiredTiger "simple" table handling. Simple tables
+	 * have column-group entries, but they aren't listed in the metadata's
+	 * table entry. Figure out if it's a simple table and in that case,
+	 * retrieve the column-group entry and use the value from its "source"
+	 * file.
 	 */
-	cursor->set_key(cursor, uri);
+	if (WT_PREFIX_MATCH(uri, "table:")) {
+		len = strlen("colgroup:") + strlen(name) + 1;
+		if ((p = malloc(len)) == NULL)
+			return (util_err(session, errno, NULL));
+		(void)snprintf(p, len, "colgroup:%s", name);
+		cursor->set_key(cursor, p);
+		if ((ret = cursor->search(cursor)) == 0) {
+			if ((ret = cursor->get_value(cursor, &value)) != 0)
+				return (util_cerr(cursor, "get_value", ret));
+			if ((ret = __wt_config_getones(
+			    (WT_SESSION_IMPL *)session,
+			    value, "source", &cval)) != 0)
+				return (util_err(
+				    session, ret, "%s: source entry", p));
+			free(p);
+			len = cval.len + 10;
+			if ((p = malloc(len)) == NULL)
+				return (util_err(session, errno, NULL));
+			(void)snprintf(p, len, "%.*s", (int)cval.len, cval.str);
+			cursor->set_key(cursor, p);
+		} else
+			cursor->set_key(cursor, uri);
+	} else
+		cursor->set_key(cursor, uri);
+
 	if ((ret = cursor->search(cursor)) == 0) {
 		if ((ret = cursor->get_value(cursor, &value)) != 0)
 			ret = util_cerr(cursor, "get_value", ret);
@@ -381,8 +421,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri)
 		    session, cursor, uri, value) != 0)
 			ret = 1;
 	} else if (ret == WT_NOTFOUND)
-		ret = util_err(
-		    session, 0, "%s: No such object exists", uri);
+		ret = util_err(session, 0, "%s: No such object exists", uri);
 	else
 		ret = util_err(session, ret, "%s", uri);
 
@@ -392,6 +431,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri)
 			ret = tret;
 	}
 
+	free(p);
 	return (ret);
 }
 
@@ -414,10 +454,17 @@ dump_json_table_end(WT_SESSION *session)
 static int
 dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 {
+	WT_CONFIG_ITEM cval;
 	WT_CURSOR *srch;
 	WT_DECL_RET;
+	size_t len;
 	int tret;
-	const char *key, *name, *value;
+	bool complex_table;
+	const char *name, *v;
+	char *p, **cfg, *_cfg[4] = {NULL, NULL, NULL, NULL};
+
+	p = NULL;
+	cfg = &_cfg[3];
 
 	/* Get the table name. */
 	if ((name = strchr(uri, ':')) == NULL) {
@@ -427,59 +474,111 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 	++name;
 
 	/*
-	 * Dump out the config information: first, dump the uri entry itself
-	 * (requires a lookup).
+	 * Dump out the config information: first, dump the uri entry itself,
+	 * it overrides all subsequent configurations.
 	 */
 	cursor->set_key(cursor, uri);
 	if ((ret = cursor->search(cursor)) != 0)
 		return (util_cerr(cursor, "search", ret));
-	if ((ret = cursor->get_key(cursor, &key)) != 0)
-		return (util_cerr(cursor, "get_key", ret));
-	if ((ret = cursor->get_value(cursor, &value)) != 0)
+	if ((ret = cursor->get_value(cursor, &v)) != 0)
 		return (util_cerr(cursor, "get_value", ret));
-	if (print_config(session, key, value, NULL) != 0)
-		return (1);
+	if ((*--cfg = strdup(v)) == NULL)
+		return (util_err(session, errno, NULL));
 
 	/*
-	 * The underlying table configuration function needs a second cursor:
-	 * open one before calling it, it makes error handling hugely simpler.
+	 * Workaround for WiredTiger "simple" table handling. Simple tables
+	 * have column-group entries, but they aren't listed in the metadata's
+	 * table entry, and the name is different from other column-groups.
+	 * Figure out if it's a simple table and in that case, retrieve the
+	 * column-group's configuration value and the column-group's "source"
+	 * entry, where the column-group entry overrides the source's.
 	 */
-	if ((ret =
-	    session->open_cursor(session, NULL, cursor, NULL, &srch)) != 0)
-		return (util_cerr(cursor, "open_cursor", ret));
+	complex_table = false;
+	if (WT_PREFIX_MATCH(uri, "table:")) {
+		len = strlen("colgroup:") + strlen(name) + 1;
+		if ((p = malloc(len)) == NULL)
+			return (util_err(session, errno, NULL));
+		(void)snprintf(p, len, "colgroup:%s", name);
+		cursor->set_key(cursor, p);
+		if ((ret = cursor->search(cursor)) == 0) {
+			if ((ret = cursor->get_value(cursor, &v)) != 0)
+				return (util_cerr(cursor, "get_value", ret));
+			if ((*--cfg = strdup(v)) == NULL)
+				return (util_err(session, errno, NULL));
+			if ((ret =__wt_config_getones(
+			    (WT_SESSION_IMPL *)session,
+			    *cfg, "source", &cval)) != 0)
+				return (util_err(
+				    session, ret, "%s: source entry", p));
+			free(p);
+			len = cval.len + 10;
+			if ((p = malloc(len)) == NULL)
+				return (util_err(session, errno, NULL));
+			(void)snprintf(p, len, "%.*s", (int)cval.len, cval.str);
+			cursor->set_key(cursor, p);
+			if ((ret = cursor->search(cursor)) != 0)
+				return (util_cerr(cursor, "search", ret));
+			if ((ret = cursor->get_value(cursor, &v)) != 0)
+				return (util_cerr(cursor, "get_value", ret));
+			if ((*--cfg = strdup(v)) == NULL)
+				return (util_err(session, errno, NULL));
+		} else
+			complex_table = true;
+	}
 
-	if ((ret = dump_table_config_type(
-	    session, cursor, srch, name, "colgroup:")) == 0)
-		ret = dump_table_config_type(
-		    session, cursor, srch, name, "index:");
+	if (print_config(session, uri, cfg) != 0)
+		return (1);
 
-	if ((tret = srch->close(srch)) != 0) {
-		tret = util_cerr(cursor, "close", tret);
-		if (ret == 0)
-			ret = tret;
+	if (complex_table) {
+		/*
+		 * The underlying table configuration function needs a second
+		 * cursor: open one before calling it, it makes error handling
+		 * hugely simpler.
+		 */
+		if ((ret = session->open_cursor(
+		    session, "metadata:", NULL, NULL, &srch)) != 0)
+			return (util_cerr(cursor, "open_cursor", ret));
+
+		if ((ret = dump_table_config_complex(
+		    session, cursor, srch, name, "colgroup:")) == 0)
+			ret = dump_table_config_complex(
+			    session, cursor, srch, name, "index:");
+
+		if ((tret = srch->close(srch)) != 0) {
+			tret = util_cerr(cursor, "close", tret);
+			if (ret == 0)
+				ret = tret;
+		}
 	}
 
+	free(p);
+	free(_cfg[0]);
+	free(_cfg[1]);
+	free(_cfg[2]);
 	return (ret);
 }
 
 /*
- * dump_table_config_type --
+ * dump_table_config_complex --
  *	Dump the column groups or indices for a table.
  */
 static int
-dump_table_config_type(WT_SESSION *session,
+dump_table_config_complex(WT_SESSION *session,
     WT_CURSOR *cursor, WT_CURSOR *srch, const char *name, const char *entry)
 {
 	WT_CONFIG_ITEM cval;
 	WT_DECL_RET;
-	const char *key, *skip, *value, *value_source;
+	const char *key;
+	size_t len;
 	int exact;
-	char *p;
+	const char *v;
+	char *p, *cfg[3] = {NULL, NULL, NULL};
 
 	/*
 	 * Search the file looking for column group and index key/value pairs:
 	 * for each one, look up the related source information and append it
-	 * to the base record.
+	 * to the base record, where the column group and index configuration
+	 * overrides the source configuration.
 	 */
 	cursor->set_key(cursor, entry);
 	if ((ret = cursor->search_near(cursor, &exact)) != 0) {
@@ -497,27 +596,32 @@ match:		if ((ret = cursor->get_key(cursor, &key)) != 0)
 		if (!WT_PREFIX_MATCH(key, entry))
 			return (0);
 
-		/* Check for a table name match. */
-		skip = key + strlen(entry);
-		if (strncmp(
-		    skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+		/*
+		 * Check for a table name match. This test will match "simple"
+		 * table column-groups as well as the more complex ones, but
+		 * the previous version of the test was wrong and we're only
+		 * in this function in the case of complex tables.
+		 */
+		if (!WT_PREFIX_MATCH(key + strlen(entry), name))
 			continue;
 
 		/* Get the value. */
-		if ((ret = cursor->get_value(cursor, &value)) != 0)
+		if ((ret = cursor->get_value(cursor, &v)) != 0)
 			return (util_cerr(cursor, "get_value", ret));
+		if ((cfg[1] = strdup(v)) == NULL)
+			return (util_err(session, errno, NULL));
 
 		/* Crack it and get the underlying source. */
 		if ((ret = __wt_config_getones(
-		    (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0)
+		    (WT_SESSION_IMPL *)session, cfg[1], "source", &cval)) != 0)
 			return (
 			    util_err(session, ret, "%s: source entry", key));
 
 		/* Nul-terminate the source entry. */
-		if ((p = malloc(cval.len + 10)) == NULL)
+		len = cval.len + 10;
+		if ((p = malloc(len)) == NULL)
 			return (util_err(session, errno, NULL));
-		(void)strncpy(p, cval.str, cval.len);
-		p[cval.len] = '\0';
+		(void)snprintf(p, len, "%.*s", (int)cval.len, cval.str);
 		srch->set_key(srch, p);
 		if ((ret = srch->search(srch)) != 0)
 			ret = util_err(session, ret, "%s: %s", key, p);
@@ -526,16 +630,22 @@ match:		if ((ret = cursor->get_key(cursor, &key)) != 0)
 			return (1);
 
 		/* Get the source's value. */
-		if ((ret = srch->get_value(srch, &value_source)) != 0)
+		if ((ret = srch->get_value(srch, &v)) != 0)
 			return (util_cerr(cursor, "get_value", ret));
+		if ((cfg[0] = strdup(v)) == NULL)
+			return (util_err(session, errno, NULL));
 
 		/*
 		 * The dumped configuration string is the original key plus the
-		 * source's configuration.
+		 * source's configuration, where the values of the original key
+		 * override any source configurations of the same name.
 		 */
-		if (print_config(session, key, value, value_source) != 0)
+		if (print_config(session, key, cfg) != 0)
 			return (util_err(session, EIO, NULL));
 	}
+	free(cfg[0]);
+	free(cfg[1]);
+
 	if (ret == 0 || ret == WT_NOTFOUND)
 		return (0);
 	return (util_cerr(cursor, "next", ret));
@@ -649,27 +759,21 @@ dup_json_string(const char *str, char **result)
  *	Output a key/value URI pair by combining v1 and v2.
  */
 static int
-print_config(WT_SESSION *session,
-    const char *key, const char *v1, const char *v2)
+print_config(WT_SESSION *session, const char *key, char *cfg[])
 {
 	WT_DECL_RET;
 	char *value_ret;
-	const char *cfg[] = { v1, v2, NULL };
 
 	/*
-	 * The underlying call will stop if the first string is NULL -- check
-	 * here and swap in that case.
+	 * We have all of the object configuration, but don't have the default
+	 * session.create configuration. Have the underlying library add in the
+	 * defaults and collapse it all into one load configuration string.
 	 */
-	if (cfg[0] == NULL) {
-		cfg[0] = cfg[1];
-		cfg[1] = NULL;
-	}
-
-	if ((ret = __wt_config_collapse(
+	if ((ret = __wt_schema_create_final(
 	    (WT_SESSION_IMPL *)session, cfg, &value_ret)) != 0)
 		return (util_err(session, ret, NULL));
 	ret = printf("%s\n%s\n", key, value_ret);
-	free((char *)value_ret);
+	free(value_ret);
 	if (ret < 0)
 		return (util_err(session, EIO, NULL));
 	return (0);
diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c
index 183dc3d2d42..f95bc7faaf9 100644
--- a/test/bloom/test_bloom.c
+++ b/test/bloom/test_bloom.c
@@ -160,7 +160,7 @@ run(void)
 	for (i = 0; i < g.c_ops; i++) {
 		item.data = g.entries[i];
 		if ((ret = __wt_bloom_insert(bloomp, &item)) != 0)
-			testutil_die(ret, "__wt_bloom_insert: %d", i);
+			testutil_die(ret, "__wt_bloom_insert: %" PRIu32, i);
 	}
 
 	testutil_check(__wt_bloom_finalize(bloomp));
@@ -168,7 +168,8 @@ run(void)
 	for (i = 0; i < g.c_ops; i++) {
 		item.data = g.entries[i];
 		if ((ret = __wt_bloom_get(bloomp, &item)) != 0) {
-			fprintf(stderr, "get failed at record: %d\n", i);
+			fprintf(stderr,
+			    "get failed at record: %" PRIu32 "\n", i);
 			testutil_die(ret, "__wt_bloom_get");
 		}
 	}
@@ -201,7 +202,8 @@ run(void)
 			testutil_die(ret, "__wt_bloom_get");
 	}
 	free((void *)item.data);
-	printf("Out of %d ops, got %d false positives, %.4f%%\n",
+	printf(
+	    "Out of %" PRIu32 " ops, got %" PRIu32 " false positives, %.4f%%\n",
 	    g.c_ops, fp, 100.0 * fp/g.c_ops);
 	testutil_check(__wt_bloom_drop(bloomp, NULL));
 }
diff --git a/test/checkpoint/test_checkpoint.c b/test/checkpoint/test_checkpoint.c
index 0f28a86b675..c5524b3c63e 100644
--- a/test/checkpoint/test_checkpoint.c
+++ b/test/checkpoint/test_checkpoint.c
@@ -136,7 +136,7 @@ main(int argc, char *argv[])
 
 	printf("%s: process %" PRIu64 "\n", g.progname, (uint64_t)getpid());
 	for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) {
-		printf("    %d: %u workers, %u tables\n",
+		printf("    %d: %d workers, %d tables\n",
 		    cnt, g.nworkers, g.ntables);
 
 		(void)cleanup();		/* Clean up previous runs */
diff --git a/test/cursor_order/cursor_order.c b/test/cursor_order/cursor_order.c
index 68d2f092c60..d8cfc0c1421 100644
--- a/test/cursor_order/cursor_order.c
+++ b/test/cursor_order/cursor_order.c
@@ -154,8 +154,10 @@ main(int argc, char *argv[])
 
 	printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid());
 	for (cnt = 1; runs == 0 || cnt <= runs; ++cnt) {
-		printf("    %d: %u reverse scanners, %u writers\n", cnt,
-		    (int)cfg->reverse_scanners, (int)cfg->append_inserters);
+		printf(
+		    "    %d: %" PRIu64
+		    " reverse scanners, %" PRIu64 " writers\n",
+		    cnt, cfg->reverse_scanners, cfg->append_inserters);
 
 		shutdown();			/* Clean up previous runs */
 
diff --git a/test/fops/file.c b/test/fops/file.c
index 4cd92e7b590..ea15f1ee80d 100644
--- a/test/fops/file.c
+++ b/test/fops/file.c
@@ -147,7 +147,7 @@ obj_create_unique(int force)
 	/* Generate a unique object name. */
 	if ((ret = pthread_rwlock_wrlock(&single)) != 0)
 		testutil_die(ret, "pthread_rwlock_wrlock single");
-	(void)snprintf(new_uri, sizeof(new_uri), "%s.%d", uri, ++uid);
+	(void)snprintf(new_uri, sizeof(new_uri), "%s.%u", uri, ++uid);
 	if ((ret = pthread_rwlock_unlock(&single)) != 0)
 		testutil_die(ret, "pthread_rwlock_unlock single");
 
diff --git a/test/fops/fops.c b/test/fops/fops.c
index fbc9d9c6048..3333ff16858 100644
--- a/test/fops/fops.c
+++ b/test/fops/fops.c
@@ -109,7 +109,7 @@ fop(void *arg)
 	__wt_random_init(&rnd);
 
 	for (i = 0; i < nops; ++i, __wt_yield())
-		switch (__wt_random(&rnd) % 9) {
+		switch (__wt_random(&rnd) % 10) {
 		case 0:
 			++s->bulk;
 			obj_bulk();
diff --git a/test/format/backup.c b/test/format/backup.c
index 56657940514..2b1463bd0e3 100644
--- a/test/format/backup.c
+++ b/test/format/backup.c
@@ -67,6 +67,13 @@ copy_file(const char *name)
 	    "cp %s/%s %s/%s", g.home, name, g.home_backup, name);
 	testutil_checkfmt(system(cmd), "backup copy: %s", cmd);
 	free(cmd);
+
+	len = strlen(g.home) + strlen(g.home_backup2) + strlen(name) * 2 + 20;
+	cmd = dmalloc(len);
+	(void)snprintf(cmd, len,
+	    "cp %s/%s %s/%s", g.home, name, g.home_backup2, name);
+	testutil_checkfmt(system(cmd), "backup copy: %s", cmd);
+	free(cmd);
 }
 
 /*
diff --git a/test/format/format.h b/test/format/format.h
index c54fd061736..a129c5395fd 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -109,6 +109,7 @@ typedef struct {
 
 	char *home;				/* Home directory */
 	char *home_backup;			/* Hot-backup directory */
+	char *home_backup2;			/* Saved Hot-backup directory */
 	char *home_backup_init;			/* Initialize backup command */
 	char *home_bdb;				/* BDB directory */
 	char *home_config;			/* Run CONFIG file path */
diff --git a/test/format/util.c b/test/format/util.c
index 347b2ea1db3..2e4c869366c 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -310,6 +310,10 @@ path_setup(const char *home)
 	g.home_backup = dmalloc(len);
 	snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP");
 
+	len = strlen(g.home) + strlen("BACKUP2") + 2;
+	g.home_backup2 = dmalloc(len);
+	snprintf(g.home_backup2, len, "%s/%s", g.home, "BACKUP2");
+
 	/* BDB directory. */
 	len = strlen(g.home) + strlen("bdb") + 2;
 	g.home_bdb = dmalloc(len);
@@ -340,13 +344,15 @@ path_setup(const char *home)
 	/* Backup directory initialize command, remove and re-create it. */
 #undef	CMD
 #ifdef _WIN32
-#define	CMD	"del /s /q >:nul && mkdir %s"
+#define	CMD	"del /s /q >:nul && mkdir %s %s"
 #else
-#define	CMD	"rm -rf %s && mkdir %s"
+#define	CMD	"rm -rf %s %s && mkdir %s %s"
 #endif
-	len = strlen(g.home_backup) * 2 + strlen(CMD) + 1;
+	len = strlen(g.home_backup) * 2 +
+	    strlen(g.home_backup2) * 2 + strlen(CMD) + 1;
 	g.home_backup_init = dmalloc(len);
-	snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup);
+	snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup2,
+	    g.home_backup, g.home_backup2);
 
 	/*
 	 * Salvage command, save the interesting files so we can replay the
diff --git a/test/format/wts.c b/test/format/wts.c
index a0e57dc2bee..81e484296e2 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -53,7 +53,8 @@ compressor(uint32_t compress_flag)
 	default:
 		break;
 	}
-	testutil_die(EINVAL, "illegal compression flag: 0x%x", compress_flag);
+	testutil_die(EINVAL,
+	    "illegal compression flag: %#" PRIx32, compress_flag);
 }
 
 /*
@@ -71,7 +72,8 @@ encryptor(uint32_t encrypt_flag)
 	default:
 		break;
 	}
-	testutil_die(EINVAL, "illegal encryption flag: 0x%x", encrypt_flag);
+	testutil_die(EINVAL,
+	    "illegal encryption flag: %#" PRIx32, encrypt_flag);
 }
 
 static int
@@ -313,7 +315,7 @@ wts_create(void)
 	p += snprintf(p, REMAIN(p, end),
 	    "key_format=%s,"
 	    "allocation_size=512,%s"
-	    "internal_page_max=%d,leaf_page_max=%d",
+	    "internal_page_max=%" PRIu32 ",leaf_page_max=%" PRIu32,
 	    (g.type == ROW) ? "u" : "r",
 	    g.c_firstfit ? "block_allocation=first," : "",
 	    maxintlpage, maxleafpage);
@@ -325,15 +327,15 @@ wts_create(void)
 	maxintlkey = mmrand(NULL, maxintlpage / 50, maxintlpage / 40);
 	if (maxintlkey > 20)
 		p += snprintf(p, REMAIN(p, end),
-		    ",internal_key_max=%d", maxintlkey);
+		    ",internal_key_max=%" PRIu32, maxintlkey);
 	maxleafkey = mmrand(NULL, maxleafpage / 50, maxleafpage / 40);
 	if (maxleafkey > 20)
 		p += snprintf(p, REMAIN(p, end),
-		    ",leaf_key_max=%d", maxleafkey);
+		    ",leaf_key_max=%" PRIu32, maxleafkey);
 	maxleafvalue = mmrand(NULL, maxleafpage * 10, maxleafpage / 40);
 	if (maxleafvalue > 40 && maxleafvalue < 100 * 1024)
 		p += snprintf(p, REMAIN(p, end),
-		    ",leaf_value_max=%d", maxleafvalue);
+		    ",leaf_value_max=%" PRIu32, maxleafvalue);
 
 	switch (g.type) {
 	case FIX:
@@ -361,7 +363,7 @@ wts_create(void)
 			    ",huffman_value=english");
 		if (g.c_dictionary)
 			p += snprintf(p, REMAIN(p, end),
-			    ",dictionary=%d", mmrand(NULL, 123, 517));
+			    ",dictionary=%" PRIu32, mmrand(NULL, 123, 517));
 		break;
 	}
 
diff --git a/test/manydbs/Makefile.am b/test/manydbs/Makefile.am
new file mode 100644
index 00000000000..53559b25243
--- /dev/null
+++ b/test/manydbs/Makefile.am
@@ -0,0 +1,13 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include \
+    -I$(top_srcdir)/test/utility
+
+noinst_PROGRAMS = t
+t_SOURCES = manydbs.c
+t_LDADD = $(top_builddir)/libwiredtiger.la
+t_LDFLAGS = -static
+
+# Run this during a "make check" smoke test.
+TESTS = smoke.sh
+
+clean-local:
+	rm -rf WT_HOME* WiredTiger* *.core __*
diff --git a/test/manydbs/manydbs.c b/test/manydbs/manydbs.c
new file mode 100644
index 00000000000..1d3412a7b06
--- /dev/null
+++ b/test/manydbs/manydbs.c
@@ -0,0 +1,264 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <sys/wait.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <wiredtiger.h>
+
+#include "test_util.i"
+
+#define	HOME_SIZE	512
+#define	HOME_BASE	"WT_HOME"
+static char home[HOME_SIZE];		/* Base home directory */
+static char hometmp[HOME_SIZE];		/* Each conn home directory */
+static const char *progname;		/* Program name */
+static const char * const uri = "table:main";
+
+#define	WTOPEN_CFG_COMMON					\
+    "create,log=(file_max=10M,archive=false,enabled),"		\
+    "statistics=(fast),statistics_log=(wait=5),"
+#define	WT_CONFIG0						\
+    WTOPEN_CFG_COMMON						\
+    "transaction_sync=(enabled=false)"
+#define	WT_CONFIG1						\
+    WTOPEN_CFG_COMMON						\
+    "transaction_sync=(enabled,method=none)"
+#define	WT_CONFIG2						\
+    WTOPEN_CFG_COMMON						\
+    "transaction_sync=(enabled,method=fsync)"
+
+#define	MAX_DBS		10
+#define	MAX_IDLE_TIME	30
+#define	IDLE_INCR	5
+
+#define	MAX_KV		100
+#define	MAX_VAL		128
+
+static void
+usage(void)
+{
+	/* Print the command-line synopsis to stderr and exit with failure. */
+	fprintf(stderr, "usage: %s [-I] [-D maxdbs] [-h dir]\n", progname);
+	exit(EXIT_FAILURE);
+}
+
+extern int __wt_optind;
+extern char *__wt_optarg;
+
+void (*custom_die)(void) = NULL;
+
+WT_CONNECTION **connections = NULL;
+WT_CURSOR **cursors = NULL;
+WT_RAND_STATE rnd;
+WT_SESSION **sessions = NULL;
+
+static int
+get_stat(WT_SESSION *stat_session, int stat_field, uint64_t *valuep)
+{
+	WT_CURSOR *statc;
+	const char *desc, *pvalue;
+	int ret;
+
+	testutil_check(stat_session->open_cursor(stat_session,
+	    "statistics:", NULL, NULL, &statc));
+	statc->set_key(statc, stat_field);
+	if ((ret = statc->search(statc)) == 0)
+		ret = statc->get_value(statc, &desc, &pvalue, valuep);
+
+	/* Always close the cursor: a failed search must not leak it. */
+	testutil_check(statc->close(statc));
+	return (ret);
+}
+
+static int
+run_ops(int dbs)
+{
+	WT_ITEM data;
+	int db_set, i, key;
+	uint32_t db;
+	uint8_t buf[MAX_VAL];
+
+	/* Fill the whole value buffer with random bytes. */
+	for (i = 0; i < MAX_VAL; ++i)
+		buf[i] = (uint8_t)__wt_random(&rnd);
+	data.data = buf;
+	/*
+	 * Write a small amount of data into a random subset of the databases.
+	 */
+	db_set = dbs / 4;
+	for (i = 0; i < db_set; ++i) {
+		db = __wt_random(&rnd) % (uint32_t)dbs;
+		printf("Write to database %" PRIu32 "\n", db);
+		for (key = 0; key < MAX_KV; ++key) {
+			data.size = __wt_random(&rnd) % MAX_VAL;
+			/* The "Q" key format reads a uint64_t vararg. */
+			cursors[db]->set_key(cursors[db], (uint64_t)key);
+			cursors[db]->set_value(cursors[db], &data);
+			testutil_check(cursors[db]->insert(cursors[db]));
+		}
+	}
+	return (0);
+}
+
+int
+main(int argc, char *argv[])
+{
+	uint64_t cond_reset, cond_wait;
+	uint64_t *cond_reset_orig;
+	int cfg, ch, dbs, i;
+	bool idle;
+	const char *working_dir, *wt_cfg;
+
+	if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL)
+		progname = argv[0];
+	else
+		++progname;
+	dbs = MAX_DBS;
+	working_dir = HOME_BASE;
+	idle = false;
+	while ((ch = __wt_getopt(progname, argc, argv, "D:h:I")) != EOF)
+		switch (ch) {
+		case 'D':
+			/* Reject non-numeric or non-positive counts. */
+			dbs = atoi(__wt_optarg);
+			if (dbs <= 0)
+				usage();
+			break;
+		case 'h':
+			working_dir = __wt_optarg;
+			break;
+		case 'I':
+			idle = true;
+			break;
+		default:
+			usage();
+		}
+	argc -= __wt_optind;
+	argv += __wt_optind;
+	if (argc != 0)
+		usage();
+
+	/*
+	 * Allocate arrays for connection handles, sessions, statistics
+	 * cursors and, if needed, data cursors.
+	 */
+	if ((connections = calloc(
+	    (size_t)dbs, sizeof(WT_CONNECTION *))) == NULL)
+		testutil_die(ENOMEM, "connection array malloc");
+	if ((sessions = calloc(
+	    (size_t)dbs, sizeof(WT_SESSION *))) == NULL)
+		testutil_die(ENOMEM, "session array malloc");
+	if ((cond_reset_orig = calloc((size_t)dbs, sizeof(uint64_t))) == NULL)
+		testutil_die(ENOMEM, "orig stat malloc");
+	if (!idle && ((cursors = calloc(
+	    (size_t)dbs, sizeof(WT_CURSOR *))) == NULL))
+		testutil_die(ENOMEM, "cursor array malloc");
+	/*
+	 * Set up all the directory names.
+	 */
+	testutil_work_dir_from_path(home, HOME_SIZE, working_dir);
+	testutil_make_work_dir(home);
+	__wt_random_init(&rnd);
+	for (i = 0; i < dbs; ++i) {
+		snprintf(hometmp, HOME_SIZE, "%s/%s.%d", home, HOME_BASE, i);
+		testutil_make_work_dir(hometmp);
+		/*
+		 * Open each database.  Rotate different configurations
+		 * among them.  Open a session and statistics cursor.
+		 * If writing data, create the table and open a data cursor.
+		 */
+		cfg = i % 3;
+		if (cfg == 0)
+			wt_cfg = WT_CONFIG0;
+		else if (cfg == 1)
+			wt_cfg = WT_CONFIG1;
+		else
+			wt_cfg = WT_CONFIG2;
+		testutil_check(wiredtiger_open(
+		    hometmp, NULL, wt_cfg, &connections[i]));
+		testutil_check(connections[i]->open_session(connections[i],
+		    NULL, NULL, &sessions[i]));
+		if (!idle) {
+			testutil_check(sessions[i]->create(sessions[i],
+			    uri, "key_format=Q,value_format=u"));
+			testutil_check(sessions[i]->open_cursor(sessions[i],
+			    uri, NULL, NULL, &cursors[i]));
+		}
+	}
+
+	sleep(10);
+
+	/*
+	 * Record original reset setting.  There could have been some
+	 * activity during the creation period.
+	 */
+	for (i = 0; i < dbs; ++i)
+		testutil_check(get_stat(sessions[i],
+		    WT_STAT_CONN_COND_AUTO_WAIT_RESET, &cond_reset_orig[i]));
+	for (i = 0; i < MAX_IDLE_TIME; i += IDLE_INCR) {
+		if (!idle)
+			testutil_check(run_ops(dbs));
+		printf("Sleep %d (%d of %d)\n", IDLE_INCR, i, MAX_IDLE_TIME);
+		sleep(IDLE_INCR);
+	}
+	for (i = 0; i < dbs; ++i) {
+		testutil_check(get_stat(sessions[i],
+		    WT_STAT_CONN_COND_AUTO_WAIT_RESET, &cond_reset));
+		testutil_check(get_stat(sessions[i],
+		    WT_STAT_CONN_COND_AUTO_WAIT, &cond_wait));
+		/*
+		 * On an idle workload there should be no resets of condition
+		 * variables during the idle period.  Even with a light
+		 * workload, resets should not be very common.  We look for 5%.
+		 */
+		if (idle && cond_reset != cond_reset_orig[i])
+			testutil_die(ERANGE,
+			    "condition reset on idle connection %d of %" PRIu64,
+			    i, cond_reset);
+		if (!idle && cond_reset > cond_wait / 20)
+			testutil_die(ERANGE, "connection %d condition reset %"
+			    PRIu64 " exceeds 5%% of %" PRIu64,
+			    i, cond_reset, cond_wait);
+		testutil_check(connections[i]->close(connections[i], NULL));
+	}
+
+	/* Cleanup allocated memory; cursors is still NULL when idle. */
+	free(connections);
+	free(sessions);
+	free(cond_reset_orig);
+	free(cursors);
+
+	return (EXIT_SUCCESS);
+}
diff --git a/test/manydbs/smoke.sh b/test/manydbs/smoke.sh
new file mode 100755
index 00000000000..c0e2976f154
--- /dev/null
+++ b/test/manydbs/smoke.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+set -e
+
+# Smoke-test manydbs as part of running "make check".
+# Run with:
+# 1.  The defaults
+# 2.  Set idle flag to turn off operations.
+# 3.  More dbs.
+# 4.  More dbs with the idle flag set.
+echo "manydbs: default with operations turned on"
+$TEST_WRAPPER ./t
+echo "manydbs: totally idle databases"
+$TEST_WRAPPER ./t -I
+echo "manydbs: 40 databases with operations"
+$TEST_WRAPPER ./t -D 40
+echo "manydbs: 40 idle databases"
+$TEST_WRAPPER ./t -I -D 40
diff --git a/test/readonly/Makefile.am b/test/readonly/Makefile.am
index 384e197a1f8..3abcd2386a1 100644
--- a/test/readonly/Makefile.am
+++ b/test/readonly/Makefile.am
@@ -10,4 +10,4 @@ t_LDFLAGS = -static
 TESTS = smoke.sh
 
 clean-local:
-	rm -rf WiredTiger* *.core __*
+	rm -rf WT_RD* WiredTiger* *.core __*
diff --git a/test/readonly/readonly.c b/test/readonly/readonly.c
index 100ccbf81b7..41400da2605 100644
--- a/test/readonly/readonly.c
+++ b/test/readonly/readonly.c
@@ -42,9 +42,13 @@
 
 #define	HOME_SIZE	512
 static char home[HOME_SIZE];		/* Program working dir lock file */
-static char home_wr[HOME_SIZE];		/* Writable dir copy no lock file */
-static char home_rd[HOME_SIZE];		/* Read-only dir */
-static char home_rd2[HOME_SIZE];	/* Read-only dir no lock file */
+#define	HOME_WR_SUFFIX	".WRNOLOCK"	/* Writable dir copy no lock file */
+static char home_wr[HOME_SIZE + sizeof(HOME_WR_SUFFIX)];
+#define	HOME_RD_SUFFIX	".RD"		/* Read-only dir */
+static char home_rd[HOME_SIZE + sizeof(HOME_RD_SUFFIX)];
+#define	HOME_RD2_SUFFIX	".RDNOLOCK"	/* Read-only dir no lock file */
+static char home_rd2[HOME_SIZE + sizeof(HOME_RD2_SUFFIX)];
+
 static const char *progname;		/* Program name */
 static const char *saved_argv0;		/* Program command */
 static const char * const uri = "table:main";
@@ -87,13 +91,14 @@ run_child(const char *homedir, int op, int expect)
 		cfg = ENV_CONFIG_RD;
 	else
 		cfg = ENV_CONFIG_WR;
-	ret = wiredtiger_open(homedir, NULL, cfg, &conn);
-	if (expect == EXPECT_SUCCESS && ret != 0)
-		testutil_die(ret, "wiredtiger_open success err");
-	if (expect == EXPECT_ERR) {
-		if (ret == 0)
+	if ((ret = wiredtiger_open(homedir, NULL, cfg, &conn)) == 0) {
+		if (expect == EXPECT_ERR)
+			testutil_die(
+			    ret, "wiredtiger_open expected error, succeeded");
+	} else {
+		if (expect == EXPECT_SUCCESS)
 			testutil_die(
-			    ret, "wiredtiger_open expected err succeeded");
+			    ret, "wiredtiger_open expected success, error");
 		/*
 		 * If we expect an error and got one, we're done.
 		 */
@@ -207,17 +212,14 @@ main(int argc, char *argv[])
 	if (argc != 0)
 		usage();
 
-	memset(buf, 0, sizeof(buf));
 	/*
 	 * Set up all the directory names.
 	 */
-	testutil_work_dir_from_path(home, 512, working_dir);
-	strncpy(home_wr, home, HOME_SIZE);
-	strcat(home_wr, ".WRNOLOCK");
-	strncpy(home_rd, home, HOME_SIZE);
-	strcat(home_rd, ".RD");
-	strncpy(home_rd2, home, HOME_SIZE);
-	strcat(home_rd2, ".RDNOLOCK");
+	testutil_work_dir_from_path(home, sizeof(home), working_dir);
+	(void)snprintf(home_wr, sizeof(home_wr), "%s%s", home, HOME_WR_SUFFIX);
+	(void)snprintf(home_rd, sizeof(home_rd), "%s%s", home, HOME_RD_SUFFIX);
+	(void)snprintf(
+	    home_rd2, sizeof(home_rd2), "%s%s", home, HOME_RD2_SUFFIX);
 	if (!child) {
 		testutil_make_work_dir(home);
 		testutil_make_work_dir(home_wr);
@@ -260,6 +262,7 @@ main(int argc, char *argv[])
 	/*
 	 * Write data into the table and then cleanly shut down connection.
 	 */
+	memset(buf, 0, sizeof(buf));
 	data.data = buf;
 	data.size = MAX_VAL;
 	for (i = 0; i < MAX_KV; ++i) {
@@ -329,7 +332,8 @@ main(int argc, char *argv[])
 	 * the child even though it should not be.  So use 'system' to spawn
 	 * an entirely new process.
 	 */
-	(void)snprintf(cmd, sizeof(cmd), "%s -R", saved_argv0);
+	(void)snprintf(
+	    cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir);
 	if ((status = system(cmd)) < 0)
 		testutil_die(status, "system");
 	/*
@@ -341,7 +345,8 @@ main(int argc, char *argv[])
 	/*
 	 * Scenario 2.  Run child with writable config.
 	 */
-	(void)snprintf(cmd, sizeof(cmd), "%s -W", saved_argv0);
+	(void)snprintf(
+	    cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir);
 	if ((status = system(cmd)) < 0)
 		testutil_die(status, "system");
 
@@ -362,7 +367,8 @@ main(int argc, char *argv[])
 	/*
 	 * Scenario 3.  Child read-only.
 	 */
-	(void)snprintf(cmd, sizeof(cmd), "%s -R", saved_argv0);
+	(void)snprintf(
+	    cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir);
 	if ((status = system(cmd)) < 0)
 		testutil_die(status, "system");
 	if (WEXITSTATUS(status) != 0)
@@ -371,7 +377,8 @@ main(int argc, char *argv[])
 	/*
 	 * Scenario 4.  Run child with writable config.
 	 */
-	(void)snprintf(cmd, sizeof(cmd), "%s -W", saved_argv0);
+	(void)snprintf(
+	    cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir);
 	if ((status = system(cmd)) < 0)
 		testutil_die(status, "system");
 	if (WEXITSTATUS(status) != 0)
diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c
index c9cc10d2db3..f9c3ed28814 100644
--- a/test/recovery/random-abort.c
+++ b/test/recovery/random-abort.c
@@ -249,9 +249,10 @@ main(int argc, char *argv[])
 	if ((ret = conn->close(conn, NULL)) != 0)
 		testutil_die(ret, "WT_CONNECTION:close");
 	if (absent) {
-		printf("%u record(s) absent from %u\n", absent, count);
+		printf("%" PRIu32 " record(s) absent from %" PRIu32 "\n",
+		    absent, count);
 		return (EXIT_FAILURE);
 	}
-	printf("%u records verified\n", count);
+	printf("%" PRIu32 " records verified\n", count);
 	return (EXIT_SUCCESS);
 }
diff --git a/test/recovery/truncated-log.c b/test/recovery/truncated-log.c
index 23269e99d35..67fdb932c27 100644
--- a/test/recovery/truncated-log.c
+++ b/test/recovery/truncated-log.c
@@ -156,14 +156,16 @@ fill_db(void)
 					    "%" PRIu32 " %" PRIu32 "\n",
 					    save_lsn.l.offset, i - 1) == -1)
 						testutil_die(errno, "fprintf");
-					if (fclose(fp) != 0)
-						testutil_die(errno, "fclose");
-					abort();
+					break;
 				}
 			}
 			first = false;
 		}
 	}
+	if (fclose(fp) != 0)
+		testutil_die(errno, "fclose");
+	abort();
+	/* NOTREACHED */
 }
 
 extern int __wt_optind;
@@ -243,8 +245,10 @@ main(int argc, char *argv[])
 	 * The offset is the beginning of the last record.  Truncate to
 	 * the middle of that last record (i.e. ahead of that offset).
 	 */
+	if (offset > UINT64_MAX - V_SIZE)
+		testutil_die(ERANGE, "offset");
 	new_offset = offset + V_SIZE;
-	printf("Parent: Truncate to %u\n", (uint32_t)new_offset);
+	printf("Parent: Truncate to %" PRIu64 "\n", new_offset);
 	if ((ret = truncate(LOG_FILE_1, (wt_off_t)new_offset)) != 0)
 		testutil_die(errno, "truncate");
 
@@ -267,9 +271,10 @@ main(int argc, char *argv[])
 	if ((ret = conn->close(conn, NULL)) != 0)
 		testutil_die(ret, "WT_CONNECTION:close");
 	if (count > max_key) {
-		printf("expected %u records found %u\n", max_key, count);
+		printf("expected %" PRIu32 " records found %" PRIu32 "\n",
+		    max_key, count);
 		return (EXIT_FAILURE);
 	}
-	printf("%u records verified\n", count);
+	printf("%" PRIu32 " records verified\n", count);
 	return (EXIT_SUCCESS);
 }
diff --git a/test/suite/helper.py b/test/suite/helper.py
index 3c460e23d08..f85d708880f 100644
--- a/test/suite/helper.py
+++ b/test/suite/helper.py
@@ -107,7 +107,10 @@ def copy_wiredtiger_home(olddir, newdir, aligned=True):
     for fname in os.listdir(olddir):
         fullname = os.path.join(olddir, fname)
         # Skip lock file, on Windows it is locked.
-        if os.path.isfile(fullname) and "WiredTiger.lock" not in fullname:
+        # Skip temporary log files.
+        if os.path.isfile(fullname) and "WiredTiger.lock" not in fullname and \
+            "WiredTigerTmplog" not in fullname and \
+            "WiredTigerPreplog" not in fullname:
             # Use a dd command that does not align on a block boundary.
             if aligned:
                 shutil.copy(fullname, newdir)
@@ -196,31 +199,36 @@ def complex_populate_index_count():
 #    config:    prefix of the session.create configuration string
 #    rows:      entries to insert
 def complex_populate(self, uri, config, rows):
-        complex_populate_type(self, uri, config, rows, '')
+        complex_populate_type(self, uri, config, '', rows, '')
+def complex_populate_cgconfig(self, uri, config, rows):
+        complex_populate_type(self, uri, config, config, rows, '')
 def complex_populate_lsm(self, uri, config, rows):
-        complex_populate_type(self, uri, config, rows, 'type=lsm')
-def complex_populate_type(self, uri, config, rows, type):
+        complex_populate_type(self, uri, config, '', rows, 'type=lsm')
+def complex_populate_cgconfig_lsm(self, uri, config, rows):
+        complex_populate_type(self, uri, config, config, rows, 'type=lsm')
+def complex_populate_type(self, uri, config, cgconfig, rows, type):
     self.session.create(uri,
         config + ',value_format=SiSS,' +
         'columns=(record,column2,column3,column4,column5),' +
         'colgroups=(cgroup1,cgroup2,cgroup3,cgroup4,cgroup5,cgroup6)')
 
     cgname = 'colgroup:' + uri.split(":")[1]
-    self.session.create(cgname + ':cgroup1', 'columns=(column2)' + ',' + type)
-    self.session.create(cgname + ':cgroup2', 'columns=(column3)' + ',' + type)
-    self.session.create(cgname + ':cgroup3', 'columns=(column4)' + ',' + type)
+    cgcfg = ',' + cgconfig + ',' + type
+    self.session.create(cgname + ':cgroup1', 'columns=(column2)' + ',' + cgcfg)
+    self.session.create(cgname + ':cgroup2', 'columns=(column3)' + ',' + cgcfg)
+    self.session.create(cgname + ':cgroup3', 'columns=(column4)' + ',' + cgcfg)
     self.session.create(
-        cgname + ':cgroup4', 'columns=(column2,column3)' + ',' + type)
+        cgname + ':cgroup4', 'columns=(column2,column3)' + ',' + cgcfg)
     self.session.create(
-        cgname + ':cgroup5', 'columns=(column3,column4)' + ',' + type)
+        cgname + ':cgroup5', 'columns=(column3,column4)' + ',' + cgcfg)
     self.session.create(
-        cgname + ':cgroup6', 'columns=(column2,column4,column5)' + ',' + type)
+        cgname + ':cgroup6', 'columns=(column2,column4,column5)' + ',' + cgcfg)
     indxname = 'index:' + uri.split(":")[1]
-    self.session.create(indxname + ':indx1', 'columns=(column2)' + ',' + type)
-    self.session.create(indxname + ':indx2', 'columns=(column3)' + ',' + type)
-    self.session.create(indxname + ':indx3', 'columns=(column4)' + ',' + type)
+    self.session.create(indxname + ':indx1', 'columns=(column2)' + ',' + cgcfg)
+    self.session.create(indxname + ':indx2', 'columns=(column3)' + ',' + cgcfg)
+    self.session.create(indxname + ':indx3', 'columns=(column4)' + ',' + cgcfg)
     self.session.create(
-        indxname + ':indx4', 'columns=(column2,column4)' + ',' + type)
+        indxname + ':indx4', 'columns=(column2,column4)' + ',' + cgcfg)
     cursor = self.session.open_cursor(uri, None)
     for i in range(1, rows + 1):
         cursor[key_populate(cursor, i)] = \
@@ -228,9 +236,9 @@ def complex_populate_type(self, uri, config, rows, type):
     cursor.close()
     # add some indices after populating
     self.session.create(
-        indxname + ':indx5', 'columns=(column3,column5)' + ',' + type)
+        indxname + ':indx5', 'columns=(column3,column5)' + ',' + cgcfg)
     self.session.create(
-        indxname + ':indx6', 'columns=(column3,column5,column4)' + ',' + type)
+        indxname + ':indx6', 'columns=(column3,column5,column4)' + ',' + cgcfg)
 
 def complex_populate_colgroup_name(self, uri, i):
     return 'colgroup:' + uri.split(":")[1] + ':cgroup' + str(i + 1)
diff --git a/test/suite/test_bug008.py b/test/suite/test_bug008.py
index 8f0526d9cef..0243887e258 100644
--- a/test/suite/test_bug008.py
+++ b/test/suite/test_bug008.py
@@ -33,65 +33,208 @@ import wiredtiger, wttest
 from helper import simple_populate, key_populate, value_populate
 from wtscenario import check_scenarios
 
-# Tests for invisible updates.
+# Test search/search-near operations, including invisible values and keys
+# past the end of the table.
 class test_bug008(wttest.WiredTigerTestCase):
+    uri = 'file:test_bug008'                # This is a btree layer test.
     scenarios = check_scenarios([
-        ('fix', dict(fmt='key_format=r,value_format=8t', empty=1)),
-        ('row', dict(fmt='key_format=S', empty=0)),
-        ('var', dict(fmt='key_format=r', empty=0))
+        ('fix', dict(fmt='key_format=r,value_format=8t', empty=1, colvar=0)),
+        ('row', dict(fmt='key_format=S', empty=0, colvar=0)),
+        ('var', dict(fmt='key_format=r', empty=0, colvar=1))
     ])
 
+    # Verify cursor search and search-near operations in an empty table.
+    def test_search_empty(self):
+        # Create the object and open a cursor.
+        self.session.create(self.uri, self.fmt)
+        cursor = self.session.open_cursor(self.uri, None)
+
+        # Search for a record past the end of the table, which should fail.
+        cursor.set_key(key_populate(cursor, 100))
+        self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+
+        # Search-near for a record past the end of the table, which should fail.
+        cursor.set_key(key_populate(cursor, 100))
+        self.assertEqual(cursor.search_near(), wiredtiger.WT_NOTFOUND)
+
+    # Verify cursor search and search-near operations at and past the end of
+    # a file, with a set of on-page visible records.
+    def test_search_eot(self):
+        # Populate the tree and reopen the connection, forcing it to disk
+        # and moving the records to an on-page format.
+        simple_populate(self, self.uri, self.fmt, 100)
+        self.reopen_conn()
+
+        # Open a cursor.
+        cursor = self.session.open_cursor(self.uri, None)
+
+        # Search for a record at the end of the table, which should succeed.
+        cursor.set_key(key_populate(cursor, 100))
+        self.assertEqual(cursor.search(), 0)
+        self.assertEqual(cursor.get_key(), key_populate(cursor, 100))
+        self.assertEqual(cursor.get_value(), value_populate(cursor, 100))
+
+        # Search-near for a record at the end of the table, which should
+        # succeed, returning the last record.
+        cursor.set_key(key_populate(cursor, 100))
+        self.assertEqual(cursor.search_near(), 0)
+        self.assertEqual(cursor.get_key(), key_populate(cursor, 100))
+        self.assertEqual(cursor.get_value(), value_populate(cursor, 100))
+
+        # Search for a record past the end of the table, which should fail.
+        cursor.set_key(key_populate(cursor, 200))
+        self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+
+        # Search-near for a record past the end of the table, which should
+        # succeed, returning the last record.
+        cursor.set_key(key_populate(cursor, 200))
+        self.assertEqual(cursor.search_near(), -1)
+        self.assertEqual(cursor.get_key(), key_populate(cursor, 100))
+        self.assertEqual(cursor.get_value(), value_populate(cursor, 100))
+
+    # Verify cursor search-near operations before and after a set of
+    # column-store duplicates.
+    def test_search_duplicate(self):
+        if self.colvar == 0:
+                return
+
+        # Populate the tree.
+        simple_populate(self, self.uri, self.fmt, 105)
+
+        # Set up deleted records before and after a set of duplicate records,
+        # and make sure search/search-near returns the correct record.
+        cursor = self.session.open_cursor(self.uri, None)
+        for i in range(20, 100):
+            cursor[key_populate(cursor, i)] = '=== IDENTICAL VALUE ==='
+        for i in range(15, 25):
+            cursor.set_key(key_populate(cursor, i))
+            self.assertEqual(cursor.remove(), 0)
+        for i in range(95, 106):
+            cursor.set_key(key_populate(cursor, i))
+            self.assertEqual(cursor.remove(), 0)
+        cursor.close()
+
+        # Reopen the connection, forcing it to disk and moving the records to
+        # an on-page format.
+        self.reopen_conn()
+
+        # Open a cursor.
+        cursor = self.session.open_cursor(self.uri, None)
+
+        # Search-near for a record in the deleted set before the duplicate set,
+        # which should succeed, returning the first record in the duplicate set.
+        cursor.set_key(key_populate(cursor, 18))
+        self.assertEqual(cursor.search_near(), 1)
+        self.assertEqual(cursor.get_key(), key_populate(cursor, 25))
+
+        # Search-near for a record in the deleted set after the duplicate set,
+        # which should succeed, returning the last record in the duplicate set.
+        cursor.set_key(key_populate(cursor, 98))
+        self.assertEqual(cursor.search_near(), -1)
+        self.assertEqual(cursor.get_key(), key_populate(cursor, 94))
+
     # Verify cursor search and search-near operations on a file with a set of
     # on-page visible records, and a set of insert-list invisible records.
     def test_search_invisible_one(self):
-        uri = 'file:test_bug008'                # This is a btree layer test.
+        # Populate the tree.
+        simple_populate(self, self.uri, self.fmt, 100)
 
-        # Populate the tree and reopen the connection, forcing it to disk
-        # and moving the records to an on-page format.
-        simple_populate(self, uri, self.fmt, 100)
+        # Delete a range of records.
+        for i in range(5, 10):
+            cursor = self.session.open_cursor(self.uri, None)
+            cursor.set_key(key_populate(cursor, i))
+            self.assertEqual(cursor.remove(), 0)
+
+        # Reopen the connection, forcing it to disk and moving the records to
+        # an on-page format.
         self.reopen_conn()
 
-        # Begin a transaction, and add some additional records.
+        # Add updates to the existing records (in both the deleted and undeleted
+        # range), as well as some new records after the end. Put the updates in
+        # a separate transaction so they're invisible to another cursor.
         self.session.begin_transaction()
-        cursor = self.session.open_cursor(uri, None)
+        cursor = self.session.open_cursor(self.uri, None)
+        for i in range(5, 10):
+            cursor[key_populate(cursor, i)] = value_populate(cursor, i + 1000)
+        for i in range(30, 40):
+            cursor[key_populate(cursor, i)] = value_populate(cursor, i + 1000)
         for i in range(100, 140):
-            cursor[key_populate(cursor, i)] = value_populate(cursor, i)
+            cursor[key_populate(cursor, i)] = value_populate(cursor, i + 1000)
 
         # Open a separate session and cursor.
         s = self.conn.open_session()
-        cursor = s.open_cursor(uri, None)
+        cursor = s.open_cursor(self.uri, None)
 
-        # Search for an invisible record.
-        cursor.set_key(key_populate(cursor, 130))
-        if self.empty:
-            # Invisible updates to fixed-length column-store objects are
-            # invisible to the reader, but the fact that they exist past
-            # the end of the initial records causes the instantiation of
-            # empty records: confirm successful return of an empty row.
-            cursor.search()
-            self.assertEqual(cursor.get_key(), 130)
-            self.assertEqual(cursor.get_value(), 0)
-        else:
-            # Otherwise, we should not find any matching records.
-            self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+        # Search for an existing record in the deleted range, should not find
+        # it.
+        for i in range(5, 10):
+            cursor.set_key(key_populate(cursor, i))
+            if self.empty:
+                # Fixed-length column-store rows always exist.
+                self.assertEqual(cursor.search(), 0)
+                self.assertEqual(cursor.get_key(), i)
+                self.assertEqual(cursor.get_value(), 0)
+            else:
+                self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
 
-        # Search-near for an invisible record, which should succeed, returning
-        # the last visible record.
-        cursor.set_key(key_populate(cursor, 130))
-        cursor.search_near()
-        if self.empty:
-            # Invisible updates to fixed-length column-store objects are
-            # invisible to the reader, but the fact that they exist past
-            # the end of the initial records causes the instantiation of
-            # empty records: confirm successful return of an empty row.
-            cursor.search()
-            self.assertEqual(cursor.get_key(), 130)
-            self.assertEqual(cursor.get_value(), 0)
-        else:
-            # Otherwise, we should find the closest record for which we can see
-            # the value.
-            self.assertEqual(cursor.get_key(), key_populate(cursor, 100))
-            self.assertEqual(cursor.get_value(), value_populate(cursor, 100))
+        # Search for an existing record in the updated range, should see the
+        # original value.
+        for i in range(30, 40):
+            cursor.set_key(key_populate(cursor, i))
+            self.assertEqual(cursor.search(), 0)
+            self.assertEqual(cursor.get_key(), key_populate(cursor, i))
+
+        # Search for an added record, should not find it.
+        for i in range(120, 130):
+            cursor.set_key(key_populate(cursor, i))
+            if self.empty:
+                # Invisible updates to fixed-length column-store objects are
+                # invisible to the reader, but the fact that they exist past
+                # the end of the initial records causes the instantiation of
+                # empty records: confirm successful return of an empty row.
+                self.assertEqual(cursor.search(), 0)
+                self.assertEqual(cursor.get_key(), i)
+                self.assertEqual(cursor.get_value(), 0)
+            else:
+                # Otherwise, we should not find any matching records.
+                self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
+
+        # Search-near for an existing record in the deleted range, should find
+        # the next largest record. (This depends on the implementation behavior
+        # which currently includes a bias to prefix search.)
+        for i in range(5, 10):
+            cursor.set_key(key_populate(cursor, i))
+            if self.empty:
+                # Fixed-length column-store rows always exist.
+                self.assertEqual(cursor.search_near(), 0)
+                self.assertEqual(cursor.get_key(), i)
+                self.assertEqual(cursor.get_value(), 0)
+            else:
+                self.assertEqual(cursor.search_near(), 1)
+                self.assertEqual(cursor.get_key(), key_populate(cursor, 10))
+
+        # Search-near for an existing record in the updated range, should see
+        # the original value.
+        for i in range(30, 40):
+            cursor.set_key(key_populate(cursor, i))
+            self.assertEqual(cursor.search_near(), 0)
+            self.assertEqual(cursor.get_key(), key_populate(cursor, i))
+
+        # Search-near for an added record, should find the previous largest
+        # record.
+        for i in range(120, 130):
+            cursor.set_key(key_populate(cursor, i))
+            if self.empty:
+                # Invisible updates to fixed-length column-store objects are
+                # invisible to the reader, but the fact that they exist past
+                # the end of the initial records causes the instantiation of
+                # empty records: confirm successful return of an empty row.
+                self.assertEqual(cursor.search_near(), 0)
+                self.assertEqual(cursor.get_key(), i)
+                self.assertEqual(cursor.get_value(), 0)
+            else:
+                self.assertEqual(cursor.search_near(), -1)
+                self.assertEqual(cursor.get_key(), key_populate(cursor, 100))
 
     # Verify cursor search and search-near operations on a file with a set of
     # on-page visible records, a set of insert-list visible records, and a set
@@ -101,28 +244,26 @@ class test_bug008(wttest.WiredTigerTestCase):
     # fallback happens, whether the correct position is in the page slots or
     # the insert list.)
     def test_search_invisible_two(self):
-        uri = 'file:test_bug008'                # This is a btree layer test.
-
         # Populate the tree and reopen the connection, forcing it to disk
         # and moving the records to an on-page format.
-        simple_populate(self, uri, self.fmt, 100)
+        simple_populate(self, self.uri, self.fmt, 100)
         self.reopen_conn()
 
         # Add some additional visible records.
-        cursor = self.session.open_cursor(uri, None)
+        cursor = self.session.open_cursor(self.uri, None)
         for i in range(100, 120):
             cursor[key_populate(cursor, i)] = value_populate(cursor, i)
         cursor.close()
 
         # Begin a transaction, and add some additional records.
         self.session.begin_transaction()
-        cursor = self.session.open_cursor(uri, None)
+        cursor = self.session.open_cursor(self.uri, None)
         for i in range(120, 140):
             cursor[key_populate(cursor, i)] = value_populate(cursor, i)
 
         # Open a separate session and cursor.
         s = self.conn.open_session()
-        cursor = s.open_cursor(uri, None)
+        cursor = s.open_cursor(self.uri, None)
 
         # Search for an invisible record.
         cursor.set_key(key_populate(cursor, 130))
diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py
index 9955944f73d..6e1ad7814ed 100644
--- a/test/suite/test_checkpoint01.py
+++ b/test/suite/test_checkpoint01.py
@@ -185,7 +185,7 @@ class test_checkpoint_cursor(wttest.WiredTigerTestCase):
         # Check dropping all checkpoints fails.
         msg = '/checkpoints cannot be dropped/'
         self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
-            lambda: self.session.checkpoint("name=checkpoint-2"), msg)
+            lambda: self.session.checkpoint("force,name=checkpoint-2"), msg)
         self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
             lambda: self.session.checkpoint("drop=(checkpoint-2)"), msg)
         self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
diff --git a/test/suite/test_collator.py b/test/suite/test_collator.py
new file mode 100644
index 00000000000..34b5c20247f
--- /dev/null
+++ b/test/suite/test_collator.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import wiredtiger, wttest, run
+from wtscenario import check_scenarios, number_scenarios
+
+# test_collator.py
+#    Test indices using a custom extractor and collator.
+class test_collator(wttest.WiredTigerTestCase):
+    """
+    Test indices with a custom extractor to create an index,
+    with our own collator.
+    Our set of rows looks like a multiplication table:
+      row '0':  '0,0,0,0'
+      row '1':  '0,1,2,3'
+      row '2':  '0,2,4,6'
+    with the twist that entries are mod 100.  So, looking further:
+      row '40':  '0,40,80,20'
+
+    Each column is placed into its own index.  Our collator reverses
+    the values.
+    """
+    nentries = 100
+    nindices = 4
+
+    # Return the wiredtiger_open extension argument for a shared library.
+    def extensionArg(self, exts):
+        extfiles = []
+        for ext in exts:
+            (dirname, name, libname) = ext
+            if name != None and name != 'none':
+                testdir = os.path.dirname(__file__)
+                extdir = os.path.join(run.wt_builddir, 'ext', dirname)
+                extfile = os.path.join(
+                    extdir, name, '.libs', 'libwiredtiger_' + libname + '.so')
+                if not os.path.exists(extfile):
+                    self.skipTest('extension "' + extfile + '" not built')
+                if not extfile in extfiles:
+                    extfiles.append(extfile)
+        if len(extfiles) == 0:
+            return ''
+        else:
+            return ',extensions=["' + '","'.join(extfiles) + '"]'
+
+    # Override WiredTigerTestCase, we have extensions.
+    def setUpConnectionOpen(self, dir):
+        extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor'),
+                                ('collators', 'revint', 'revint_collator')])
+        connarg = 'create,error_prefix="{0}: ",{1}'.format(
+            self.shortid(), extarg)
+        conn = self.wiredtiger_open(dir, connarg)
+        self.pr(`conn`)
+        return conn
+
+    def create_indices(self):
+        # Create self.nindices index files, each with a column from the CSV
+        for i in range(0, self.nindices):
+            si = str(i)
+            self.session.create('index:collator:x' + si,
+                                'key_format=i,columns=(key),' +
+                                'collator=revint,' +
+                                'extractor=csv,app_metadata={"format" : "i",' +
+                                '"field" : "' + si + '"}')
+
+    def drop_indices(self):
+        for i in range(0, self.nindices):
+            self.session.drop("index:collator:x" + str(i))
+
+    def csv(self, s, i):
+        return s.split(',')[i]
+
+    def expected_main_value(self, i):
+        return ','.join([str((i*j)%100) for j in range(0, self.nindices)])
+
+    # We split the population into two phases
+    # (in anticipation of future tests that create
+    # indices between the two population steps).
+    def populate(self):
+        cursor = self.session.open_cursor('table:collator', None, None)
+        for i in range(0, self.nentries):
+            cursor[i] = self.expected_main_value(i)
+        cursor.close()
+
+    def check_entries(self):
+        cursor = self.session.open_cursor('table:collator', None, None)
+        icursor = []
+        for i in range(0, self.nindices):
+            icursor.append(self.session.open_cursor('index:collator:x' + str(i),
+                                                    None, None))
+        i = 0
+        for primkey, value in cursor:
+            # Check main table
+            expect = self.expected_main_value(i)
+            self.assertEqual(i, primkey)
+            self.assertEqual(value, expect)
+            for idx in range(0, self.nindices):
+                c = icursor[idx]
+                indexkey = (i*idx)%100
+                c.set_key(indexkey)
+                self.assertEqual(c.search(), 0)
+                value = c.get_value()
+                key = c.get_key()
+                while value != expect and key == indexkey and \
+                      self.csv(value, idx) == self.csv(expect, idx):
+                    self.assertEqual(0, c.next())
+                    value = c.get_value()
+                    key = c.get_key()
+                self.assertEqual(value, expect)
+            i += 1
+        self.assertEqual(self.nentries, i)
+        for i in range(0, self.nindices):
+            c = icursor[i]
+            c.reset()
+            expected = set(range(0, self.nentries))
+            for key, val in c:
+                primkey = int(val.split(',')[1])
+                expected.remove(primkey)
+            self.assertEquals(0, len(expected))
+            c.close()
+
+    def test_index(self):
+        self.session.create("table:collator", "key_format=i,value_format=S,"
+                            "columns=(primarykey,value)")
+        self.create_indices()
+        self.populate()
+        self.check_entries()
+
+        # Drop and recreate all indices, everything should be there.
+        self.drop_indices()
+        self.create_indices()
+        self.check_entries()
+
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_drop.py b/test/suite/test_drop.py
index 5663b85d661..52ea7251ab5 100644
--- a/test/suite/test_drop.py
+++ b/test/suite/test_drop.py
@@ -41,12 +41,11 @@ class test_drop(wttest.WiredTigerTestCase):
     scenarios = check_scenarios([
         ('file', dict(uri='file:')),
         ('table', dict(uri='table:')),
-        #Not yet: drop failing with an open cursor needs handle locking
-        #('table-lsm', dict(uri='table:', extra_config=',type=lsm')),
+        ('table-lsm', dict(uri='table:', extra_config=',type=lsm')),
     ])
 
     # Populate an object, remove it and confirm it no longer exists.
-    def drop(self, populate, with_cursor, close_session, drop_index):
+    def drop(self, populate, with_cursor, reopen, drop_index):
         uri = self.uri + self.name
         populate(self, uri, 'key_format=S' + self.extra_config, 10)
 
@@ -57,7 +56,7 @@ class test_drop(wttest.WiredTigerTestCase):
                 lambda: self.session.drop(uri, None))
             cursor.close()
 
-        if close_session:
+        if reopen:
             self.reopen_conn()
 
         if drop_index:
@@ -73,17 +72,17 @@ class test_drop(wttest.WiredTigerTestCase):
         # Try all combinations except dropping the index, the simple
         # case has no indices.
         for with_cursor in [False, True]:
-            for close_session in [False, True]:
-                self.drop(simple_populate, with_cursor, close_session, False)
+            for reopen in [False, True]:
+                self.drop(simple_populate, with_cursor, reopen, False)
 
         # A complex, multi-file table object.
         # Try all test combinations.
         if self.uri == "table:":
             for with_cursor in [False, True]:
-                for close_session in [False, True]:
+                for reopen in [False, True]:
                     for drop_index in [False, True]:
                         self.drop(complex_populate, with_cursor,
-                                  close_session, drop_index)
+                                  reopen, drop_index)
 
     # Test drop of a non-existent object: force succeeds, without force fails.
     def test_drop_dne(self):
diff --git a/test/suite/test_drop02.py b/test/suite/test_drop02.py
new file mode 100644
index 00000000000..677ba3866b2
--- /dev/null
+++ b/test/suite/test_drop02.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from helper import simple_populate
+
+# test_drop02.py
+#    Test dropping an LSM tree on first open. There was a bug where this
+#    would cause an assertion failure: WT-2501
+class test_drop02(wttest.WiredTigerTestCase):
+    name = 'test_drop02'
+
+    # Populate an LSM tree, reopen the connection, then drop the tree.
+    def test_drop(self):
+        uri = 'lsm:' + self.name
+        simple_populate(self, uri, 'key_format=S', 100000)
+        self.reopen_conn()
+
+        self.session.drop(uri, None)
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_dump.py b/test/suite/test_dump.py
index c850d1b5d3f..fc1422155e2 100644
--- a/test/suite/test_dump.py
+++ b/test/suite/test_dump.py
@@ -29,8 +29,8 @@
 import os
 import wiredtiger, wttest
 from helper import \
-    complex_populate, complex_populate_check_cursor,\
-    simple_populate, simple_populate_check_cursor
+    complex_populate, complex_populate_check, \
+    simple_populate, simple_populate_check
 from suite_subprocess import suite_subprocess
 from wtscenario import multiply_scenarios, number_scenarios
 
@@ -54,15 +54,24 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
         ('string', dict(keyfmt='S'))
     ]
     types = [
-        ('file', dict(type='file:',
+        ('file', dict(uri='file:', config='', lsm=False,
           populate=simple_populate,
-          populate_check=simple_populate_check_cursor)),
-        ('table-simple', dict(type='table:',
+          populate_check=simple_populate_check)),
+        ('lsm', dict(uri='lsm:', config='', lsm=True,
           populate=simple_populate,
-          populate_check=simple_populate_check_cursor)),
-        ('table-complex', dict(type='table:',
+          populate_check=simple_populate_check)),
+        ('table-simple', dict(uri='table:', config='', lsm=False,
+          populate=simple_populate,
+          populate_check=simple_populate_check)),
+        ('table-simple-lsm', dict(uri='table:', config='type=lsm', lsm=True,
+          populate=simple_populate,
+          populate_check=simple_populate_check)),
+        ('table-complex', dict(uri='table:', config='', lsm=False,
+          populate=complex_populate,
+          populate_check=complex_populate_check)),
+        ('table-complex-lsm', dict(uri='table:', config='type=lsm', lsm=True,
           populate=complex_populate,
-          populate_check=complex_populate_check_cursor))
+          populate_check=complex_populate_check))
     ]
     scenarios = number_scenarios(
         multiply_scenarios('.', types, keyfmt, dumpfmt))
@@ -94,9 +103,14 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
 
     # Dump, re-load and do a content comparison.
     def test_dump(self):
+        # LSM and column-store are not a valid combination.
+        if self.lsm and self.keyfmt == 'r':
+                return
+
         # Create the object.
-        uri = self.type + self.name
-        self.populate(self, uri, 'key_format=' + self.keyfmt, self.nentries)
+        uri = self.uri + self.name
+        self.populate(self, uri,
+            self.config + ',key_format=' + self.keyfmt, self.nentries)
 
         # Dump the object.
         os.mkdir(self.dir)
@@ -108,11 +122,17 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
         # Re-load the object.
         self.runWt(['-h', self.dir, 'load', '-f', 'dump.out'])
 
-        # Check the contents
+        # Check the database contents
+        self.runWt(['list'], outfilename='list.out')
+        self.runWt(['-h', self.dir, 'list'], outfilename='list.out.new')
+        s1 = set(open('list.out').read().split())
+        s2 = set(open('list.out.new').read().split())
+        self.assertEqual(not s1.symmetric_difference(s2), True)
+
+        # Check the object's contents
         conn = self.wiredtiger_open(self.dir)
         session = conn.open_session()
-        cursor = session.open_cursor(uri, None, None)
-        self.populate_check(self, cursor, self.nentries)
+        self.populate_check(self, uri, self.nentries)
         conn.close()
 
         # Re-load the object again.
@@ -121,8 +141,7 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
         # Check the contents, they shouldn't have changed.
         conn = self.wiredtiger_open(self.dir)
         session = conn.open_session()
-        cursor = session.open_cursor(uri, None, None)
-        self.populate_check(self, cursor, self.nentries)
+        self.populate_check(self, uri, self.nentries)
         conn.close()
 
         # Re-load the object again, but confirm -n (no overwrite) fails.
@@ -130,7 +149,7 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
             'load', '-n', '-f', 'dump.out'], errfilename='errfile.out')
         self.check_non_empty_file('errfile.out')
 
-        # If there is are indices, dump one of them and check the output.
+        # If there are indices, dump one of them and check the output.
         if self.populate == complex_populate:
             indexuri = 'index:' + self.name + ':indx1'
             hexopt = ['-x'] if self.hex == 1 else []
diff --git a/test/suite/test_join01.py b/test/suite/test_join01.py
index 539a3a3ae57..4aa2bc6e269 100644
--- a/test/suite/test_join01.py
+++ b/test/suite/test_join01.py
@@ -74,8 +74,18 @@ class test_join01(wttest.WiredTigerTestCase):
     # the join cursor and iterating again.
     def stats(self, jc, which):
         statcur = self.session.open_cursor('statistics:join', jc, None)
-        self.check_stats(statcur, 0, 'join: index:join01:index1: ' +
-                         'bloom filter false positives')
+        # Pick a stat we always expect to see.
+        statdesc = 'bloom filter false positives'
+        expectstats = ['join: index:join01:index1: ' + statdesc,
+                       'join: index:join01:index2: ' + statdesc]
+        if self.ref == 'index':
+            expectstats.append('join: index:join01:index0: ' + statdesc)
+        else:
+            expectstats.append('join: table:join01: ' + statdesc)
+        # check_stats() removes found entries from its list; pass a fresh copy.
+        self.check_stats(statcur, list(expectstats))
+        statcur.reset()
+        self.check_stats(statcur, list(expectstats))
         statcur.close()
 
     def statstr_to_int(self, str):
@@ -86,16 +96,14 @@ class test_join01(wttest.WiredTigerTestCase):
         parts = str.rpartition('(')
         return int(parts[2].rstrip(')'))
 
-    # string should appear with a minimum value of least "min".
-    def check_stats(self, statcursor, min, lookfor):
+    # All of the strings in expectstats should appear in the statistics.
+    def check_stats(self, statcursor, expectstats):
         stringclass = ''.__class__
         intclass = (0).__class__
 
         # Reset the cursor, we're called multiple times.
         statcursor.reset()
 
-        found = False
-        foundval = 0
         self.printVerbose(3, 'statistics:')
         for id, desc, valstr, val in statcursor:
             self.assertEqual(type(desc), stringclass)
@@ -104,12 +112,11 @@ class test_join01(wttest.WiredTigerTestCase):
             self.assertEqual(val, self.statstr_to_int(valstr))
             self.printVerbose(3, '  stat: \'' + desc + '\', \'' +
                               valstr + '\', ' + str(val))
-            if desc == lookfor:
-                found = True
-                foundval = val
+            if desc in expectstats:
+                expectstats.remove(desc)
 
-        self.assertTrue(found, 'in stats, did not see: ' + lookfor)
-        self.assertTrue(foundval >= min)
+        self.assertTrue(len(expectstats) == 0,
+                        'missing expected values in stats: ' + str(expectstats))
 
     # Common function for testing the most basic functionality
     # of joins
@@ -141,7 +148,8 @@ class test_join01(wttest.WiredTigerTestCase):
         # and examine primary keys 2,5,8,...,95,98,1,4,7,...,94,97.
         jc = self.session.open_cursor('join:table:join01' + proj_suffix,
                                       None, None)
-        c2 = self.session.open_cursor('index:join01:index2', None, None)
+        # Adding a projection to a reference cursor should be allowed.
+        c2 = self.session.open_cursor('index:join01:index2(v1)', None, None)
         c2.set_key(99)   # skips all entries w/ primary key divisible by three
         self.assertEquals(0, c2.search())
         self.session.join(jc, c2, 'compare=gt')
@@ -159,12 +167,12 @@ class test_join01(wttest.WiredTigerTestCase):
 
         # Then select all numbers whose reverse string representation
         # is in '20' < x < '40'.
-        c1a = self.session.open_cursor('index:join01:index1', None, None)
+        c1a = self.session.open_cursor('index:join01:index1(v1)', None, None)
         c1a.set_key('21')
         self.assertEquals(0, c1a.search())
         self.session.join(jc, c1a, 'compare=gt' + joincfg1)
 
-        c1b = self.session.open_cursor('index:join01:index1', None, None)
+        c1b = self.session.open_cursor('index:join01:index1(v1)', None, None)
         c1b.set_key('41')
         self.assertEquals(0, c1b.search())
         self.session.join(jc, c1b, 'compare=lt' + joincfg1)
diff --git a/test/suite/test_join02.py b/test/suite/test_join02.py
index d122de8a0eb..a691c499cf6 100644
--- a/test/suite/test_join02.py
+++ b/test/suite/test_join02.py
@@ -179,15 +179,16 @@ class test_join02(wttest.WiredTigerTestCase):
         c.close()
 
         # Use the primary table in one of the joins.
+        # Use various projections, which should not matter for ref cursors
         c0a = self.session.open_cursor('table:join02', None, None)
-        c0b = self.session.open_cursor('table:join02', None, None)
-        c1a = self.session.open_cursor('index:join02:index1', None, None)
+        c0b = self.session.open_cursor('table:join02(v4)', None, None)
+        c1a = self.session.open_cursor('index:join02:index1(v0)', None, None)
         c1b = self.session.open_cursor('index:join02:index1', None, None)
         c2a = self.session.open_cursor('index:join02:index2', None, None)
         c2b = self.session.open_cursor('index:join02:index2', None, None)
-        c3a = self.session.open_cursor('index:join02:index3', None, None)
-        c3b = self.session.open_cursor('index:join02:index3', None, None)
-        c4a = self.session.open_cursor('index:join02:index4', None, None)
+        c3a = self.session.open_cursor('index:join02:index3(v4)', None, None)
+        c3b = self.session.open_cursor('index:join02:index3(v0)', None, None)
+        c4a = self.session.open_cursor('index:join02:index4(v1)', None, None)
 
         # Attach extra properties to each cursor.  For cursors that
         # may appear on the 'left' side of a range CA < x < CB,
diff --git a/test/suite/test_join05.py b/test/suite/test_join05.py
new file mode 100644
index 00000000000..ef2be4c6460
--- /dev/null
+++ b/test/suite/test_join05.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
+
+# test_join05.py
+#    Tests based on JIRA reports
+class test_join05(wttest.WiredTigerTestCase):
+
+    # WT-2384: join where the first (and only) index has just an lt/le bound.
+    def test_wt_2384(self):
+        self.session.create("table:test_2384",
+                       "key_format=i,value_format=i,columns=(k,v)")
+        self.session.create("index:test_2384:index", "columns=(v)")
+        cursor = self.session.open_cursor("table:test_2384", None, None)
+        cursor[1] = 11
+        cursor[2] = 12
+        cursor[3] = 13
+        cursor.close()
+
+        cursor = self.session.open_cursor("index:test_2384:index", None, None)
+        cursor.set_key(13)
+        self.assertEquals(cursor.search(), 0)
+
+        jcursor = self.session.open_cursor("join:table:test_2384", None, None)
+        self.session.join(jcursor, cursor, "compare=lt")
+
+        nr_found = 0
+        while jcursor.next() == 0:
+            [k] = jcursor.get_keys()
+            [v] = jcursor.get_values()
+            #self.tty("jcursor: k=" + str(k) + ", v=" + str(v))
+            nr_found += 1
+
+        self.assertEquals(nr_found, 2)
+        jcursor.close()
+        cursor.close()
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_join06.py b/test/suite/test_join06.py
new file mode 100644
index 00000000000..9af6f93792f
--- /dev/null
+++ b/test/suite/test_join06.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import wiredtiger, wttest, run
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
+
+# test_join06.py
+#    Join operations
+#    Joins under read-uncommitted and default isolation
+class test_join06(wttest.WiredTigerTestCase):
+    nentries = 1000
+
+    isoscen = [
+        ('isolation_read_uncommitted', dict(uncommitted=True)),
+        ('isolation_default', dict(uncommitted=False))
+    ]
+
+    bloomscen = [
+        ('bloom', dict(bloom=True)),
+        ('nobloom', dict(bloom=False))
+    ]
+
+    scenarios = number_scenarios(multiply_scenarios('.', isoscen, bloomscen))
+
+    def gen_values(self, i):
+        s = str(i)                    # 345 => "345"
+        f = s[0:1] + s[0:1] + s[0:1]  # 345 => "333"
+        return [s, f]
+
+    def gen_values2(self, i):
+        s = str(i)                    # 345 => "345"
+        l = s[-1:] + s[-1:] + s[-1:]  # 345 => "555"
+        return [s, l]
+
+    def populate(self, s, gen_values):
+        c = s.open_cursor('table:join06', None, None)
+        for i in range(0, self.nentries):
+            c.set_key(i)
+            c.set_value(*gen_values(i))
+            c.insert()
+        c.close()
+
+    # Join two indices, then rewrite the table from a second session while
+    # iterating, and check what the join sees at each isolation level.
+    def test_join(self):
+        self.session.create('table:join06',
+                            'columns=(k,v0,v1),key_format=i,value_format=SS')
+        self.session.create('index:join06:index0','columns=(v0)')
+        self.session.create('index:join06:index1','columns=(v1)')
+
+        self.populate(self.session, self.gen_values)
+
+        # TODO: needed?
+        #self.reopen_conn()
+
+        if self.uncommitted:
+            self.session.begin_transaction('isolation=read-uncommitted')
+
+        jc = self.session.open_cursor('join:table:join06', None, None)
+        c0 = self.session.open_cursor('index:join06:index0', None, None)
+        c0.set_key('520')
+        self.assertEquals(0, c0.search())
+        self.session.join(jc, c0, 'compare=ge')
+
+        joinconfig = 'compare=eq'
+        if self.bloom:
+            joinconfig += ',strategy=bloom,count=1000'
+        c1 = self.session.open_cursor('index:join06:index1', None, None)
+        c1.set_key('555')
+        self.assertEquals(0, c1.search())
+        self.session.join(jc, c1, joinconfig)
+
+        if self.uncommitted and self.bloom:
+            # Make sure that read-uncommitted with Bloom is not allowed.
+            # This is detected on the first next() operation.
+            msg = '/cannot be used with read-uncommitted/'
+            self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+                lambda: jc.next(), msg)
+            return
+
+        # Changes made in another session may or may not be visible to us,
+        # depending on the isolation level.
+        if self.uncommitted:
+            # isolation level is read-uncommitted, so we will see
+            # additions and deletions made in our other session.
+            mbr = set(range(525,1000,10)) | set(range(55,100,10)) | set([520])
+        else:
+            # default isolation level, so we should see a consistent
+            # set at the time we begin iteration.
+            mbr = set(range(520,600)) | set(range(53,60))
+
+        altered = False
+
+        while jc.next() == 0:
+            [k] = jc.get_keys()
+            [v0,v1] = jc.get_values()
+            #self.tty('GOT: ' + str(k) + ': ' + str(jc.get_values()))
+            if altered and self.uncommitted:
+                self.assertEquals(self.gen_values2(k), [v0, v1])
+            else:
+                self.assertEquals(self.gen_values(k), [v0, v1])
+            if not k in mbr:
+                self.tty('**** ERROR: result ' + str(k) + ' is not in: ' +
+                         str(mbr))
+            self.assertTrue(k in mbr)
+            mbr.remove(k)
+
+            # In another session, rewrite every row so that v1 is built from
+            # the key's last digit instead of its first; only keys ending in
+            # 5 then match the '555' join key.  Depending on the isolation
+            # level, these changes may or may not be visible to this session.
+            if not altered:
+                s = self.conn.open_session(None)
+                s.begin_transaction(None)
+                self.populate(s, self.gen_values2)
+                s.commit_transaction()
+                s.close()
+                altered = True
+
+        if len(mbr) != 0:
+            self.tty('**** ERROR: did not see these: ' + str(mbr))
+        self.assertEquals(0, len(mbr))
+
+        jc.close()
+        c1.close()
+        c0.close()
+        if self.uncommitted:
+            self.session.commit_transaction()
+        self.session.drop('table:join06')
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_lsm03.py b/test/suite/test_lsm03.py
new file mode 100644
index 00000000000..448d864c646
--- /dev/null
+++ b/test/suite/test_lsm03.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wtscenario, wttest
+from helper import simple_populate
+
+# test_lsm03.py
+#    Check to make sure that LSM schema operations don't get EBUSY when
+#    there are no user operations active.
+class test_lsm03(wttest.WiredTigerTestCase):
+    name = 'test_lsm03'
+
+    # Use small pages so we generate some internal layout
+    # Setup LSM so multiple chunks are present
+    config = 'key_format=S,allocation_size=512,internal_page_max=512' + \
+             ',leaf_page_max=1k,lsm=(chunk_size=512k,merge_min=10)'
+
+    # Populate an object then drop it.
+    def test_lsm_drop_active(self):
+        uri = 'lsm:' + self.name
+        simple_populate(self, uri, self.config, 10000)
+
+        # Force to disk
+        self.reopen_conn()
+
+        # An open cursor should cause the drop to fail.
+        cursor = self.session.open_cursor(uri, None, None)
+        self.assertRaises(wiredtiger.WiredTigerError,
+            lambda: self.session.drop(uri, None))
+        cursor.close()
+
+        # Add enough records that a merge should be running
+        simple_populate(self, uri, self.config, 50000)
+        # The drop should succeed even when LSM work units are active
+        self.session.drop(uri)
diff --git a/test/suite/test_rebalance.py b/test/suite/test_rebalance.py
index 80cce6ed514..f2167e864c9 100644
--- a/test/suite/test_rebalance.py
+++ b/test/suite/test_rebalance.py
@@ -59,7 +59,7 @@ class test_rebalance(wttest.WiredTigerTestCase):
         if with_cursor:
             cursor = self.session.open_cursor(uri, None, None)
             self.assertRaises(wiredtiger.WiredTigerError,
-                lambda: self.session.drop(uri, None))
+                lambda: self.session.rebalance(uri, None))
             cursor.close()
 
         self.session.rebalance(uri, None)
diff --git a/test/suite/test_schema07.py b/test/suite/test_schema07.py
new file mode 100644
index 00000000000..ac397c6e1a1
--- /dev/null
+++ b/test/suite/test_schema07.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+
+# test_schema07.py
+#    Test that long-running tests don't fill the cache with metadata
+class test_schema07(wttest.WiredTigerTestCase):
+    tablename = 'table:test_schema07'
+
+    def conn_config(self, dir):
+        return 'cache_size=10MB'
+
+    @wttest.longtest("Creating many tables shouldn't fill the cache")
+    def test_many_tables(self):
+        s = self.session
+        # We have a 10MB cache and metadata is (well) over 512B per table,
+        # so if we can create 20K tables, something must be cleaning up.
+        for i in xrange(20000):
+            uri = '%s-%06d' % (self.tablename, i)
+            s.create(uri)
+            c = s.open_cursor(uri)
+            # This will block if the metadata fills the cache
+            c["key"] = "value"
+            c.close()
+            self.session.drop(uri)
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py
index f996dbfa06d..bccd2bce012 100644
--- a/test/suite/test_sweep01.py
+++ b/test/suite/test_sweep01.py
@@ -40,7 +40,7 @@ import wttest
 class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
     tablebase = 'test_sweep01'
     uri = 'table:' + tablebase
-    numfiles = 50
+    numfiles = 30
     numkv = 1000
     conn_config = 'file_manager=(close_handle_minimum=0,' + \
                   'close_idle_time=6,close_scan_interval=2),' + \
@@ -87,7 +87,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
         #
         # We've configured checkpoints to run every 5 seconds, sweep server to
         # run every 2 seconds and idle time to be 6 seconds. It should take
-        # about 8 seconds for a handle to be closed. Sleep for 12 seconds to be
+        # about 8 seconds for a handle to be closed. Sleep for double to be
         # safe.
         #
         uri = '%s.test' % self.uri
@@ -105,13 +105,24 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
         c = self.session.open_cursor(uri, None)
         k = 0
         sleep = 0
-        while sleep < 12:
+        max = 60
+        final_nfile = 4
+        while sleep < max:
             self.session.checkpoint()
             k = k+1
             c[k] = 1
             sleep += 2
             time.sleep(2)
+            # Give slow machines time to process files.
+            stat_cursor = self.session.open_cursor('statistics:', None, None)
+            this_nfile = stat_cursor[stat.conn.file_open][2]
+            stat_cursor.close()
+            self.pr("==== loop " + str(sleep))
+            self.pr("this_nfile " + str(this_nfile))
+            if this_nfile == final_nfile:
+                break
         c.close()
+        self.pr("Sweep loop took " + str(sleep))
 
         stat_cursor = self.session.open_cursor('statistics:', None, None)
         close2 = stat_cursor[stat.conn.dh_sweep_close][2]
@@ -177,7 +188,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
         self.assertEqual(nfile2 < nfile1, True)
         # The only files that should be left are the metadata, the lookaside
         # file, the lock file, and the active file.
-        if (nfile2 != 4):
+        if (nfile2 != final_nfile):
             print "close1: " + str(close1) + " close2: " + str(close2)
             print "remove1: " + str(remove1) + " remove2: " + str(remove2)
             print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
@@ -186,7 +197,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
             print "tod1: " + str(tod1) + " tod2: " + str(tod2)
             print "ref1: " + str(ref1) + " ref2: " + str(ref2)
             print "XX2: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
-        self.assertEqual(nfile2 == 4, True)
+        self.assertEqual(nfile2 == final_nfile, True)
 
 if __name__ == '__main__':
     wttest.run()
diff --git a/test/suite/test_util13.py b/test/suite/test_util13.py
new file mode 100644
index 00000000000..222f42cd7f1
--- /dev/null
+++ b/test/suite/test_util13.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, re, string
+from suite_subprocess import suite_subprocess
+import itertools, wiredtiger, wttest
+
+from helper import complex_populate_cgconfig, complex_populate_cgconfig_lsm
+from helper import simple_populate
+from helper import complex_populate_check, simple_populate_check
+from wtscenario import multiply_scenarios, number_scenarios
+
+# test_util13.py
+#    Utilities: wt dump, as well as the dump cursor
+#    Test that dump and load retain table configuration information.
+#
+class test_util13(wttest.WiredTigerTestCase, suite_subprocess):
+    """
+    Test wt dump.  We check for specific output and preservation of
+    non-default table create parameters.
+    """
+
+    pfx = 'test_util13'
+    nentries = 100
+    dir = "dump_dir"
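+    # Each type uses a table configuration setting that is not the default.
+    # When cfg is non-empty it is written to the expected dump output in
+    # place of table_config (see test_dump_config).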
+    types = [
+        ('file-simple', dict(uri='file:' + pfx, pop=simple_populate,
+            populate_check=simple_populate_check,
+            table_config='prefix_compression_min=3', cfg='')),
+        ('lsm-simple', dict(uri='lsm:' + pfx, pop=simple_populate,
+            populate_check=simple_populate_check,
+            table_config='lsm=(bloom_bit_count=29)',
+            cfg='bloom_bit_count=29')),
+        ('table-simple', dict(uri='table:' + pfx, pop=simple_populate,
+            populate_check=simple_populate_check,
+            table_config='split_pct=50', cfg='')),
+        ('table-complex',
+            dict(uri='table:' + pfx, pop=complex_populate_cgconfig,
+            populate_check=complex_populate_check,
+            table_config='allocation_size=512B', cfg='')),
+        ('table-complex-lsm',
+            dict(uri='table:' + pfx, pop=complex_populate_cgconfig_lsm,
+            populate_check=complex_populate_check,
+            table_config='lsm=(merge_max=5)',
+            cfg='merge_max=5')),
+    ]
+
+    scenarios = number_scenarios(multiply_scenarios('.', types))
+
+    def compare_config(self, expected_cfg, actual_cfg):
+        # Return True when every key=value pair in expected_cfg also appears,
+        # with the same value, in actual_cfg; otherwise print both parsed
+        # forms and return False.  Each argument is one comma-separated
+        # configuration string.
+        #
+        # '(' is replaced with ',' before splitting so configuration groups
+        # don't break parsing: group members become ordinary pairs.  If we
+        # ever want to look for config groups this will need to change.
+        def to_dict(cfg):
+            pairs = cfg.strip().replace('(',',').split(',')
+            return dict(kv.split('=') for kv in pairs)
+
+        trimmed = actual_cfg
+        if self.pop != simple_populate:
+            # For a complex config, strip out the colgroups and columns
+            # lists first: their members have no assignments in them, so
+            # they would break the '=' split in to_dict.
+            trimmed = re.sub(r"colgroups=\((.+?)\),", '', trimmed)
+            trimmed = re.sub(r"columns=\((.+?)\),", '', trimmed)
+
+        # Parse both configurations into dictionaries.
+        actual = to_dict(trimmed)
+        expected = to_dict(expected_cfg)
+
+        # Check that all items in our expected config subset are in
+        # the actual configuration and they match.
+        match = all(item in actual.items() for item in expected.items())
+        if not match:
+            print "MISMATCH:"
+            print "Original dict: "
+            print actual
+            print "Expected config: "
+            print expected
+        return match
+
+    def compare_files(self, expect_subset, dump_out):
+        # True if each expected config line is a subset of the dumped one.
+        inheader = isconfig = False
+        with open(expect_subset, "rb") as f1, open(dump_out, "rb") as f2:
+            for l1, l2 in zip(f1, f2):
+                if isconfig and not self.compare_config(l1, l2):
+                    return False
+                if inheader:
+                    # Header lines alternate URI, config, URI, config, ...
+                    isconfig = not isconfig
+                if l1.strip() == 'Header':
+                    inheader = True
+                if l1.strip() == 'Data':
+                    break
+        return True
+
+    def load_recheck(self, expect_subset, dump_out):
+        # Load dump_out into a fresh home directory, dump the object again
+        # from there and compare the new dump against expect_subset.
+        newdump = "newdump.out"
+        os.mkdir(self.dir)
+        self.runWt(['-h', self.dir, 'load', '-f', dump_out])
+        # Check the contents
+        conn = self.wiredtiger_open(self.dir)
+        session = conn.open_session()
+        cursor = session.open_cursor(self.uri, None, None)
+        # NOTE(review): bare attribute reference, the checker is never called.
+        self.populate_check
+        conn.close()
+        redump_args = ["-h", self.dir, "dump", self.uri]
+        self.runWt(redump_args, outfilename=newdump)
+
+        self.assertTrue(self.compare_files(expect_subset, newdump))
+        return True
+
+    def test_dump_config(self):
+        # Populate an object created with a non-default setting, then check
+        # that the setting survives both a dump and a load followed by a
+        # second dump.
+        self.pop(self, self.uri,
+            'key_format=S,value_format=S,' + self.table_config, self.nentries)
+
+        ver = wiredtiger.wiredtiger_version()
+        verstring = str(ver[1]) + '.' + str(ver[2]) + '.' + str(ver[3])
+        expectfile="expect.out"
+        with open(expectfile, "w") as expectout:
+            # Note: this output is sensitive to the precise output format
+            # generated by wt dump.  If this is likely to change, we should
+            # make this test more accommodating.
+            expectout.write(
+                'WiredTiger Dump (WiredTiger Version ' + verstring + ')\n')
+            expectout.write('Format=print\n')
+            expectout.write('Header\n')
+            expectout.write(self.uri + '\n')
+            # Check the config on the colgroup itself for complex tables.
+            if self.pop != simple_populate:
+                expectout.write('key_format=S\n')
+                expectout.write('colgroup:' + self.pfx + ':cgroup1\n')
+            if self.cfg == '':
+                expectout.write(self.table_config + '\n')
+            else:
+                expectout.write(self.cfg + '\n')
+            expectout.write('Data\n')
+
+        self.pr('calling dump')
+        outfile="dump.out"
+        dumpargs = ["dump"]
+        dumpargs.append(self.uri)
+        self.runWt(dumpargs, outfilename=outfile)
+
+        self.assertTrue(self.compare_files(expectfile, outfile))
+        self.assertTrue(self.load_recheck(expectfile, outfile))
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py
index f181aeb09b4..c75e4f194dd 100644
--- a/tools/wtstats/stat_data.py
+++ b/tools/wtstats/stat_data.py
@@ -1,6 +1,7 @@
 # DO NOT EDIT: automatically built by dist/stat.py. */
 
 no_scale_per_second_list = [
+    'async: current work queue length',
     'async: maximum work queue length',
     'cache: bytes currently in the cache',
     'cache: eviction currently operating in aggressive mode',
@@ -36,6 +37,7 @@ no_scale_per_second_list = [
     'transaction: transaction range of IDs currently pinned by named snapshots',
     'block-manager: checkpoint size',
     'block-manager: file allocation unit size',
+    'block-manager: file bytes available for reuse',
     'block-manager: file magic number',
     'block-manager: file major version number',
     'block-manager: file size in bytes',
author	Alex Gorrod <alexg@wiredtiger.com>	2016-03-22 14:49:51 +1100
committer	Alex Gorrod <alexg@wiredtiger.com>	2016-03-22 14:49:51 +1100
commit	9cf8eb2f15c6df7da90c19c86ccf7516ed126183 (patch)
tree	dd8d22e7b881791e64cd8efaa9d0befb12b2ba84
parent	444981a456059f0652fd3bb1968d58d2c37b9089 (diff)
parent	18e6091d9c16bf46bc8d0750b2227ca71a559c33 (diff)
download	mongo-9cf8eb2f15c6df7da90c19c86ccf7516ed126183.tar.gz