diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2015-03-09 17:47:27 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2015-03-09 17:47:27 +1100 |
commit | 3a3bda539cdd34428b7489fa0fa102ff0605e8d8 (patch) | |
tree | fc901ad7b45300181356b8305ecb02fff13f4bfc | |
parent | 89f45aafdff48bf7c8e191b788a144cab0b86122 (diff) | |
parent | 0afa07b0cd666adf7576901540a699b0bec396e3 (diff) | |
download | mongo-3a3bda539cdd34428b7489fa0fa102ff0605e8d8.tar.gz |
Merge branch 'develop' into mongodb-3.0
Conflicts:
NEWS.MONGODB
72 files changed, 1612 insertions, 720 deletions
@@ -1,6 +1,6 @@ -WiredTiger 2.5.1: (March 9, 2015) +WiredTiger 2.5.2: (March 9, 2015) -This is version 2.5.1 of WiredTiger. +This is version 2.5.2 of WiredTiger. WiredTiger release packages and documentation can be found at: @@ -9,7 +9,7 @@ WiredTiger release packages and documentation can be found at: Information on configuring, building and installing WiredTiger can be found at: - http://source.wiredtiger.com/2.5.1/install.html + http://source.wiredtiger.com/2.5.2/install.html WiredTiger licensing information can be found at: diff --git a/RELEASE_INFO b/RELEASE_INFO index 6c7da8cb961..ac5ff8ac028 100644 --- a/RELEASE_INFO +++ b/RELEASE_INFO @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=2 WIREDTIGER_VERSION_MINOR=5 -WIREDTIGER_VERSION_PATCH=1 +WIREDTIGER_VERSION_PATCH=2 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index d37acef50e1..82feee58aa1 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -24,5 +24,6 @@ test/checkpoint test/fops test/format HAVE_BERKELEY_DB test/huge +test/packing test/salvage test/thread diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4 index 7f4d68e8b39..cbd389ea40d 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -2,8 +2,8 @@ dnl build by dist/s_version VERSION_MAJOR=2 VERSION_MINOR=5 -VERSION_PATCH=1 -VERSION_STRING='"WiredTiger 2.5.1: (March 9, 2015)"' +VERSION_PATCH=2 +VERSION_STRING='"WiredTiger 2.5.2: (March 9, 2015)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4 index 71598b276eb..340f77e5474 100644 --- a/build_posix/aclocal/version.m4 +++ b/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. 
Maintained by dist/s_version -2.5.1 +2.5.2 diff --git a/dist/flags.py b/dist/flags.py index a0e307debf6..f1eb6b24968 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -36,12 +36,11 @@ flags = { 'page_read' : [ 'READ_CACHE', 'READ_COMPACT', - 'READ_NO_GEN', 'READ_NO_EVICT', + 'READ_NO_GEN', 'READ_NO_WAIT', 'READ_PREV', 'READ_SKIP_INTL', - 'READ_SKIP_LEAF', 'READ_TRUNCATE', 'READ_WONT_NEED', ], @@ -88,15 +87,16 @@ flags = { 'conn' : [ 'CONN_CACHE_POOL', 'CONN_CKPT_SYNC', + 'CONN_CLOSING', 'CONN_EVICTION_RUN', 'CONN_LEAK_MEMORY', 'CONN_LOG_SERVER_RUN', 'CONN_LSM_MERGE', 'CONN_PANIC', - 'CONN_SERVER_RUN', 'CONN_SERVER_ASYNC', 'CONN_SERVER_CHECKPOINT', 'CONN_SERVER_LSM', + 'CONN_SERVER_RUN', 'CONN_SERVER_STATISTICS', 'CONN_SERVER_SWEEP', 'CONN_WAS_BACKUP', diff --git a/dist/package/wiredtiger.spec b/dist/package/wiredtiger.spec index ab762ef17fd..11eca316ffd 100644 --- a/dist/package/wiredtiger.spec +++ b/dist/package/wiredtiger.spec @@ -1,5 +1,5 @@ Name: wiredtiger -Version: 2.5.1 +Version: 2.5.2 Release: 1%{?dist} Summary: WiredTiger data storage engine diff --git a/dist/s_define.list b/dist/s_define.list index 91fbc971afa..4924a1935ae 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -49,6 +49,7 @@ WT_STAT_ATOMIC_DECR WT_STAT_ATOMIC_DECRV WT_STAT_ATOMIC_INCR WT_STAT_ATOMIC_INCRV +WT_STAT_DECR WT_STAT_DECRV WT_STAT_FAST_ATOMIC_DECR WT_STAT_FAST_ATOMIC_DECRV diff --git a/dist/s_string.ok b/dist/s_string.ok index 66439faf161..8b0335a6480 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -551,6 +551,7 @@ dest dev dhandle dhandles +dir dirlist dl dlclose @@ -1161,6 +1162,7 @@ wrapup writelock writeunlock wrlock +wrlsn ws wti wtperf diff --git a/dist/stat_data.py b/dist/stat_data.py index 5a42f2ff318..dd4d292c8b6 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -221,11 +221,14 @@ connection_stats = [ LogStat('log_prealloc_max', 'number of pre-allocated log files to create'), LogStat('log_prealloc_used', 'pre-allocated log files used'), 
LogStat('log_reads', 'log read operations'), + LogStat('log_release_write_lsn', 'log release advances write LSN'), LogStat('log_scan_records', 'records processed by log scan'), LogStat('log_scan_rereads', 'log scan records requiring two reads'), LogStat('log_scans', 'log scan operations'), LogStat('log_sync', 'log sync operations'), + LogStat('log_sync_dir', 'log sync_dir operations'), LogStat('log_writes', 'log write operations'), + LogStat('log_write_lsn', 'log server thread advances write LSN'), LogStat('log_slot_consolidated', 'logging bytes consolidated'), LogStat('log_slot_closes', 'consolidated slot closures'), diff --git a/examples/c/ex_pack.c b/examples/c/ex_pack.c index 19be35119af..c24805ade29 100644 --- a/examples/c/ex_pack.c +++ b/examples/c/ex_pack.c @@ -42,8 +42,6 @@ main(void) { WT_CONNECTION *conn; WT_SESSION *session; - char buf[50]; - size_t size; int i, j, k, ret; /* @@ -66,7 +64,11 @@ main(void) fprintf(stderr, "Error opening a session on %s: %s\n", home, wiredtiger_strerror(ret)); + { /*! [packing] */ + size_t size; + char buf[50]; + ret = wiredtiger_struct_size(session, &size, "iii", 42, 1000, -9); if (size > sizeof(buf)) { /* Allocate a bigger buffer. */ @@ -76,6 +78,7 @@ main(void) ret = wiredtiger_struct_unpack(session, buf, size, "iii", &i, &j, &k); /*! [packing] */ + } /* Note: closing the connection implicitly closes open session(s). 
*/ if ((ret = conn->close(conn, NULL)) != 0) diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 9c4ab05ce40..479f6547e42 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -221,9 +221,6 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) { int skip; - if (ref->state != WT_REF_DELETED) - return (0); - /* * Deleted pages come from two sources: either it's a fast-delete as * described above, or the page has been emptied by other operations diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 299849ad365..5b3624a4a2d 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -453,8 +453,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) ref->page = NULL; ref->addr = NULL; ref->state = WT_REF_DELETED; - WT_ERR(__wt_row_ikey_incr( - session, root, 0, "", 1, &ref->key.ikey)); + WT_ERR(__wt_row_ikey_incr(session, root, 0, "", 1, ref)); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -634,7 +633,7 @@ __btree_page_sizes(WT_SESSION_IMPL *session) WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage); cache_size = S2C(session)->cache_size; if (cache_size > 0) - btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2); + btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4); /* * Get the split percentage (reconciliation splits pages into smaller diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index b5140beb792..e177b05cd24 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -165,6 +165,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && + page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); @@ -611,7 +612,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) WT_ERR(__wt_row_ikey_incr(session, page, WT_PAGE_DISK_OFFSET(page, 
cell), - current->data, current->size, &ref->key.ikey)); + current->data, current->size, ref)); *sizep += sizeof(WT_IKEY) + current->size; break; diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 1cf616a2f6b..d6c20556a9a 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1858,8 +1858,7 @@ __slvg_row_build_internal( WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss)); } else { WT_ERR(__wt_row_ikey_incr(session, page, 0, - trk->row_start.data, trk->row_start.size, - &ref->key.ikey)); + trk->row_start.data, trk->row_start.size, ref)); WT_ERR(__slvg_ovfl_ref_all(session, trk)); } @@ -1981,8 +1980,8 @@ __slvg_row_build_leaf( */ rip = page->pg_row_d + skip_start; WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0)); - WT_ERR(__wt_row_ikey_incr(session, - ref->home, 0, key->data, key->size, &ref->key.ikey)); + WT_ERR(__wt_row_ikey_incr( + session, ref->home, 0, key->data, key->size, ref)); /* Set the referenced flag on overflow pages we're using. */ if (trk->trk_ovfl_cnt != 0) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 6ebd4609efa..95fb9c68a86 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -281,8 +281,8 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, if (parent->type == WT_PAGE_ROW_INT) { if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { __wt_ref_key(parent, ref, &key, &size); - WT_RET(__wt_row_ikey(session, 0, key, size, &ikey)); - ref->key.ikey = ikey; + WT_RET(__wt_row_ikey(session, 0, key, size, ref)); + ikey = ref->key.ikey; } else { WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); *parent_decrp += sizeof(WT_IKEY) + ikey->size; @@ -454,8 +454,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) ref->addr = NULL; if (parent->type == WT_PAGE_ROW_INT) { __wt_ref_key(parent, *parent_refp, &p, &size); - WT_ERR( - __wt_row_ikey(session, 0, p, size, &ref->key.ikey)); + WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); parent_incr += sizeof(WT_IKEY) + size; } else 
ref->key.recno = (*parent_refp)->key.recno; @@ -468,7 +467,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); - __wt_page_only_modify_set(session, child); + __wt_page_modify_set(session, child); /* * Once the split goes live, the newly created internal pages @@ -761,8 +760,8 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: ikey = multi->key.ikey; - WT_RET(__wt_row_ikey(session, 0, - WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey)); + WT_RET(__wt_row_ikey( + session, 0, WT_IKEY_DATA(ikey), ikey->size, ref)); incr += sizeof(WT_IKEY) + ikey->size; break; default: @@ -855,7 +854,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, for (i = 0, deleted_entries = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - if (__wt_delete_page_skip(session, next_ref) && + if (next_ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, next_ref) && WT_ATOMIC_CAS4(next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) deleted_entries++; @@ -1139,15 +1139,23 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* - * The first page in the split is the current page, but we still need to - * create a replacement WT_REF and make a copy of the key (the original - * WT_REF is set to split-status and eventually freed). - * - * The new reference is visible to readers once the split completes. + * The first page in the split is the current page, but we still have + * to create a replacement WT_REF, the original WT_REF will be set to + * split status and eventually freed. 
*/ WT_ERR(__wt_calloc_one(session, &split_ref[0])); child = split_ref[0]; *child = *ref; + + /* + * The new WT_REF is not quite identical: we have to instantiate a key, + * and the new reference is visible to readers once the split completes. + * + * The key-instantiation code checks for races, clear the key fields so + * we don't trigger them. + */ + child->key.recno = 0; + child->key.ikey = NULL; child->state = WT_REF_MEM; /* @@ -1167,8 +1175,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) } else WT_ERR(__wt_row_leaf_key( session, page, &page->pg_row_d[0], key, 1)); - WT_ERR(__wt_row_ikey( - session, 0, key->data, key->size, &child->key.ikey)); + WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); @@ -1187,7 +1194,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) child->state = WT_REF_MEM; WT_ERR(__wt_row_ikey(session, 0, WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), - &child->key.ikey)); + child)); parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); @@ -1203,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) /* The new page is dirty by definition. 
*/ WT_ERR(__wt_page_modify_init(session, right)); - __wt_page_only_modify_set(session, right); + __wt_page_modify_set(session, right); /* * We modified the page above, which will have set the first dirty diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 2e34a925f84..b550158a5a9 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -9,8 +9,9 @@ #include "wt_internal.h" static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); -static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *); -static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *); +static void __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *); +static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); +static void __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); /* * __wt_btree_stat_init -- @@ -89,18 +90,13 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) WT_STAT_INCRV(stats, btree_entries, pindex->entries); break; case WT_PAGE_COL_VAR: - WT_RET(__stat_page_col_var(page, stats)); - break; - case WT_PAGE_OVFL: - WT_STAT_INCR(stats, btree_overflow); + __stat_page_col_var(page, stats); break; case WT_PAGE_ROW_INT: - WT_STAT_INCR(stats, btree_row_internal); - pindex = WT_INTL_INDEX_COPY(page); - WT_STAT_INCRV(stats, btree_entries, pindex->entries); + __stat_page_row_int(session, page, stats); break; case WT_PAGE_ROW_LEAF: - WT_RET(__stat_page_row_leaf(page, stats)); + __stat_page_row_leaf(session, page, stats); break; WT_ILLEGAL_VALUE(session); } @@ -111,7 +107,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) * __stat_page_col_var -- * Stat a WT_PAGE_COL_VAR page. 
*/ -static int +static void __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) { WT_CELL *cell; @@ -119,29 +115,33 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; + uint64_t deleted_cnt, entry_cnt, ovfl_cnt; uint32_t i; int orig_deleted; unpack = &_unpack; + deleted_cnt = entry_cnt = ovfl_cnt = 0; WT_STAT_INCR(stats, btree_column_variable); /* - * Walk the page, counting regular and overflow data items, and checking - * to be sure any updates weren't deletions. If the item was updated, - * assume it was updated by an item of the same size (it's expensive to - * figure out if it will require the same space or not, especially if - * there's Huffman encoding). + * Walk the page counting regular items, adjusting if the item has been + * subsequently deleted or not. This is a mess because 10-item RLE might + * have 3 of the items subsequently deleted. Overflow items are harder, + * we can't know if an updated item will be an overflow item or not; do + * our best, and simply count every overflow item (or RLE set of items) + * we see. 
*/ WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { orig_deleted = 1; - WT_STAT_INCR(stats, btree_column_deleted); + ++deleted_cnt; } else { orig_deleted = 0; __wt_cell_unpack(cell, unpack); - WT_STAT_INCRV( - stats, btree_entries, __wt_cell_rle(unpack)); + entry_cnt += __wt_cell_rle(unpack); + if (unpack->ovfl) + ++ovfl_cnt; } /* @@ -151,57 +151,128 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { upd = ins->upd; if (WT_UPDATE_DELETED_ISSET(upd)) { - if (orig_deleted) - continue; - WT_STAT_INCR(stats, btree_column_deleted); - WT_STAT_DECR(stats, btree_entries); - } else { - if (!orig_deleted) - continue; - WT_STAT_DECR(stats, btree_column_deleted); - WT_STAT_INCR(stats, btree_entries); - } + if (!orig_deleted) { + ++deleted_cnt; + --entry_cnt; + } + } else + if (orig_deleted) { + --deleted_cnt; + ++entry_cnt; + } } } - return (0); + + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) + ++entry_cnt; + + WT_STAT_INCRV(stats, btree_column_deleted, deleted_cnt); + WT_STAT_INCRV(stats, btree_entries, entry_cnt); + WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); +} + +/* + * __stat_page_row_int -- + * Stat a WT_PAGE_ROW_INT page. + */ +static void +__stat_page_row_int( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_PAGE_INDEX *pindex; + uint32_t i, ovfl_cnt; + + btree = S2BT(session); + ovfl_cnt = 0; + + WT_STAT_INCR(stats, btree_row_internal); + + /* + * The number of entries tells us the number of items on row-store + * internal page. + */ + pindex = WT_INTL_INDEX_COPY(page); + WT_STAT_INCRV(stats, btree_entries, pindex->entries); + + /* + * Overflow keys are hard: we have to walk the disk image to count them, + * the in-memory representation of the page doesn't necessarily contain + * a reference to the original cell. 
+ */ + if (page->dsk != NULL) + WT_CELL_FOREACH(btree, page->dsk, cell, &unpack, i) { + __wt_cell_unpack(cell, &unpack); + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) + ++ovfl_cnt; + } + + WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); } /* * __stat_page_row_leaf -- * Stat a WT_PAGE_ROW_LEAF page. */ -static int -__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats) +static void +__stat_page_row_leaf( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) { + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; WT_INSERT *ins; WT_ROW *rip; WT_UPDATE *upd; - uint32_t cnt, i; + uint32_t entry_cnt, i, ovfl_cnt; + + btree = S2BT(session); + entry_cnt = ovfl_cnt = 0; WT_STAT_INCR(stats, btree_row_leaf); /* - * Stat any K/V pairs inserted into the page before the first from-disk + * Walk any K/V pairs inserted into the page before the first from-disk * key on the page. */ - cnt = 0; WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page)) if (!WT_UPDATE_DELETED_ISSET(ins->upd)) - ++cnt; + ++entry_cnt; - /* Stat the page's K/V pairs. */ + /* + * Walk the page's K/V pairs. Count overflow values, where an overflow + * item is any on-disk overflow item that hasn't been updated. + */ WT_ROW_FOREACH(page, rip, i) { upd = WT_ROW_UPDATE(page, rip); if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd)) - ++cnt; + ++entry_cnt; + if (upd == NULL && (cell = + __wt_row_leaf_value_cell(page, rip, NULL)) != NULL && + __wt_cell_type(cell) == WT_CELL_VALUE_OVFL) + ++ovfl_cnt; - /* Stat inserted K/V pairs. */ + /* Walk K/V pairs inserted after the on-page K/V pair. */ WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip)) if (!WT_UPDATE_DELETED_ISSET(ins->upd)) - ++cnt; + ++entry_cnt; } - WT_STAT_INCRV(stats, btree_entries, cnt); + /* + * Overflow keys are hard: we have to walk the disk image to count them, + * the in-memory representation of the page doesn't necessarily contain + * a reference to the original cell. 
+ */ + if (page->dsk != NULL) + WT_CELL_FOREACH(btree, page->dsk, cell, &unpack, i) { + __wt_cell_unpack(cell, &unpack); + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) + ++ovfl_cnt; + } - return (0); + WT_STAT_INCRV(stats, btree_entries, entry_cnt); + WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index d925eefc2fe..bc5d1051b1e 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -113,6 +113,13 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) if (walk == NULL) break; + page = walk->page; + mod = page->modify; + + /* Skip clean pages. */ + if (!__wt_page_is_modified(page)) + continue; + /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. @@ -125,23 +132,27 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. + * + * Mark the tree dirty: the checkpoint marked it clean + * and we can't skip future checkpoints until this page + * is written. 
*/ - page = walk->page; - mod = page->modify; - if (__wt_page_is_modified(page) && - (WT_PAGE_IS_INTERNAL(page) || - !F_ISSET(txn, TXN_HAS_SNAPSHOT) || - TXNID_LE(mod->first_dirty_txn, txn->snap_max))) { - if (WT_PAGE_IS_INTERNAL(page)) { - internal_bytes += - page->memory_footprint; - ++internal_pages; - } else { - leaf_bytes += page->memory_footprint; - ++leaf_pages; - } - WT_ERR(__wt_reconcile(session, walk, NULL, 0)); + if (!WT_PAGE_IS_INTERNAL(page) && + F_ISSET(txn, TXN_HAS_SNAPSHOT) && + TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { + __wt_page_modify_set(session, page); + continue; + } + + if (WT_PAGE_IS_INTERNAL(page)) { + internal_bytes += + page->memory_footprint; + ++internal_pages; + } else { + leaf_bytes += page->memory_footprint; + ++leaf_pages; } + WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; } diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 10dd5b12936..917e0c54a30 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -20,12 +20,11 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; - WT_REF *couple, *ref; - int descending, prev, skip; + WT_REF *couple, *couple_orig, *ref; + int prev, skip; uint32_t slot; btree = S2BT(session); - descending = 0; /* * Tree walks are special: they look inside page structures that splits @@ -79,7 +78,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ - couple = ref = *refp; + couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ @@ -102,29 +101,6 @@ ascend: /* /* Figure out the current slot in the WT_REF array. */ __wt_page_refp(session, ref, &pindex, &slot); - if (0) { -restart: /* - * The page we're moving to might have split, in which case find - * the last position we held. - * - * If we were starting a tree walk, begin again. 
- * - * If we were in the process of descending, repeat the descent. - * If we were moving within a single level of the tree, repeat - * the last move. - */ - ref = couple; - if (ref == &btree->root) { - ref = &btree->root; - if (ref->page == NULL) - goto done; - goto descend; - } - __wt_page_refp(session, ref, &pindex, &slot); - if (descending) - goto descend; - } - for (;;) { /* * If we're at the last/first slot on the page, return this page @@ -152,14 +128,11 @@ restart: /* /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. - * We don't handle a restart return because it - * would require additional complexity in the - * restart code (ascent code somewhat like the - * descent code already there), and it's not a - * possible return: we're moving to the parent - * of the current child, not another child of - * the same parent, there's no way our parent - * split. + * We don't handle restart or not-found returns. + * It would require additional complexity and is + * not a possible return: we're moving to the + * parent of the current child page, our parent + * reference can't have split or been evicted. */ __wt_page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( @@ -182,7 +155,7 @@ restart: /* if (walkcntp != NULL) ++*walkcntp; - for (descending = 0;;) { + for (;;) { ref = pindex->index[slot]; if (LF_ISSET(WT_READ_CACHE)) { @@ -198,7 +171,8 @@ restart: /* * Avoid pulling a deleted page back in to try * to delete it again. */ - if (__wt_delete_page_skip(session, ref)) + if (ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, ref)) break; /* * If deleting a range, try to delete the page @@ -232,26 +206,67 @@ restart: /* } } else { /* - * If iterating a cursor, try to skip deleted - * pages that are visible to us. + * Try to skip deleted pages visible to us. 
*/ - if (__wt_delete_page_skip(session, ref)) + if (ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, ref)) break; } ret = __wt_page_swap(session, couple, ref, flags); + + /* + * Not-found is an expected return when only walking + * in-cache pages. + */ if (ret == WT_NOTFOUND) { ret = 0; break; } - if (ret == WT_RESTART) - goto restart; + + /* + * The page we're moving to might have split, in which + * case move to the last position we held. + */ + if (ret == WT_RESTART) { + ret = 0; + + /* + * If a new walk that never coupled from the + * root to a new saved position in the tree, + * restart the walk. + */ + if (couple == &btree->root) { + ref = &btree->root; + if (ref->page == NULL) + goto done; + goto descend; + } + + /* + * If restarting from some original position, + * repeat the increment or decrement we made at + * that time. Otherwise, couple is an internal + * page we've acquired after moving from that + * starting position and we can treat it as a + * new page. This works because we never acquire + * a hazard pointer on a leaf page we're not + * going to return to our caller, this will quit + * work if that ever changes. + */ + WT_ASSERT(session, + couple == couple_orig || + WT_PAGE_IS_INTERNAL(couple->page)); + ref = couple; + __wt_page_refp(session, ref, &pindex, &slot); + if (couple == couple_orig) + break; + } WT_ERR(ret); /* - * Entering a new page: configure for traversal of any - * internal page's children, else return (or optionally - * skip), the leaf page. + * A new page: configure for traversal of any internal + * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; @@ -259,10 +274,7 @@ descend: couple = ref; page->type == WT_PAGE_COL_INT) { pindex = WT_INTL_INDEX_COPY(page); slot = prev ? 
pindex->entries - 1 : 0; - descending = 1; - } else if (LF_ISSET(WT_READ_SKIP_LEAF)) - goto ascend; - else { + } else { *refp = ref; goto done; } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 92cfd1e4273..f2868afe13a 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -439,7 +439,7 @@ next: switch (direction) { (void)__wt_row_leaf_key_info( page, copy, &ikey, &cell, NULL, NULL); if (ikey == NULL) { - WT_ERR(__wt_row_ikey(session, + WT_ERR(__wt_row_ikey_alloc(session, WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey)); @@ -462,15 +462,37 @@ err: __wt_scr_free(session, &tmp); } /* + * __wt_row_ikey_alloc -- + * Instantiate a key in a WT_IKEY structure. + */ +int +__wt_row_ikey_alloc(WT_SESSION_IMPL *session, + uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp) +{ + WT_IKEY *ikey; + + /* + * Allocate memory for the WT_IKEY structure and the key, then copy + * the key into place. + */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey)); + ikey->size = WT_STORE_SIZE(size); + ikey->cell_offset = cell_offset; + memcpy(WT_IKEY_DATA(ikey), key, size); + *ikeyp = ikey; + return (0); +} + +/* * __wt_row_ikey_incr -- * Instantiate a key in a WT_IKEY structure and increment the page's * memory footprint. 
*/ int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, - uint32_t cell_offset, const void *key, size_t size, void *ikeyp) + uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) { - WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp)); + WT_RET(__wt_row_ikey(session, cell_offset, key, size, ref)); __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size); @@ -483,19 +505,30 @@ __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, */ int __wt_row_ikey(WT_SESSION_IMPL *session, - uint32_t cell_offset, const void *key, size_t size, void *ikeyp) + uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) { WT_IKEY *ikey; + WT_RET(__wt_row_ikey_alloc(session, cell_offset, key, size, &ikey)); + +#ifdef HAVE_DIAGNOSTIC + { + uintptr_t oldv; + + oldv = (uintptr_t)ref->key.ikey; + WT_DIAGNOSTIC_YIELD; + /* - * Allocate memory for the WT_IKEY structure and the key, then copy - * the key into place. + * We should never overwrite an instantiated key, and we should + * never instantiate a key after a split. 
*/ - WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey)); - ikey->size = WT_STORE_SIZE(size); - ikey->cell_offset = cell_offset; - memcpy(WT_IKEY_DATA(ikey), key, size); - - *(WT_IKEY **)ikeyp = ikey; + WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0); + WT_ASSERT(session, ref->state != WT_REF_SPLIT); + WT_ASSERT(session, + WT_ATOMIC_CAS8(ref->key.ikey, (WT_IKEY *)oldv, ikey)); + } +#else + ref->key.ikey = ikey; +#endif return (0); } diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 0562f9cfc34..6b9824fc415 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -762,8 +762,7 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) WT_ERR(__conn_statistics_config(session, config_cfg)); WT_ERR(__wt_async_reconfig(session, config_cfg)); - WT_ERR(__wt_cache_config(session, config_cfg)); - WT_ERR(__wt_cache_pool_config(session, config_cfg)); + WT_ERR(__wt_cache_config(session, 1, config_cfg)); WT_ERR(__wt_checkpoint_server_create(session, config_cfg)); WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg)); WT_ERR(__wt_statlog_create(session, config_cfg)); diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index c513d46137c..4a7e15044de 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -9,33 +9,28 @@ #include "wt_internal.h" /* - * __wt_cache_config -- + * __cache_config_local -- * Configure the underlying cache. */ -int -__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]) +static int +__cache_config_local(WT_SESSION_IMPL *session, int shared, const char *cfg[]) { WT_CACHE *cache; WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; + uint32_t evict_workers_max, evict_workers_min; conn = S2C(session); cache = conn->cache; /* * If not using a shared cache configure the cache size, otherwise - * check for a reserved size. + * check for a reserved size. All other settings are independent of + * whether we are using a shared cache or not. 
*/ - if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) { + if (!shared) { WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval)); conn->cache_size = (uint64_t)cval.val; - } else { - WT_RET(__wt_config_gets( - session, cfg, "shared_cache.reserve", &cval)); - if (cval.val == 0) - WT_RET(__wt_config_gets( - session, cfg, "shared_cache.chunk", &cval)); - cache->cp_reserved = (uint64_t)cval.val; } WT_RET(__wt_config_gets(session, cfg, "cache_overhead", &cval)); @@ -57,16 +52,64 @@ __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval)); WT_ASSERT(session, cval.val > 0); - conn->evict_workers_max = (u_int)cval.val - 1; + evict_workers_max = (uint32_t)cval.val - 1; WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval)); WT_ASSERT(session, cval.val > 0); - conn->evict_workers_min = (u_int)cval.val - 1; + evict_workers_min = (uint32_t)cval.val - 1; - if (conn->evict_workers_min > conn->evict_workers_max) + if (evict_workers_min > evict_workers_max) WT_RET_MSG(session, EINVAL, "eviction=(threads_min) cannot be greater than " "eviction=(threads_max)"); + conn->evict_workers_max = evict_workers_max; + conn->evict_workers_min = evict_workers_min; + + return (0); +} + +/* + * __wt_cache_config -- + * Configure or reconfigure the current cache and shared cache. 
+ */ +int +__wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + int now_shared, was_shared; + + conn = S2C(session); + + WT_ASSERT(session, conn->cache != NULL); + + WT_RET(__wt_config_gets_none(session, cfg, "shared_cache.name", &cval)); + now_shared = cval.len != 0; + was_shared = F_ISSET(conn, WT_CONN_CACHE_POOL); + + /* Cleanup if reconfiguring */ + if (reconfigure && was_shared && !now_shared) + /* Remove ourselves from the pool if necessary */ + WT_RET(__wt_conn_cache_pool_destroy(session)); + else if (reconfigure && !was_shared && now_shared) + /* + * Cache size will now be managed by the cache pool - the + * start size always needs to be zero to allow the pool to + * manage how much memory is in-use. + */ + conn->cache_size = 0; + + /* + * Always setup the local cache - it's used even if we are + * participating in a shared cache. + */ + WT_RET(__cache_config_local(session, now_shared, cfg)); + if (now_shared) { + WT_RET(__wt_cache_pool_config(session, cfg)); + WT_ASSERT(session, F_ISSET(conn, WT_CONN_CACHE_POOL)); + if (!was_shared) + WT_RET(__wt_conn_cache_pool_open(session)); + } return (0); } @@ -84,19 +127,14 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); - WT_ASSERT(session, conn->cache == NULL || - (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL)); + WT_ASSERT(session, conn->cache == NULL); WT_RET(__wt_calloc_one(session, &conn->cache)); cache = conn->cache; /* Use a common routine for run-time configuration options. */ - WT_RET(__wt_cache_config(session, cfg)); - - /* Add the configured cache to the cache pool. 
*/ - if (F_ISSET(conn, WT_CONN_CACHE_POOL)) - WT_RET(__wt_conn_cache_pool_open(session)); + WT_RET(__wt_cache_config(session, 0, cfg)); /* * The target size must be lower than the trigger size or we will never diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index f5b78e33b04..7bf090496a8 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -36,17 +36,17 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) WT_CONNECTION_IMPL *conn, *entry; WT_DECL_RET; char *pool_name; - int created, reconfiguring; + int created, updating; uint64_t chunk, reserve, size, used_cache; conn = S2C(session); - created = reconfiguring = 0; + created = updating = 0; pool_name = NULL; cp = NULL; size = 0; if (F_ISSET(conn, WT_CONN_CACHE_POOL)) - reconfiguring = 1; + updating = 1; else { WT_RET(__wt_config_gets_none( session, cfg, "shared_cache.name", &cval)); @@ -81,7 +81,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) __wt_spin_lock(session, &__wt_process.spinlock); if (__wt_process.cache_pool == NULL) { - WT_ASSERT(session, !reconfiguring); + WT_ASSERT(session, !updating); /* Create a cache pool. */ WT_ERR(__wt_calloc_one(session, &cp)); created = 1; @@ -96,7 +96,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) __wt_process.cache_pool = cp; WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name)); - } else if (!reconfiguring && !WT_STRING_MATCH( + } else if (!updating && !WT_STRING_MATCH( __wt_process.cache_pool->name, pool_name, strlen(pool_name))) /* Only a single cache pool is supported. */ WT_ERR_MSG(session, WT_ERROR, @@ -109,7 +109,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) * The cache pool requires a reference count to avoid a race between * configuration/open and destroy. 
*/ - if (!reconfiguring) + if (!updating) ++cp->refs; /* @@ -157,7 +157,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) if (__wt_config_gets(session, &cfg[1], "shared_cache.reserve", &cval) == 0 && cval.val != 0) reserve = (uint64_t)cval.val; - else if (reconfiguring) + else if (updating) reserve = conn->cache->cp_reserved; else reserve = chunk; @@ -171,18 +171,23 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) used_cache += entry->cache->cp_reserved; } + /* Ignore our old allocation if reconfiguring */ + if (updating) + used_cache -= conn->cache->cp_reserved; if (used_cache + reserve > size) WT_ERR_MSG(session, EINVAL, "Shared cache unable to accommodate this configuration. " - "Shared cache size: %" PRIu64 ", reserved: %" PRIu64, + "Shared cache size: %" PRIu64 ", requested min: %" PRIu64, size, used_cache + reserve); /* The configuration is verified - it's safe to update the pool. */ cp->size = size; cp->chunk = chunk; + conn->cache->cp_reserved = reserve; + /* Wake up the cache pool server so any changes are noticed. */ - if (reconfiguring) + if (updating) WT_ERR(__wt_cond_signal( session, __wt_process.cache_pool->cache_pool_cond)); @@ -192,7 +197,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) F_SET(conn, WT_CONN_CACHE_POOL); err: __wt_spin_unlock(session, &__wt_process.spinlock); - if (!reconfiguring) + if (!updating) __wt_free(session, pool_name); if (ret != 0 && created) { __wt_free(session, cp->name); diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index a5512352f2c..7756158594c 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -375,6 +375,8 @@ __conn_btree_open( F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING)); + /* * If the handle is already open, it has to be closed so it can be * reopened with a new configuration. 
We don't need to check again: @@ -539,6 +541,48 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, } /* + * __wt_conn_btree_apply_single_ckpt -- + * Decode any checkpoint information from the configuration string then + * call btree apply single. + */ +int +__wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, + const char *uri, + int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + const char *checkpoint; + + checkpoint = NULL; + + /* + * This function exists to handle checkpoint configuration. Callers + * that never open a checkpoint call the underlying function directly. + */ + WT_RET_NOTFOUND_OK( + __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); + if (cval.len != 0) { + /* + * The internal checkpoint name is special, find the last + * unnamed checkpoint of the object. + */ + if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { + WT_RET(__wt_meta_checkpoint_last_name( + session, uri, &checkpoint)); + } else + WT_RET(__wt_strndup( + session, cval.str, cval.len, &checkpoint)); + } + + ret = __wt_conn_btree_apply_single(session, uri, checkpoint, func, cfg); + + __wt_free(session, checkpoint); + + return (ret); +} + +/* * __wt_conn_btree_apply_single -- * Apply a function to a single btree handle that couldn't be locked * (attempting to get the handle returned EBUSY). 
@@ -580,10 +624,10 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, ret = func(session, cfg)); } __wt_spin_unlock(session, &dhandle->close_lock); - WT_ERR(ret); + WT_RET(ret); } -err: return (ret); + return (0); } /* @@ -683,20 +727,25 @@ __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final) { WT_DATA_HANDLE *dhandle; WT_DECL_RET; + int tret; dhandle = session->dhandle; if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) { - ret = __wt_conn_btree_sync_and_close(session, 0); - if (!final) - WT_RET(ret); + tret = __wt_conn_btree_sync_and_close(session, 0); + if (final && tret != 0) { + __wt_err(session, tret, + "Final close of %s failed", dhandle->name); + WT_TRET(tret); + } else if (!final) + WT_RET(tret); } /* * Kludge: interrupt the eviction server in case it is holding the * handle list lock. */ - F_SET(S2C(session)->cache, WT_EVICT_CLEAR_WALKS); + F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS); /* Try to remove the handle, protected by the data handle lock. */ WT_WITH_DHANDLE_LOCK(session, diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 36d4d539d92..315e93c1875 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -347,6 +347,124 @@ err: __wt_err(session, ret, "log close server error"); } /* + * Simple structure for sorting written slots. + */ +typedef struct { + WT_LSN lsn; + uint32_t slot_index; +} WT_LOG_WRLSN_ENTRY; + +/* + * __log_wrlsn_cmp -- + * The log wrlsn comparison function for qsort. + */ +static int +__log_wrlsn_cmp(const void *a, const void *b) +{ + WT_LOG_WRLSN_ENTRY *ae, *be; + + ae = (WT_LOG_WRLSN_ENTRY *)a; + be = (WT_LOG_WRLSN_ENTRY *)b; + return (LOG_CMP(&ae->lsn, &be->lsn)); +} + +/* + * __log_wrlsn_server -- + * The log wrlsn server thread. 
+ */ +static void * +__log_wrlsn_server(void *arg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOG_WRLSN_ENTRY written[SLOT_POOL]; + WT_LOGSLOT *slot; + WT_SESSION_IMPL *session; + size_t written_i; + uint32_t i, save_i; + int yield; + + session = arg; + conn = S2C(session); + log = conn->log; + yield = 0; + while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + /* + * No need to use the log_slot_lock because the slot pool + * is statically allocated and any slot in the + * WT_LOG_SLOT_WRITTEN state is exclusively ours for now. + */ + i = 0; + written_i = 0; + /* + * Walk the array once saving any slots that are in the + * WT_LOG_SLOT_WRITTEN state. + */ + while (i < SLOT_POOL) { + save_i = i; + slot = &log->slot_pool[i++]; + if (slot->slot_state != WT_LOG_SLOT_WRITTEN) + continue; + written[written_i].slot_index = save_i; + written[written_i++].lsn = slot->slot_release_lsn; + } + /* + * If we found any written slots process them. We sort them + * based on the release LSN, and then look for them in order. + */ + if (written_i > 0) { + yield = 0; + qsort(written, written_i, sizeof(WT_LOG_WRLSN_ENTRY), + __log_wrlsn_cmp); + /* + * We know the written array is sorted by LSN. Go + * through them either advancing write_lsn or stop + * as soon as one is not in order. + */ + for (i = 0; i < written_i; i++) { + if (LOG_CMP(&log->write_lsn, + &written[i].lsn) != 0) + break; + /* + * If we get here we have a slot to process. + * Advance the LSN and process the slot. + */ + slot = &log->slot_pool[written[i].slot_index]; + WT_ASSERT(session, LOG_CMP(&written[i].lsn, + &slot->slot_release_lsn) == 0); + log->write_lsn = slot->slot_end_lsn; + WT_ERR(__wt_cond_signal(session, + log->log_write_cond)); + WT_STAT_FAST_CONN_INCR(session, log_write_lsn); + + /* + * Signal the close thread if needed. 
+ */ + if (F_ISSET(slot, SLOT_CLOSEFH)) + WT_ERR(__wt_cond_signal(session, + conn->log_close_cond)); + WT_ERR(__wt_log_slot_free(session, slot)); + } + } + /* + * If we saw a later write, we always want to yield because + * we know something is in progress. + */ + if (yield++ < 1000) + __wt_yield(); + else + /* Wait until the next event. */ + WT_ERR(__wt_cond_wait(session, + conn->log_wrlsn_cond, 100000)); + } + + if (0) +err: __wt_err(session, ret, "log wrlsn server error"); + return (NULL); +} + +/* * __log_server -- * The log server thread. */ @@ -479,12 +597,24 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) "log close server", 0, &conn->log_close_cond)); /* - * Start the thread. + * Start the log file close thread. */ WT_RET(__wt_thread_create(conn->log_close_session, &conn->log_close_tid, __log_close_server, conn->log_close_session)); conn->log_close_tid_set = 1; + /* + * Start the log write LSN thread. It is not configurable. + * If logging is enabled, this thread runs. + */ + WT_RET(__wt_open_internal_session( + conn, "log-wrlsn-server", 0, 0, &conn->log_wrlsn_session)); + WT_RET(__wt_cond_alloc(conn->log_wrlsn_session, + "log write lsn server", 0, &conn->log_wrlsn_cond)); + WT_RET(__wt_thread_create(conn->log_wrlsn_session, + &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); + conn->log_wrlsn_tid_set = 1; + /* If no log thread services are configured, we're done. 
*/ if (!FLD_ISSET(conn->log_flags, (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC))) @@ -557,6 +687,17 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_TRET(wt_session->close(wt_session, NULL)); conn->log_close_session = NULL; } + if (conn->log_wrlsn_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond)); + WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); + conn->log_wrlsn_tid_set = 0; + } + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + if (conn->log_wrlsn_session != NULL) { + wt_session = &conn->log_wrlsn_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + conn->log_wrlsn_session = NULL; + } WT_TRET(__wt_log_close(session)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index d4f6cf4869c..0a3d35ac0b1 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -55,9 +55,6 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) */ WT_WRITE_BARRIER(); - /* Connect to a cache pool. */ - WT_RET(__wt_cache_pool_config(session, cfg)); - /* Create the cache. 
*/ WT_RET(__wt_cache_create(session, cfg)); @@ -113,6 +110,9 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_CLR(conn, WT_CONN_SERVER_RUN); WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); + + F_SET(conn, WT_CONN_CLOSING); + WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, 1)); WT_TRET(__wt_sweep_destroy(session)); diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 67814dc330b..c38e0ef125f 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -205,7 +205,7 @@ __statlog_apply(WT_SESSION_IMPL *session, const char *cfg[]) if (WT_PREFIX_MATCH(dhandle->name, *p)) { WT_WITHOUT_DHANDLE(session, ret = __statlog_dump(session, dhandle->name, 0)); - WT_RET(ret); + return (ret); } return (0); } diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index abc6a106cc9..bf086bcc813 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -26,7 +26,7 @@ __curindex_get_value(WT_CURSOR *cursor, ...) WT_CURSOR_NEEDVALUE(cursor); va_start(ap, cursor); - if (F_ISSET(cursor, WT_CURSTD_RAW)) { + if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { ret = __wt_schema_project_merge(session, cindex->cg_cursors, cindex->value_plan, cursor->value_format, &cursor->value); diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index bebce217a6a..74b998876c2 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -354,7 +354,6 @@ __curstat_file_init(WT_SESSION_IMPL *session, /* Release the handle, we're done with it. 
*/ WT_TRET(__wt_session_release_btree(session)); - WT_RET(ret); return (ret); } diff --git a/src/docs/spell.ok b/src/docs/spell.ok index f333a8fff58..df31a272361 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -87,6 +87,7 @@ ack'ed ajn alloc allocator +allocators allocsize ao api diff --git a/src/docs/tune-memory-allocator.dox b/src/docs/tune-memory-allocator.dox index ad052bc4ec3..a619708f816 100644 --- a/src/docs/tune-memory-allocator.dox +++ b/src/docs/tune-memory-allocator.dox @@ -10,4 +10,9 @@ Google's tcmalloc</a>, or <a href="http://www.canonware.com/jemalloc">FreeBSD's jemalloc</a>), can dramatically improve throughput. +As different memory allocators have different overhead and different +workloads will have different heap allocation sizes and patterns, +applications may need to set their allocator overhead using the +\c cache_overhead configuration to the wiredtiger_open:: call. + */ diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 9e39fcc7a2c..1030c0aa818 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -72,6 +72,17 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { + case WT_SYNC_DISCARD: + /* + * Check that the page is clean: if we see a dirty page + * (including a dirty parent page after evicting a + * child), give up. The higher level can try to + * checkpoint, but during discard we aren't set up to + * manage checkpoints. + */ + if (__wt_page_is_modified(page)) + WT_ERR(EBUSY); + /* FALLTHROUGH */ case WT_SYNC_CLOSE: /* * Evict the page. @@ -84,29 +95,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) !F_ISSET(page->modify, WT_PM_REC_EMPTY)) WT_ERR(__wt_evict(session, ref, 1)); break; - case WT_SYNC_DISCARD: - /* - * Ordinary discard of the page, whether clean or dirty. - * If we see a dirty page in an ordinary discard (e.g., - * from sweep), give up: an update must have happened - * since the file was selected for sweeping. 
- */ - if (__wt_page_is_modified(page)) - WT_ERR(EBUSY); - - /* - * If the page contains an update that is too recent to - * evict, stop. This should never happen during - * connection close, but in other paths our caller - * should be prepared to deal with this case. - */ - if (page->modify != NULL && - !__wt_txn_visible_all(session, - page->modify->rec_max_txn)) - WT_ERR(EBUSY); - - __wt_evict_page_clean_update(session, ref); - break; case WT_SYNC_DISCARD_FORCE: /* * Forced discard of the page, whether clean or dirty. diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index e3d8ea6a4e0..640c9b0541d 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -194,6 +194,17 @@ __evict_server(void *arg) ret = 0; } } + /* + * Clear the walks so we don't pin pages while asleep, + * otherwise we can block applications evicting large pages. + */ + if (!F_ISSET(cache, WT_CACHE_STUCK)) { + WT_ERR(__evict_clear_walks(session)); + + /* Next time we wake up, reverse the sweep direction. */ + cache->flags ^= WT_CACHE_WALK_REVERSE; + } + WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping")); /* Don't rely on signals: check periodically. 
*/ WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000)); @@ -237,7 +248,7 @@ __evict_workers_resize(WT_SESSION_IMPL *session) WT_DECL_RET; WT_EVICT_WORKER *workers; size_t alloc; - u_int i; + uint32_t i; conn = S2C(session); @@ -321,7 +332,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) WT_DECL_RET; WT_EVICT_WORKER *workers; WT_SESSION *wt_session; - u_int i; + uint32_t i; conn = S2C(session); cache = conn->cache; @@ -432,17 +443,17 @@ __evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp) (cache->eviction_dirty_target * bytes_max) / 100) /* Ignore clean pages unless the cache is too large */ LF_SET(WT_EVICT_PASS_DIRTY); - else if (F_ISSET(cache, WT_EVICT_WOULD_BLOCK)) { + else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { /* * Evict pages with oldest generation (which would otherwise * block application threads) set regardless of whether we have * reached the eviction trigger. */ LF_SET(WT_EVICT_PASS_WOULD_BLOCK); - F_CLR(cache, WT_EVICT_WOULD_BLOCK); + F_CLR(cache, WT_CACHE_WOULD_BLOCK); } - if (F_ISSET(cache, WT_EVICT_STUCK)) + if (F_ISSET(cache, WT_CACHE_STUCK)) LF_SET(WT_EVICT_PASS_AGGRESSIVE); *flagsp = flags; @@ -475,8 +486,8 @@ __evict_pass(WT_SESSION_IMPL *session) * If there is a request to clear eviction walks, do that now, * before checking if the cache is full. */ - if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) { - F_CLR(cache, WT_EVICT_CLEAR_WALKS); + if (F_ISSET(cache, WT_CACHE_CLEAR_WALKS)) { + F_CLR(cache, WT_CACHE_CLEAR_WALKS); WT_RET(__evict_clear_walks(session)); WT_RET(__wt_cond_signal( session, cache->evict_waiter_cond)); @@ -493,7 +504,8 @@ __evict_pass(WT_SESSION_IMPL *session) * Start a worker if we have capacity and we haven't reached * the eviction targets. 
*/ - if (LF_ISSET(WT_EVICT_PASS_ALL | WT_EVICT_PASS_DIRTY) && + if (LF_ISSET(WT_EVICT_PASS_ALL | + WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "Starting evict worker: %"PRIu32"\n", @@ -527,10 +539,8 @@ __evict_pass(WT_SESSION_IMPL *session) * handles. */ __wt_sleep(0, 1000 * (long)loop); - if (F_ISSET(cache, WT_EVICT_STUCK)) - break; if (loop == 100) { - F_SET(cache, WT_EVICT_STUCK); + F_SET(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR( session, cache_eviction_slow); WT_RET(__wt_verbose( @@ -605,7 +615,7 @@ __evict_tree_walk_clear(WT_SESSION_IMPL *session) F_SET(session, WT_SESSION_CLEAR_EVICT_WALK); while (btree->evict_ref != NULL && ret == 0) { - F_SET(cache, WT_EVICT_CLEAR_WALKS); + F_SET(cache, WT_CACHE_CLEAR_WALKS); ret = __wt_cond_wait( session, cache->evict_waiter_cond, 100000); } @@ -792,21 +802,29 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) WT_ASSERT(session, cache->evict[0].ref != NULL); - /* Find the bottom 25% of read generations. */ - cutoff = (3 * __evict_read_gen(&cache->evict[0]) + - __evict_read_gen(&cache->evict[entries - 1])) / 4; - - /* - * Don't take less than 10% or more than 50% of entries, regardless. - * That said, if there is only one entry, which is normal when - * populating an empty file, don't exclude it. - */ - for (candidates = 1 + entries / 10; - candidates < entries / 2; - candidates++) - if (__evict_read_gen(&cache->evict[candidates]) > cutoff) - break; - cache->evict_candidates = candidates; + if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + /* + * Take all candidates if we only gathered pages with an oldest + * read generation set. + */ + cache->evict_candidates = entries; + else { + /* Find the bottom 25% of read generations. 
*/ + cutoff = (3 * __evict_read_gen(&cache->evict[0]) + + __evict_read_gen(&cache->evict[entries - 1])) / 4; + /* + * Don't take less than 10% or more than 50% of entries, + * regardless. That said, if there is only one entry, which is + * normal when populating an empty file, don't exclude it. + */ + for (candidates = 1 + entries / 10; + candidates < entries / 2; + candidates++) + if (__evict_read_gen( + &cache->evict[candidates]) > cutoff) + break; + cache->evict_candidates = candidates; + } /* If we have more than the minimum number of entries, clear them. */ if (cache->evict_entries > WT_EVICT_WALK_BASE) { @@ -907,7 +925,7 @@ retry: while (slot < max_entries && ret == 0) { * If another thread is waiting on the eviction server to clear * the walk point in a tree, give up. */ - if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) + if (F_ISSET(cache, WT_CACHE_CLEAR_WALKS)) break; /* @@ -917,7 +935,7 @@ retry: while (slot < max_entries && ret == 0) { if (!dhandle_locked) { for (spins = 0; (ret = __wt_spin_trylock( session, &conn->dhandle_lock, &id)) == EBUSY && - !F_ISSET(cache, WT_EVICT_CLEAR_WALKS); + !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); spins++) { if (spins < 1000) __wt_yield(); @@ -1029,7 +1047,7 @@ retry: while (slot < max_entries && ret == 0) { * candidates and we aren't finding more. Take care not to skip files * on subsequent passes. 
*/ - if (!F_ISSET(cache, WT_EVICT_CLEAR_WALKS) && ret == 0 && + if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 && slot < max_entries && (retries < 2 || (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 && (slot == cache->evict_entries || slot > start_slot)))) { @@ -1096,8 +1114,11 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) cache->evict + cache->evict_slots); enough = internal_pages = restarts = 0; - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | + WT_READ_NO_GEN | WT_READ_NO_WAIT; + + if (F_ISSET(cache, WT_CACHE_WALK_REVERSE)) + walk_flags |= WT_READ_PREV; /* * Get some more eviction candidate pages. @@ -1181,7 +1202,7 @@ fast: /* If the page can't be evicted, give up. */ */ mod = page->modify; if (!modified && mod != NULL && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_WOULD_BLOCK) && + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && !__wt_txn_visible_all(session, mod->rec_max_txn)) continue; @@ -1355,8 +1376,8 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server) WT_RET(ret); cache = S2C(session)->cache; - if (F_ISSET(cache, WT_EVICT_STUCK)) - F_CLR(cache, WT_EVICT_STUCK); + if (F_ISSET(cache, WT_CACHE_STUCK)) + F_CLR(cache, WT_CACHE_STUCK); return (ret); } @@ -1400,9 +1421,9 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full) * abort the transaction to give up all hazard pointers before * trying again. 
*/ - if (F_ISSET(cache, WT_EVICT_STUCK) && + if (F_ISSET(cache, WT_CACHE_STUCK) && __wt_txn_am_oldest(session)) { - F_CLR(cache, WT_EVICT_STUCK); + F_CLR(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR(session, txn_fail_cache); return (WT_ROLLBACK); } diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 9ba1af897a4..892d5b4ac60 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -130,8 +130,8 @@ done: session->excl_next = 0; txn_state->snap_min = WT_TXN_NONE; if ((inmem_split || (forced_eviction && ret == EBUSY)) && - !F_ISSET(conn->cache, WT_EVICT_WOULD_BLOCK)) { - F_SET(conn->cache, WT_EVICT_WOULD_BLOCK); + !F_ISSET(conn->cache, WT_CACHE_WOULD_BLOCK)) { + F_SET(conn->cache, WT_CACHE_WOULD_BLOCK); WT_TRET(__wt_evict_server_wake(session)); } diff --git a/src/include/btmem.h b/src/include/btmem.h index 91d0d1eb654..101fd450fc7 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -442,8 +442,6 @@ struct __wt_page { /* Row-store leaf page. */ struct { - WT_ROW *d; /* Key/value pairs */ - /* * The column-store leaf page modification structures * live in the WT_PAGE_MODIFY structure to keep the @@ -457,6 +455,7 @@ struct __wt_page { WT_INSERT_HEAD **ins; /* Inserts */ WT_UPDATE **upd; /* Updates */ + WT_ROW *d; /* Key/value pairs */ uint32_t entries; /* Entries */ } row; #undef pg_row_d @@ -510,11 +509,31 @@ struct __wt_page { #define pg_var_entries u.col_var.entries } u; - /* Page's on-disk representation: NULL for pages created in memory. */ - const WT_PAGE_HEADER *dsk; + /* + * The page's type and flags are positioned at the end of the WT_PAGE + * union, it reduces cache misses in the row-store search function. 
+ */ +#define WT_PAGE_IS_INTERNAL(page) \ + ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) +#define WT_PAGE_INVALID 0 /* Invalid page */ +#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */ +#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */ +#define WT_PAGE_COL_INT 3 /* Col-store internal page */ +#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */ +#define WT_PAGE_OVFL 5 /* Overflow page */ +#define WT_PAGE_ROW_INT 6 /* Row-store internal page */ +#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */ + uint8_t type; /* Page type */ - /* If/when the page is modified, we need lots more information. */ - WT_PAGE_MODIFY *modify; +#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */ +#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ +#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ +#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ +#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ +#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ +#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ +#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ + uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* * The page's read generation acts as an LRU value for each page in the @@ -539,27 +558,11 @@ struct __wt_page { size_t memory_footprint; /* Memory attached to the page */ -#define WT_PAGE_IS_INTERNAL(page) \ - ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) -#define WT_PAGE_INVALID 0 /* Invalid page */ -#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */ -#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */ -#define WT_PAGE_COL_INT 3 /* Col-store internal page */ -#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */ -#define WT_PAGE_OVFL 5 /* Overflow page */ -#define WT_PAGE_ROW_INT 6 /* Row-store internal page */ -#define WT_PAGE_ROW_LEAF 7 /* 
Row-store leaf page */ - uint8_t type; /* Page type */ + /* Page's on-disk representation: NULL for pages created in memory. */ + const WT_PAGE_HEADER *dsk; -#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */ -#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ -#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ -#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ -#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ - uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ + /* If/when the page is modified, we need lots more information. */ + WT_PAGE_MODIFY *modify; }; /* diff --git a/src/include/btree.i b/src/include/btree.i index 032178b4755..56fb66abaef 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -404,7 +404,7 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_page_parent_modify_set -- - * Mark the parent page and tree dirty. + * Mark the parent page, and optionally the tree, dirty. */ static inline int __wt_page_parent_modify_set( @@ -957,6 +957,10 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits) if (mod == NULL) return (1); + /* Skip pages that are already being evicted. 
*/ + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + return (0); + /* * If the tree was deepened, there's a requirement that newly created * internal pages not be evicted until all threads are known to have diff --git a/src/include/cache.h b/src/include/cache.h index 84b18082a25..8ed3176492f 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -118,9 +118,10 @@ struct __wt_cache { */ #define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */ #define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */ -#define WT_EVICT_CLEAR_WALKS 0x04 /* Clear eviction walks */ -#define WT_EVICT_STUCK 0x08 /* Eviction server is stuck */ -#define WT_EVICT_WOULD_BLOCK 0x10 /* Pages that would block apps */ +#define WT_CACHE_CLEAR_WALKS 0x04 /* Clear eviction walks */ +#define WT_CACHE_STUCK 0x08 /* Eviction server is stuck */ +#define WT_CACHE_WALK_REVERSE 0x10 /* Scan backwards for candidates */ +#define WT_CACHE_WOULD_BLOCK 0x20 /* Pages that would block apps */ uint32_t flags; }; diff --git a/src/include/connection.h b/src/include/connection.h index 9cb42ae7c80..78b2949ab98 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -320,6 +320,10 @@ struct __wt_connection_impl { WT_SESSION_IMPL *log_close_session;/* Log close thread session */ wt_thread_t log_close_tid; /* Log close thread thread */ int log_close_tid_set;/* Log close thread set */ + WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ + WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ + wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ + int log_wrlsn_tid_set;/* Log write lsn thread set */ WT_LOG *log; /* Logging structure */ WT_COMPRESSOR *log_compressor;/* Logging compressor */ wt_off_t log_file_max; /* Log file max size */ diff --git a/src/include/error.h b/src/include/error.h index b732776badf..efc1617fcd3 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -11,11 +11,11 @@ /* In DIAGNOSTIC mode, yield in places where we 
want to encourage races. */ #ifdef HAVE_DIAGNOSTIC -#define WT_HAVE_DIAGNOSTIC_YIELD do { \ +#define WT_DIAGNOSTIC_YIELD do { \ __wt_yield(); \ } while (0) #else -#define WT_HAVE_DIAGNOSTIC_YIELD +#define WT_DIAGNOSTIC_YIELD #endif /* Set "ret" and branch-to-err-label tests. */ diff --git a/src/include/extern.h b/src/include/extern.h index 5d3ee5bc8f8..bddbb5e01eb 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -163,8 +163,9 @@ extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *lea extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate); -extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp); -extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp); +extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp); +extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); +extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep); @@ -206,7 +207,7 @@ extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session); extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *config, WT_EXTRACTOR 
**extractorp, int *ownp); extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session); extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]); -extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[]); extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]); extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); extern int __wt_cache_destroy(WT_SESSION_IMPL *session); @@ -221,6 +222,7 @@ extern int __wt_conn_dhandle_find(WT_SESSION_IMPL *session, const char *name, co extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force); extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *cfg[], uint32_t flags); extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force); extern int __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final); @@ -349,7 +351,7 @@ extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); -extern int __wt_log_slot_free(WT_LOGSLOT *slot); +extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int 
__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); diff --git a/src/include/flags.h b/src/include/flags.h index 9664fce3f9f..30b2ab1c0e3 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -4,18 +4,19 @@ */ #define WT_CONN_CACHE_POOL 0x00000001 #define WT_CONN_CKPT_SYNC 0x00000002 -#define WT_CONN_EVICTION_RUN 0x00000004 -#define WT_CONN_LEAK_MEMORY 0x00000008 -#define WT_CONN_LOG_SERVER_RUN 0x00000010 -#define WT_CONN_LSM_MERGE 0x00000020 -#define WT_CONN_PANIC 0x00000040 -#define WT_CONN_SERVER_ASYNC 0x00000080 -#define WT_CONN_SERVER_CHECKPOINT 0x00000100 -#define WT_CONN_SERVER_LSM 0x00000200 -#define WT_CONN_SERVER_RUN 0x00000400 -#define WT_CONN_SERVER_STATISTICS 0x00000800 -#define WT_CONN_SERVER_SWEEP 0x00001000 -#define WT_CONN_WAS_BACKUP 0x00002000 +#define WT_CONN_CLOSING 0x00000004 +#define WT_CONN_EVICTION_RUN 0x00000008 +#define WT_CONN_LEAK_MEMORY 0x00000010 +#define WT_CONN_LOG_SERVER_RUN 0x00000020 +#define WT_CONN_LSM_MERGE 0x00000040 +#define WT_CONN_PANIC 0x00000080 +#define WT_CONN_SERVER_ASYNC 0x00000100 +#define WT_CONN_SERVER_CHECKPOINT 0x00000200 +#define WT_CONN_SERVER_LSM 0x00000400 +#define WT_CONN_SERVER_RUN 0x00000800 +#define WT_CONN_SERVER_STATISTICS 0x00001000 +#define WT_CONN_SERVER_SWEEP 0x00002000 +#define WT_CONN_WAS_BACKUP 0x00004000 #define WT_EVICTING 0x00000001 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 @@ -36,9 +37,8 @@ #define WT_READ_NO_WAIT 0x00000010 #define WT_READ_PREV 0x00000020 #define WT_READ_SKIP_INTL 0x00000040 -#define WT_READ_SKIP_LEAF 0x00000080 -#define WT_READ_TRUNCATE 0x00000100 -#define WT_READ_WONT_NEED 0x00000200 +#define WT_READ_TRUNCATE 0x00000080 +#define WT_READ_WONT_NEED 0x00000100 #define 
WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_DISCARD_FORCE 0x00000004 diff --git a/src/include/log.h b/src/include/log.h index 82d90070609..760321d9abb 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -59,17 +59,21 @@ /* * Possible values for the consolidation array slot states: + * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.) + * * < WT_LOG_SLOT_DONE - threads are actively writing to the log. * WT_LOG_SLOT_DONE - all activity on this slot is complete. * WT_LOG_SLOT_FREE - slot is available for allocation. * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active. + * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker. * WT_LOG_SLOT_READY - slot is ready for threads to join. * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot. */ #define WT_LOG_SLOT_DONE 0 #define WT_LOG_SLOT_FREE 1 #define WT_LOG_SLOT_PENDING 2 -#define WT_LOG_SLOT_READY 3 +#define WT_LOG_SLOT_WRITTEN 3 +#define WT_LOG_SLOT_READY 4 typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { int64_t slot_state; /* Slot state */ uint64_t slot_group_size; /* Group size */ @@ -92,9 +96,11 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { uint32_t flags; /* Flags */ } WT_LOGSLOT; +#define SLOT_INIT_FLAGS (SLOT_BUFFERED) + typedef struct { WT_LOGSLOT *slot; - wt_off_t offset; + wt_off_t offset; } WT_MYSLOT; /* Offset of first record */ diff --git a/src/include/stat.h b/src/include/stat.h index 3f684478358..21eaff0677f 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -215,6 +215,7 @@ struct __wt_connection_stats { WT_STATS log_prealloc_max; WT_STATS log_prealloc_used; WT_STATS log_reads; + WT_STATS log_release_write_lsn; WT_STATS log_scan_records; WT_STATS log_scan_rereads; WT_STATS log_scans; @@ -227,6 +228,8 @@ struct __wt_connection_stats { WT_STATS log_slot_toosmall; WT_STATS log_slot_transitions; WT_STATS log_sync; + WT_STATS 
log_sync_dir; + WT_STATS log_write_lsn; WT_STATS log_writes; WT_STATS lsm_checkpoint_throttle; WT_STATS lsm_merge_throttle; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index d0d0f9eec77..fed6042c67a 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -561,7 +561,6 @@ struct __wt_cursor { * user on open. */ const char *internal_uri; - /* Saved modification methods. */ #define WT_CURSTD_APPEND 0x0001 #define WT_CURSTD_BULK 0x0002 @@ -3336,110 +3335,116 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_PREALLOC_USED 1081 /*! log: log read operations */ #define WT_STAT_CONN_LOG_READS 1082 +/*! log: log release advances write LSN */ +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1083 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1083 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1084 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1084 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1085 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1085 +#define WT_STAT_CONN_LOG_SCANS 1086 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1086 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1087 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1087 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1088 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1088 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1089 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1089 +#define WT_STAT_CONN_LOG_SLOT_RACES 1090 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1090 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1091 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1091 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1092 /*! 
log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1092 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1093 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1093 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1094 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1094 +#define WT_STAT_CONN_LOG_SYNC 1095 +/*! log: log sync_dir operations */ +#define WT_STAT_CONN_LOG_SYNC_DIR 1096 +/*! log: log server thread advances write LSN */ +#define WT_STAT_CONN_LOG_WRITE_LSN 1097 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1095 +#define WT_STAT_CONN_LOG_WRITES 1098 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1096 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1099 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1097 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1100 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1098 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1101 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1099 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1102 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1100 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1103 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1101 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1104 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1102 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1105 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1103 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1106 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1104 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1107 /*! 
LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1105 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1108 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1106 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1109 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1107 +#define WT_STAT_CONN_MEMORY_FREE 1110 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1108 +#define WT_STAT_CONN_MEMORY_GROW 1111 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1109 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1112 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1110 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1113 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1111 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1114 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1112 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1115 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1113 +#define WT_STAT_CONN_PAGE_SLEEP 1116 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1114 +#define WT_STAT_CONN_READ_IO 1117 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1115 +#define WT_STAT_CONN_REC_PAGES 1118 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1116 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1119 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1117 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1120 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1118 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1121 /*! 
connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1119 +#define WT_STAT_CONN_RWLOCK_READ 1122 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1120 +#define WT_STAT_CONN_RWLOCK_WRITE 1123 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1121 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1124 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1122 +#define WT_STAT_CONN_SESSION_OPEN 1125 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1123 +#define WT_STAT_CONN_TXN_BEGIN 1126 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1124 +#define WT_STAT_CONN_TXN_CHECKPOINT 1127 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1125 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1128 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1126 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1129 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1127 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1130 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1128 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1131 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1129 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1132 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1130 +#define WT_STAT_CONN_TXN_COMMIT 1133 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1131 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1134 /*! 
transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1132 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1135 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1133 +#define WT_STAT_CONN_TXN_ROLLBACK 1136 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1134 +#define WT_STAT_CONN_WRITE_IO 1137 /*! * @} diff --git a/src/log/log.c b/src/log/log.c index f76ec402b0d..f485f0a09e5 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -61,16 +61,23 @@ __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec) WT_RET(__wt_curlog_open(session, "log:", NULL, &c)); c->set_key(c, ckp_lsn->file, ckp_lsn->offset, 0); - WT_ERR(c->search(c)); - - /* - * If the checkpoint LSN we're given is the last record, then recovery - * is not needed. - */ - if ((ret = c->next(c)) == WT_NOTFOUND) { - *rec = 0; + if ((ret = c->search(c)) == 0) { + /* + * If the checkpoint LSN we're given is the last record, + * then recovery is not needed. + */ + if ((ret = c->next(c)) == WT_NOTFOUND) { + *rec = 0; + ret = 0; + } + } else if (ret == WT_NOTFOUND) + /* + * If we didn't find that LSN, we need to run recovery, + * but not return any error. + */ ret = 0; - } + else + WT_ERR(ret); err: WT_TRET(c->close(c)); return (ret); @@ -455,6 +462,10 @@ __log_file_header( WT_ERR(__log_acquire(session, logrec->len, &tmp)); } WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); + /* + * Make sure the header gets to disk. 
+ */ + WT_ERR(__wt_fsync(session, tmp.slot_fh)); if (end_lsn != NULL) *end_lsn = tmp.slot_end_lsn; @@ -573,6 +584,7 @@ __log_truncate(WT_SESSION_IMPL *session, WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset)); tmp_fh = log_fh; log_fh = NULL; + WT_ERR(__wt_fsync(session, tmp_fh)); WT_ERR(__wt_close(session, tmp_fh)); /* @@ -596,6 +608,7 @@ __log_truncate(WT_SESSION_IMPL *session, log_fh, LOG_FIRST_RECORD)); tmp_fh = log_fh; log_fh = NULL; + WT_ERR(__wt_fsync(session, tmp_fh)); WT_ERR(__wt_close(session, tmp_fh)); } } @@ -646,6 +659,7 @@ __wt_log_allocfile( WT_ERR(__log_prealloc(session, log_fh)); tmp_fh = log_fh; log_fh = NULL; + WT_ERR(__wt_fsync(session, tmp_fh)); WT_ERR(__wt_close(session, tmp_fh)); WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_prealloc: rename %s to %s", @@ -790,17 +804,20 @@ __wt_log_close(WT_SESSION_IMPL *session) if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name)); + WT_RET(__wt_fsync(session, log->log_close_fh)); WT_RET(__wt_close(session, log->log_close_fh)); } if (log->log_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name)); + WT_RET(__wt_fsync(session, log->log_fh)); WT_RET(__wt_close(session, log->log_fh)); log->log_fh = NULL; } if (log->log_dir_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name)); + WT_RET(__wt_directory_sync_fh(session, log->log_dir_fh)); WT_RET(__wt_close(session, log->log_dir_fh)); log->log_dir_fh = NULL; } @@ -900,7 +917,7 @@ err: * Release a log slot. 
*/ static int -__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -913,6 +930,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) conn = S2C(session); log = conn->log; locked = yield_count = 0; + *freep = 1; /* Write the buffered records */ if (F_ISSET(slot, SLOT_BUFFERED)) { @@ -923,9 +941,29 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) } /* - * Wait for earlier groups to finish, otherwise there could be holes - * in the log file. + * If this is not a buffered write, meaning the slot we have is a + * dummy constructed slot, not from the slot pool, or we have to wait + * for a synchronous operation, we do not pass handling of this slot + * off to the worker thread. The caller is responsible for freeing + * the slot in that case. Otherwise the worker thread will free it. + */ + if (F_ISSET(slot, SLOT_BUFFERED) && + !F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { + *freep = 0; + slot->slot_state = WT_LOG_SLOT_WRITTEN; + /* + * After this point the worker thread owns the slot. There + * is nothing more to do but return. + */ + WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); + goto done; + } + + /* + * Wait for earlier groups to finish, otherwise there could + * be holes in the log file. */ + WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn); while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { if (++yield_count < 1000) __wt_yield(); @@ -936,6 +974,9 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) log->write_lsn = slot->slot_end_lsn; WT_ERR(__wt_cond_signal(session, log->log_write_cond)); + /* + * Signal the close thread if needed. 
+ */ if (F_ISSET(slot, SLOT_CLOSEFH)) WT_ERR(__wt_cond_signal(session, conn->log_close_cond)); @@ -978,7 +1019,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_ERR(__wt_directory_sync_fh( session, log->log_dir_fh)); log->sync_dir_lsn = sync_lsn; - F_CLR(slot, SLOT_SYNC_DIR); + WT_STAT_FAST_CONN_INCR(session, log_sync_dir); } /* @@ -990,26 +1031,22 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) "log_release: sync log %s", log->log_fh->name)); WT_STAT_FAST_CONN_INCR(session, log_sync); WT_ERR(__wt_fsync(session, log->log_fh)); - F_CLR(slot, SLOT_SYNC); log->sync_lsn = sync_lsn; WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); } + /* + * Clear the flags before leaving the loop. + */ + F_CLR(slot, SLOT_SYNC | SLOT_SYNC_DIR); locked = 0; __wt_spin_unlock(session, &log->log_sync_lock); break; } - if (F_ISSET(slot, SLOT_BUF_GROW)) { - WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); - F_CLR(slot, SLOT_BUF_GROW); - WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, slot->slot_buf.memsize); - WT_ERR(__wt_buf_grow(session, - &slot->slot_buf, slot->slot_buf.memsize * 2)); - } err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); if (ret != 0 && slot->slot_error == 0) slot->slot_error = ret; +done: return (ret); } @@ -1460,12 +1497,13 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LOG *log; WT_LOGSLOT tmp; WT_MYSLOT myslot; - int locked; + int dummy, locked; WT_DECL_SPINLOCK_ID(id); /* Must appear last */ log = S2C(session)->log; myslot.slot = &tmp; myslot.offset = 0; + dummy = 0; WT_CLEAR(tmp); /* Fast path the contended case. 
*/ @@ -1481,7 +1519,7 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; WT_ERR(__log_fill(session, &myslot, 1, record, lsnp)); - WT_ERR(__log_release(session, &tmp)); + WT_ERR(__log_release(session, &tmp, &dummy)); err: if (locked) __wt_spin_unlock(session, &log->log_slot_lock); @@ -1609,11 +1647,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN lsn; WT_MYSLOT myslot; uint32_t rdup_len; - int locked; + int free_slot, locked; conn = S2C(session); log = conn->log; - locked = 0; + free_slot = locked = 0; WT_INIT_LSN(&lsn); myslot.slot = NULL; /* @@ -1695,8 +1733,9 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_ERR(__wt_log_slot_wait(session, myslot.slot)); WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { - WT_ERR(__log_release(session, myslot.slot)); - WT_ERR(__wt_log_slot_free(myslot.slot)); + WT_ERR(__log_release(session, myslot.slot, &free_slot)); + if (free_slot) + WT_ERR(__wt_log_slot_free(session, myslot.slot)); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 && diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 8dcb2f9f165..02b3056be6f 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -57,7 +57,7 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) for (i = 0; i < SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); - F_SET(&log->slot_pool[i], SLOT_BUFFERED); + F_SET(&log->slot_pool[i], SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL); @@ -295,10 +295,34 @@ __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) * Free a slot back into the pool. 
*/ int -__wt_log_slot_free(WT_LOGSLOT *slot) +__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { + WT_DECL_RET; + + ret = 0; + /* + * Grow the buffer if needed before returning it to the pool. + */ + if (F_ISSET(slot, SLOT_BUF_GROW)) { + WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); + WT_STAT_FAST_CONN_INCRV(session, + log_buffer_size, slot->slot_buf.memsize); + WT_ERR(__wt_buf_grow(session, + &slot->slot_buf, slot->slot_buf.memsize * 2)); + } +err: + /* + * No matter if there is an error, we always want to free + * the slot back to the pool. + */ + /* + * Make sure flags don't get retained between uses. + * We have to reset them them here because multiple threads may + * change the flags when joining the slot. + */ + slot->flags = SLOT_INIT_FLAGS; slot->slot_state = WT_LOG_SLOT_FREE; - return (0); + return (ret); } /* diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 3b4dc639945..8474b6e8b37 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -77,6 +77,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) } else { primary = clsm->cursors[clsm->nchunks - 1]; primary_chunk = clsm->primary_chunk; + WT_ASSERT(session, F_ISSET(&session->txn, TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && (primary_chunk->switch_txn == WT_TXN_NONE || TXNID_LT(session->txn.id, primary_chunk->switch_txn))); @@ -177,14 +178,15 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) /* Update the maximum transaction ID in the primary chunk. */ if (update) { - WT_RET(__clsm_enter_update(clsm)); - if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) - goto open; - /* * Ensure that there is a transaction snapshot active. 
*/ WT_RET(__wt_txn_autocommit_check(session)); + WT_RET(__wt_txn_id_check(session)); + + WT_RET(__clsm_enter_update(clsm)); + if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) + goto open; if (session->txn.isolation == TXN_ISO_SNAPSHOT) __wt_txn_cursor_op(session); @@ -1237,11 +1239,12 @@ __clsm_put(WT_SESSION_IMPL *session, { WT_CURSOR *c, *primary; WT_LSM_TREE *lsm_tree; - u_int i; + u_int i, slot; lsm_tree = clsm->lsm_tree; WT_ASSERT(session, + F_ISSET(&session->txn, TXN_HAS_ID) && clsm->primary_chunk != NULL && (clsm->primary_chunk->switch_txn == WT_TXN_NONE || TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn))); @@ -1257,8 +1260,15 @@ __clsm_put(WT_SESSION_IMPL *session, if (position) clsm->current = primary; - for (i = 0; i < clsm->nupdates; i++) { - c = clsm->cursors[(clsm->nchunks - i) - 1]; + for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) { + /* Check if we need to keep updating old chunks. */ + if (i > 0 && + __wt_txn_visible(session, clsm->switch_txn[slot])) { + clsm->nupdates = i; + break; + } + + c = clsm->cursors[slot]; c->set_key(c, key); c->set_value(c, value); WT_RET((position && i == 0) ? c->update(c) : c->insert(c)); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index f4ddd4f7e2f..dea012ccb9e 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -401,7 +401,13 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); - F_SET(session, WT_SESSION_NO_CACHE); + /* + * Setup so that we don't hold pages we read into cache, and so + * that we don't get stuck if the cache is full. If we allow + * ourselves to get stuck creating bloom filters, the entire tree + * can stall since there may be no worker threads available to flush. 
+ */ + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); @@ -414,15 +420,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, F_CLR(session, WT_SESSION_NO_CACHE); - /* - * Load the new Bloom filter into cache. - * - * We're doing advisory reads to fault the new trees into cache. - * Don't block if the cache is full: our next unit of work may be to - * discard some trees to free space. - */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK); - + /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 4ca1a6bc623..33d79e6d4ce 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -522,6 +522,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) */ mod->mod_root_split = next; + /* + * Mark the page dirty. + * Don't mark the tree dirty: if this reconciliation is in service of a + * checkpoint, it's cleared the tree's dirty flag, and we don't want to + * set it again as part of that walk. + */ WT_ERR(__wt_page_modify_init(session, next)); __wt_page_only_modify_set(session, next); @@ -1113,12 +1119,14 @@ __rec_child_modify(WT_SESSION_IMPL *session, * process will have completed before we walk any pages * for checkpoint. 
*/ - if ((ret = __wt_page_in(session, ref, + ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | - WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) { + WT_READ_NO_GEN | WT_READ_NO_WAIT); + if (ret == WT_NOTFOUND) { ret = 0; break; } + WT_RET(ret); *hazardp = 1; goto in_memory; @@ -1173,7 +1181,7 @@ in_memory: CHILD_RELEASE(session, *hazardp, ref); } -done: WT_HAVE_DIAGNOSTIC_YIELD; +done: WT_DIAGNOSTIC_YIELD; return (ret); } @@ -1982,16 +1990,20 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) next->start = r->first_free; next->entries = 0; - /* - * Set the space available to another split-size chunk, if we - * have one. If we don't have room for another split chunk, - * add whatever space remains in this page. - */ + /* Set the space available to another split-size chunk. */ r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + + /* + * Adjust the space available to handle two cases: + * - We don't have enough room for another full split-size + * chunk on the page. + * - We chose to fill past a page boundary because of a + * large item. + */ if (inuse + r->space_avail > r->page_size) { - WT_ASSERT(session, r->page_size >= inuse); - r->space_avail = r->page_size - inuse; + r->space_avail = + r->page_size > inuse ? (r->page_size - inuse) : 0; /* There are no further boundary points. */ r->bnd_state = SPLIT_MAX; @@ -2649,7 +2661,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * WT_PAGE_HEADER header onto the scratch buffer, most of the header * information remains unchanged between the pages. */ - WT_RET(__wt_scr_alloc(session, r->page_size, &tmp)); + WT_RET(__wt_scr_alloc(session, r->dsk.memsize, &tmp)); dsk = tmp->mem; memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); @@ -2977,7 +2989,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_split_finish(session, r)); WT_RET(__rec_write_wrapup(session, r, r->page)); - /* Mark the page's parent dirty. 
*/ + /* Mark the page's parent and the tree dirty. */ parent = r->ref->home; WT_RET(__wt_page_modify_init(session, parent)); __wt_page_modify_set(session, parent); @@ -3017,8 +3029,6 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET( __rec_split_raw(session, r, key->len + val->len)); else { - WT_RET(__rec_split(session, r, key->len + val->len)); - /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3030,6 +3040,8 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } + + WT_RET(__rec_split(session, r, key->len + val->len)); } } @@ -3225,15 +3237,18 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); addr = NULL; child = ref->page; - if (state != 0) { - /* - * Currently the only non-zero returned stated possible - * for a column-store page is child-modified (all other - * states are part of the fast-truncate support, which - * is row-store only). - */ - WT_ASSERT(session, state == WT_CHILD_MODIFIED); + /* Deleted child we don't have to write. */ + if (state == WT_CHILD_IGNORE) { + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + } + + /* + * Modified child. Empty pages are merged into the parent and + * discarded. + */ + if (state == WT_CHILD_MODIFIED) { switch (F_ISSET(child->modify, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* @@ -3253,7 +3268,9 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) break; WT_ILLEGAL_VALUE_ERR(session); } - } + } else + /* No other states are expected for column stores. */ + WT_ASSERT(session, state == 0); /* * Build the value cell. 
The child page address is in one of 3 @@ -4550,8 +4567,6 @@ build: WT_PAGE_ROW_LEAF, kpack, r->cur)); key_onpage_ovfl = 0; } - WT_ERR(__rec_split( - session, r, key->len + val->len)); /* * Turn off prefix compression until a full key @@ -4567,6 +4582,9 @@ build: session, r, NULL, 0, &ovfl_key)); } + + WT_ERR(__rec_split( + session, r, key->len + val->len)); } } @@ -4636,9 +4654,6 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_RET(__rec_split_raw( session, r, key->len + val->len)); else { - WT_RET(__rec_split( - session, r, key->len + val->len)); - /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -4653,6 +4668,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) session, r, NULL, 0, &ovfl_key)); } + + WT_RET(__rec_split( + session, r, key->len + val->len)); } } @@ -5085,7 +5103,7 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey(session, 0, + WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); if (bnd->skip == NULL) { diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index 3dfd068cf9c..e913fcfe69d 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -55,11 +55,17 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_ERR(ret); } - WT_ERR(__wt_session_get_btree_ckpt( - session, uri, cfg, open_flags)); - WT_SAVE_DHANDLE(session, - ret = file_func(session, cfg)); - WT_TRET(__wt_session_release_btree(session)); + if ((ret = __wt_session_get_btree_ckpt( + session, uri, cfg, open_flags)) == 0) { + WT_SAVE_DHANDLE(session, + ret = file_func(session, cfg)); + WT_TRET(__wt_session_release_btree(session)); + } else if (ret == EBUSY) + /* TODO: Decode checkpoint from cfg. 
*/ + WT_WITH_DHANDLE_LOCK(session, + ret = __wt_conn_btree_apply_single_ckpt( + session, uri, file_func, cfg)); + WT_ERR(ret); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { WT_ERR(__wt_schema_get_colgroup( diff --git a/src/support/stat.c b/src/support/stat.c index 0926636a532..9d10c4d5ca6 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -447,10 +447,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "log: log records not compressed"; stats->log_compress_small.desc = "log: log records too small to compress"; + stats->log_release_write_lsn.desc = + "log: log release advances write LSN"; stats->log_scans.desc = "log: log scan operations"; stats->log_scan_rereads.desc = "log: log scan records requiring two reads"; + stats->log_write_lsn.desc = + "log: log server thread advances write LSN"; stats->log_sync.desc = "log: log sync operations"; + stats->log_sync_dir.desc = "log: log sync_dir operations"; stats->log_writes.desc = "log: log write operations"; stats->log_slot_consolidated.desc = "log: logging bytes consolidated"; stats->log_max_filesize.desc = "log: maximum log file size"; @@ -613,9 +618,12 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_compress_writes.v = 0; stats->log_compress_write_fails.v = 0; stats->log_compress_small.v = 0; + stats->log_release_write_lsn.v = 0; stats->log_scans.v = 0; stats->log_scan_rereads.v = 0; + stats->log_write_lsn.v = 0; stats->log_sync.v = 0; + stats->log_sync_dir.v = 0; stats->log_writes.v = 0; stats->log_slot_consolidated.v = 0; stats->log_prealloc_max.v = 0; diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index fb590e1a297..87b85eb2d8d 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -259,10 +259,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) session->ckpt_handle[session->ckpt_handle_next++].dhandle = session->dhandle; else if (ret == EBUSY) - WT_ERR(__wt_strdup(session, name, - &session->ckpt_handle[session->ckpt_handle_next++].name)); + 
ret = __wt_strdup(session, name, + &session->ckpt_handle[session->ckpt_handle_next++].name); -err: return (ret); + return (ret); } /* @@ -988,14 +988,23 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) int __wt_checkpoint_close(WT_SESSION_IMPL *session, int force) { - /* If closing an unmodified file, simply discard its blocks. */ - if (!S2BT(session)->modified || force) - return (__wt_cache_op(session, NULL, - force ? WT_SYNC_DISCARD_FORCE : WT_SYNC_DISCARD)); + WT_DECL_RET; + + /* Handle forced discard (when dropping a file). */ + if (force) + return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE)); + + /* If closing an unmodified file, try to evict its pages. */ + if (!S2BT(session)->modified) { + ret = __wt_cache_op(session, NULL, WT_SYNC_DISCARD); + if (ret != EBUSY) + return (ret); + } /* - * Else, checkpoint the file and optionally flush the writes (the - * checkpoint call will discard the blocks, there's no additional + * If closing a modified file, or closing an unmodified file was blocked + * for any reason, checkpoint the file and optionally flush the writes + * (the checkpoint call will discard the blocks, there's no additional * step needed). 
*/ WT_RET(__checkpoint_worker(session, NULL, 0)); diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index 72f53fed9f8..086faef1a30 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -28,8 +28,6 @@ #include "wt_internal.h" -#include <signal.h> - static struct { char *progname; /* Program name */ @@ -38,8 +36,7 @@ static struct { char *config_open; /* Command-line configuration */ - uint32_t c_bitcnt; /* Config values */ - uint32_t c_cache; + uint32_t c_cache; /* Config values */ uint32_t c_key_max; uint32_t c_ops; uint32_t c_k; /* Number of hash iterations */ @@ -49,12 +46,12 @@ static struct { uint8_t **entries; } g; -static int cleanup(void); +void cleanup(void); void die(int e, const char *fmt, ...); -static int populate_entries(void); -static int run(void); -static int setup(void); -static void usage(void); +void populate_entries(void); +void run(void); +void setup(void); +void usage(void); extern char *__wt_optarg; extern int __wt_optind; @@ -109,7 +106,7 @@ main(int argc, char *argv[]) return (EXIT_SUCCESS); } -int +void setup(void) { WT_CONNECTION *conn; @@ -141,13 +138,10 @@ setup(void) g.wt_conn = conn; g.wt_session = session; - if ((ret = populate_entries()) != 0) - die(ret, "populate_entries"); - - return (0); + populate_entries(); } -int +void run(void) { WT_BLOOM *bloomp; @@ -184,7 +178,8 @@ run(void) if ((ret = __wt_bloom_close(bloomp)) != 0) die(ret, "__wt_bloom_close"); - g.wt_session->checkpoint(g.wt_session, NULL); + if ((ret = g.wt_session->checkpoint(g.wt_session, NULL)) != 0) + die(ret, "WT_SESSION.checkpoint"); if ((ret = __wt_bloom_open( sess, uri, g.c_factor, g.c_k, NULL, &bloomp)) != 0) die(ret, "__wt_bloom_open"); @@ -212,28 +207,28 @@ run(void) g.c_ops, fp, 100.0 * fp/g.c_ops); if ((ret = __wt_bloom_drop(bloomp, NULL)) != 0) die(ret, "__wt_bloom_drop"); - - return (0); } -int +void cleanup(void) { uint32_t i; + int ret; for (i = 0; i < g.c_ops; i++) free(g.entries[i]); free(g.entries); - 
g.wt_session->close(g.wt_session, NULL); - g.wt_conn->close(g.wt_conn, NULL); - return (0); + if ((ret = g.wt_session->close(g.wt_session, NULL)) != 0) + die(ret, "WT_SESSION.close"); + if ((g.wt_conn->close(g.wt_conn, NULL)) != 0) + die(ret, "WT_CONNECTION.close"); } /* * Create and keep all the strings used to populate the bloom filter, so that * we can do validation with the same set of entries. */ -static int +void populate_entries(void) { uint32_t i, j; @@ -254,7 +249,6 @@ populate_entries(void) } g.entries = entries; - return (0); } /* @@ -283,7 +277,7 @@ die(int e, const char *fmt, ...) * usage -- * Display usage statement and exit failure. */ -static void +void usage(void) { fprintf(stderr, "usage: %s [-cfkos]\n", g.progname); diff --git a/test/format/bdb.c b/test/format/bdb.c index 563b69b9e27..254dd95e1d3 100644 --- a/test/format/bdb.c +++ b/test/format/bdb.c @@ -66,10 +66,7 @@ bdb_open(void) assert(dbenv->mutex_set_max(dbenv, 10000) == 0); assert(dbenv->set_cachesize(dbenv, 0, 50 * 1024 * 1024, 1) == 0); assert(dbenv->open(dbenv, NULL, - DB_CREATE | - (g.c_delete_pct == 0 && g.c_insert_pct == 0 && g.c_write_pct == 0 ? - 0 : DB_INIT_LOCK) | - DB_INIT_MPOOL | DB_PRIVATE, 0) == 0); + DB_CREATE | DB_INIT_LOCK | DB_INIT_MPOOL | DB_PRIVATE, 0) == 0); assert(db_create(&db, dbenv, 0) == 0); if (g.type == ROW && g.c_reverse) diff --git a/test/format/config.c b/test/format/config.c index 1fbbe90a57e..e801827935c 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -71,14 +71,14 @@ config_setup(void) } if (!config_find_is_perm("file_type", strlen("file_type"))) - switch (DATASOURCE("lsm") ? 3 : MMRAND(1, 3)) { + switch (DATASOURCE("lsm") ? 
5 : MMRAND(1, 10)) { case 1: config_single("file_type=fix", 0); break; - case 2: + case 2: case 3: case 4: config_single("file_type=var", 0); break; - case 3: + case 5: case 6: case 7: case 8: case 9: case 10: config_single("file_type=row", 0); break; } @@ -142,12 +142,6 @@ config_setup(void) config_compression(); config_isolation(); - /* Clear operations values if the whole run is read-only. */ - if (g.c_ops == 0) - for (cp = c; cp->name != NULL; ++cp) - if (cp->flags & C_OPS) - *cp->v = 0; - /* * Periodically, set the delete percentage to 0 so salvage gets run, * as long as the delete percentage isn't nailed down. @@ -174,6 +168,11 @@ config_setup(void) g.c_insert_pct = MMRAND(50, 85); } + /* Make the default maximum-run length 20 minutes. */ + cp = config_find("timer", strlen("timer")); + if (!(cp->flags & C_PERM)) + g.c_timer = 20; + /* * Key/value minimum/maximum are related, correct unless specified by * the configuration. @@ -238,8 +237,9 @@ config_compression(void) /* * Compression: choose something if compression wasn't specified, * otherwise confirm the appropriate shared library is available. - * We don't include LZO in the test compression choices, we don't - * yet have an LZO module of our own. + * We used to verify that the libraries existed but that's no longer + * robust, since it's possible to build compression libraries into + * the WiredTiger library. 
*/ cp = config_find("compression", strlen("compression")); if (!(cp->flags & C_PERM)) { @@ -249,50 +249,24 @@ config_compression(void) case 4: case 5: case 6: break; case 7: case 8: case 9: case 10: /* 20% bzip */ - if (access(BZIP_PATH, R_OK) == 0) - cstr = "compression=bzip"; + cstr = "compression=bzip"; break; case 11: /* 5% bzip-raw */ - if (access(BZIP_PATH, R_OK) == 0) - cstr = "compression=bzip-raw"; + cstr = "compression=bzip-raw"; break; case 12: case 13: case 14: case 15: /* 20% snappy */ - if (access(SNAPPY_PATH, R_OK) == 0) - cstr = "compression=snappy"; + cstr = "compression=snappy"; break; case 16: case 17: case 18: case 19: /* 20% zlib */ - if (access(ZLIB_PATH, R_OK) == 0) - cstr = "compression=zlib"; + cstr = "compression=zlib"; break; case 20: /* 5% zlib-no-raw */ - if (access(ZLIB_PATH, R_OK) == 0) - cstr = "compression=zlib-noraw"; + cstr = "compression=zlib-noraw"; break; } config_single(cstr, 0); } - - switch (g.c_compression_flag) { - case COMPRESS_BZIP: - case COMPRESS_BZIP_RAW: - if (access(BZIP_PATH, R_OK) != 0) - die(0, "bzip library not found or not readable"); - break; - case COMPRESS_LZO: - if (access(LZO_PATH, R_OK) != 0) - die(0, "LZO library not found or not readable"); - break; - case COMPRESS_SNAPPY: - if (access(SNAPPY_PATH, R_OK) != 0) - die(0, "snappy library not found or not readable"); - break; - case COMPRESS_ZLIB: - case COMPRESS_ZLIB_NO_RAW: - if (access(ZLIB_PATH, R_OK) != 0) - die(0, "zlib library not found or not readable"); - break; - } } /* diff --git a/test/format/config.h b/test/format/config.h index 7871127ff26..d5d797f4b50 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -40,14 +40,11 @@ typedef struct { /* Not a simple randomization, handle outside the main loop. */ #define C_IGNORE 0x002 - /* Operation, only set if doing operations. */ -#define C_OPS 0x004 - /* Value was set from command-line or file, ignore for all runs. 
*/ -#define C_PERM 0x008 +#define C_PERM 0x004 /* Value isn't random for this run, ignore just for this run. */ -#define C_TEMP 0x010 +#define C_TEMP 0x008 /* Value is a string. */ #define C_STRING 0x020 @@ -134,7 +131,7 @@ static CONFIG c[] = { { "delete_pct", "percent operations that are deletes", - C_OPS, 0, 45, 90, &g.c_delete_pct, NULL }, + 0x0, 0, 45, 90, &g.c_delete_pct, NULL }, { "dictionary", "if values are dictionary compressed", /* 20% */ @@ -162,7 +159,7 @@ static CONFIG c[] = { { "insert_pct", "percent operations that are inserts", - C_OPS, 0, 45, 90, &g.c_insert_pct, NULL }, + 0x0, 0, 45, 90, &g.c_insert_pct, NULL }, { "internal_key_truncation", "if internal keys are truncated", /* 95% */ @@ -270,7 +267,7 @@ static CONFIG c[] = { C_IGNORE, 1, 32, 128, &g.c_threads, NULL }, { "timer", - "time to run in minutes", + "maximum time to run in minutes (default 20 minutes)", C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_timer, NULL }, { "value_max", @@ -287,7 +284,7 @@ static CONFIG c[] = { { "write_pct", "percent operations that are writes", - C_OPS, 0, 90, 90, &g.c_write_pct, NULL }, + 0x0, 0, 90, 90, &g.c_write_pct, NULL }, { NULL, NULL, 0x0, 0, 0, 0, NULL, NULL } }; diff --git a/test/format/format.h b/test/format/format.h index e2cd4f19c7e..58940f0c4b8 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -307,7 +307,7 @@ void wts_create(void); void wts_dump(const char *, int); void wts_load(void); void wts_open(const char *, int, WT_CONNECTION **); -void wts_ops(void); +void wts_ops(int); void wts_read_scan(void); void wts_salvage(void); void wts_stats(void); diff --git a/test/format/ops.c b/test/format/ops.c index 3a0a9110b9c..5fd992e9952 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -46,14 +46,14 @@ static void table_append_init(void); * Perform a number of operations in a set of threads. 
*/ void -wts_ops(void) +wts_ops(int lastrun) { TINFO *tinfo, total; WT_CONNECTION *conn; WT_SESSION *session; pthread_t backup_tid, compact_tid; - uint64_t thread_ops; - uint32_t i, fourths; + int64_t fourths, thread_ops; + uint32_t i; int ret, running; conn = g.wts_conn; @@ -71,20 +71,23 @@ wts_ops(void) /* * There are two mechanisms to specify the length of the run, a number - * of operations or a timer. If the former, each thread does an equal - * share of the total operations (and make sure that it's not 0). If - * the latter, calculate how many fourth-of-a-second sleeps until this - * part of the run finishes. + * of operations and a timer, when either expire the run terminates. + * Each thread does an equal share of the total operations (and make + * sure that it's not 0). + * + * Calculate how many fourth-of-a-second sleeps until any timer expires. */ - if (g.c_timer == 0) { - fourths = 0; + if (g.c_ops == 0) + thread_ops = -1; + else { if (g.c_ops < g.c_threads) g.c_ops = g.c_threads; thread_ops = g.c_ops / g.c_threads; - } else { - fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS; - thread_ops = 0; } + if (g.c_timer == 0) + fourths = -1; + else + fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS; /* Initialize the table extension code. */ table_append_init(); @@ -117,8 +120,9 @@ wts_ops(void) die(ret, "pthread_create: compaction"); /* Spin on the threads, calculating the totals. */ - memset(&total, 0, sizeof(total)); for (;;) { + /* Clear out the totals each pass. */ + memset(&total, 0, sizeof(total)); for (i = 0, running = 0; i < g.c_threads; ++i) { total.commit += tinfo[i].commit; total.deadlock += tinfo[i].deadlock; @@ -140,27 +144,29 @@ wts_ops(void) break; } - if (thread_ops == 0) { + /* + * If the timer has expired or this thread has completed + * its operations, notify the thread it should quit. 
+ */ + if (fourths == 0 || + (thread_ops != -1 && + tinfo[i].ops >= (uint64_t)thread_ops)) { /* - * Optionally drop core (for testing recovery), - * otherwise tell the thread it's done. + * On the last execution, optionally drop core + * for recovery testing. */ - if (fourths == 0) { - if (g.c_abort) { - static char *core = NULL; - *core = 0; - } - tinfo[i].quit = 1; + if (lastrun && g.c_abort) { + static char *core = NULL; + *core = 0; } - } else - if (tinfo[i].ops >= thread_ops) - tinfo[i].quit = 1; + tinfo[i].quit = 1; + } } track("ops", 0ULL, &total); if (!running) break; (void)usleep(250000); /* 1/4th of a second */ - if (fourths != 0) + if (fourths != -1) --fourths; } free(tinfo); diff --git a/test/format/recover.sh b/test/format/recover.sh index de908c71e5d..4177e26a278 100644 --- a/test/format/recover.sh +++ b/test/format/recover.sh @@ -37,12 +37,16 @@ while true; do # Save a copy of the database directory exactly as it was at the crash. cp -rp RUNDIR $rundir2 - # We aborted, so recovery is required - if `$wtcmd -R -h RUNDIR list | egrep table > /dev/null`; then - uri='table:wt' + # + # Everything is a table unless explicitly a file. + # + isfile=`grep data_source RUNDIR/CONFIG | grep -c file || exit 0` + if test "$isfile" -ne 0; then + uri="file:wt" else - uri='file:wt' + uri="table:wt" fi - # Force recovery to run. + + # We know we aborted, so force recovery to run. $wtcmd -R -h RUNDIR verify $uri || exit 1 done diff --git a/test/format/smoke.sh b/test/format/smoke.sh index 62577692d0c..fe53f64229f 100755 --- a/test/format/smoke.sh +++ b/test/format/smoke.sh @@ -1,7 +1,7 @@ #! /bin/sh # Smoke-test format as part of running "make check". -args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4" +args="-1 -c "." 
data_source=table ops=100000 rows=10000 threads=4 compression=none" ./t $args file_type=fix || exit 1 ./t $args file_type=row || exit 1 diff --git a/test/format/t.c b/test/format/t.c index b53913b4623..03b3605a5e4 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -40,6 +40,7 @@ extern char *__wt_optarg; int main(int argc, char *argv[]) { + time_t start; int ch, reps, ret; const char *config, *home; @@ -174,7 +175,9 @@ main(int argc, char *argv[]) config_print(0); /* Dump run configuration */ key_len_setup(); /* Setup keys */ + start = time(NULL); track("starting up", 0ULL, NULL); + if (SINGLETHREADED) bdb_open(); /* Initial file config */ wts_open(g.home, 1, &g.wts_conn); @@ -183,35 +186,35 @@ main(int argc, char *argv[]) wts_load(); /* Load initial records */ wts_verify("post-bulk verify"); /* Verify */ - /* Loop reading & operations */ - for (reps = 0; reps < FORMAT_OPERATION_REPS; ++reps) { - wts_read_scan(); /* Read scan */ - - /* Operations */ - if (g.c_timer != 0 || g.c_ops != 0) - wts_ops(); - - /* - * Statistics. - * - * XXX - * Verify closes the underlying handle and discards the - * statistics, read them first. - */ - if (g.c_ops == 0 || reps == 2) - wts_stats(); - - /* Verify */ - wts_verify("post-ops verify"); - - /* - * If no operation count, quit after a single read pass. - * (A timer configuration ran out the timer on the first - * set of operations.) - */ - if (g.c_ops == 0) - break; - } + /* + * If we're not doing any operations, scan the bulk-load, copy + * the statistics and we're done. Otherwise, loop reading and + * operations, with a verify after each set. + */ + if (g.c_timer == 0 && g.c_ops == 0) { + wts_read_scan(); /* Read scan */ + wts_stats(); /* Statistics */ + } else + for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps) { + wts_read_scan(); /* Read scan */ + + /* Operations */ + wts_ops(reps == FORMAT_OPERATION_REPS); + + /* + * Copy out the run's statistics after the last + * set of operations. 
+ * + * XXX + * Verify closes the underlying handle and + * discards the statistics, read them first. + */ + if (reps == FORMAT_OPERATION_REPS) + wts_stats(); + + /* Verify */ + wts_verify("post-ops verify"); + } track("shutting down", 0ULL, NULL); if (SINGLETHREADED) @@ -233,8 +236,9 @@ main(int argc, char *argv[]) /* Overwrite the progress line with a completion line. */ if (g.track) printf("\r%78s\r", " "); - printf("%4d: %s, %s\n", - g.run_cnt, g.c_data_source, g.c_file_type); + printf("%4d: %s, %s (%.0f seconds)\n", + g.run_cnt, g.c_data_source, + g.c_file_type, difftime(time(NULL), start)); } /* Flush/close any logging information. */ diff --git a/test/packing/Makefile.am b/test/packing/Makefile.am new file mode 100644 index 00000000000..a8c6c2dc69f --- /dev/null +++ b/test/packing/Makefile.am @@ -0,0 +1,5 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_PROGRAMS = intpack-test intpack-test2 packing-test +LDADD = $(top_builddir)/libwiredtiger.la +LDFLAGS = -static diff --git a/test/packing/intpack-test.c b/test/packing/intpack-test.c index 109f37e229a..51acea15506 100644 --- a/test/packing/intpack-test.c +++ b/test/packing/intpack-test.c @@ -27,29 +27,29 @@ */ #include <assert.h> -#include <stdlib.h> -#include <time.h> -#include <wt_internal.h> -#include "intpack.i" +#include "wt_internal.h" -int main() { +int +main() +{ + const uint8_t *cp; uint8_t buf[10], *p; - uint64_t r, r2, ncalls; - int i, s; + uint64_t ncalls, r, r2, s; + int i; ncalls = 0; for (i = 0; i < 10000000; i++) { for (s = 0; s < 50; s += 5) { ++ncalls; - r = 1 << s; + r = 1ULL << s; #if 1 p = buf; - __wt_vpack_uint(NULL, &p, sizeof buf, r); - p = buf; - __wt_vunpack_uint(NULL, &p, sizeof buf, &r2); + assert(__wt_vpack_uint(&p, sizeof(buf), r) == 0); + cp = buf; + assert(__wt_vunpack_uint(&cp, sizeof(buf), &r2) == 0); #else /* * Note: use memmove for comparison because GCC does @@ -57,9 +57,9 @@ int main() { * to measure anything. 
*/ p = buf; - memmove(p, &r, sizeof r); - p = buf; - memmove(&r2, p, sizeof r2); + memmove(p, &r, sizeof(r)); + cp = buf; + memmove(&r2, cp, sizeof(r2)); #endif if (r != r2) { fprintf(stderr, "mismatch!\n"); diff --git a/test/packing/intpack-test2.c b/test/packing/intpack-test2.c index 6b54504f367..d9ac9373cea 100644 --- a/test/packing/intpack-test2.c +++ b/test/packing/intpack-test2.c @@ -27,27 +27,26 @@ */ #include <assert.h> -#include <stdlib.h> -#include <time.h> -#include <wt_internal.h> -#include "intpack.i" +#include "wt_internal.h" -int main() { +int +main() +{ uint8_t buf[10], *p, *end; int64_t i; for (i = 1; i < 1LL << 60; i <<= 1) { end = buf; - __wt_vpack_uint(NULL, &end, sizeof buf, i); - printf("%lld ", i); + assert(__wt_vpack_uint(&end, sizeof(buf), (uint64_t)i) == 0); + printf("%" PRId64 " ", i); for (p = buf; p < end; p++) printf("%02x", *p); printf("\n"); end = buf; - __wt_vpack_int(NULL, &end, sizeof buf, -i); - printf("%lld ", -i); + assert(__wt_vpack_int(&end, sizeof(buf), -i) == 0); + printf("%" PRId64 " ", -i); for (p = buf; p < end; p++) printf("%02x", *p); printf("\n"); diff --git a/test/packing/packing-test.c b/test/packing/packing-test.c index 2696e8a008d..32b7d3d17ec 100644 --- a/test/packing/packing-test.c +++ b/test/packing/packing-test.c @@ -27,26 +27,26 @@ */ #include <assert.h> -#include <stdlib.h> -#include <time.h> -#include <wiredtiger.h> -#include <stdarg.h> +#include "wt_internal.h" -void check(const char *fmt, ...) +static void +check(const char *fmt, ...) 
{ char buf[200], *end, *p; va_list ap; size_t len; + len = 0; /* -Werror=maybe-uninitialized */ + va_start(ap, fmt); - len = wiredtiger_struct_sizev(fmt, ap); + assert(__wt_struct_sizev(NULL, &len, fmt, ap) == 0); va_end(ap); - assert(len < sizeof buf); + assert(len < sizeof(buf)); va_start(ap, fmt); - assert(wiredtiger_struct_packv(buf, sizeof buf, fmt, ap) == 0); + assert(__wt_struct_packv(NULL, buf, sizeof(buf), fmt, ap) == 0); va_end(ap); printf("%s ", fmt); @@ -55,7 +55,9 @@ void check(const char *fmt, ...) printf("\n"); } -int main() { +int +main() +{ check("iii", 0, 101, -99); check("3i", 0, 101, -99); check("iS", 42, "forty two"); diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index f1e4f26c255..1c4d54df9e9 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -447,6 +447,18 @@ run(int r) } /* + * file_exists -- + * Return if the file exists. + */ +static int +file_exists(const char *path) +{ + struct stat sb; + + return (stat(path, &sb) == 0); +} + +/* * build -- * Build a row- or column-store page in a file. */ @@ -529,21 +541,16 @@ build(int ikey, int ivalue, int cnt) } /* - * The first time through this routine we put a matching configuration - * in for the salvage file. + * The first time through this routine we create the salvage file and + * then remove it (all we want is the appropriate schema entry, we're + * creating the salvage file itself by hand). */ - new_slvg = (access(SLVG, F_OK) != 0); + new_slvg = !file_exists(SLVG); if (new_slvg) { assert(session->drop(session, "file:" SLVG, "force") == 0); assert(session->create(session, "file:" SLVG, config) == 0); } - assert(conn->close(conn, 0) == 0); - - /* - * We created the salvage file above, but all we want is the schema, - * we're creating the salvage file by hand. - */ if (new_slvg) (void)remove(SLVG); } @@ -567,12 +574,13 @@ copy(u_int gen, u_int recno) * copy the first sector (the file description). * Otherwise, we are appending to an existing file. 
*/ - if (access(SLVG, F_OK)) { + if (file_exists(SLVG)) + assert((ofp = fopen(SLVG, "a")) != NULL); + else { assert((ofp = fopen(SLVG, "w")) != NULL); assert(fread(buf, 1, PSIZE, ifp) == PSIZE); assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE); - } else - assert((ofp = fopen(SLVG, "a")) != NULL); + } /* * If there's data, copy/update the first formatted page. diff --git a/test/suite/test_bug009.py b/test/suite/test_bug009.py new file mode 100644 index 00000000000..9074d45bafd --- /dev/null +++ b/test/suite/test_bug009.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# test_bug009.py +# check that reconciliation takes into account prefix compression +# when figuring out how to split pages +# + +import wiredtiger, wttest +from wiredtiger import stat +from helper import confirm_empty,\ + key_populate, value_populate, simple_populate,\ + complex_populate, complex_value_populate +from wtscenario import multiply_scenarios, number_scenarios + +class test_bug009(wttest.WiredTigerTestCase): + name = 'test_bug009' + uri = 'file:' + name + + def test_reconciliation_prefix_compression(self): + # Configure 4KB pages with prefix compression enabled and support for + # large data items. + self.session.create(self.uri, + 'prefix_compression=1,' + + 'key_format=S,value_format=S,' + + 'internal_page_max=4KB,leaf_page_max=4KB,' + + 'leaf_value_max=3096') + + cursor = self.session.open_cursor(self.uri, None) + # Insert two items with keys that will be prefix compressed and data + # items sized so that the compression size difference tips the + # size over a page boundary. + cursor.set_key('fill_2__b_27') + cursor.set_value(2294 * '0') + cursor.insert() + + cursor.set_key('fill_2__b_28') + cursor.set_value(3022 * '0') + cursor.insert() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_bug010.py b/test/suite/test_bug010.py new file mode 100644 index 00000000000..31e9777aa8e --- /dev/null +++ b/test/suite/test_bug010.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_bug010.py +# check that checkpoints don't leave files marked clean when they +# did not write all updates out. +# + +import wiredtiger, wttest, wtthread +import threading, time + +class test_bug010(wttest.WiredTigerTestCase): + name = 'test_bug010' + uri = 'table:' + name + num_tables = 1000 + + # Overrides WiredTigerTestCase + def setUpConnectionOpen(self, dir): + self.home = dir + # Disable checkpoint sync, to make checkpoints faster and + # increase the likelihood of triggering the symptom + conn_params = ',create,checkpoint_sync=false' + conn = wiredtiger.wiredtiger_open(dir, conn_params) + return conn + + def test_checkpoint_dirty(self): + # Create a lot of tables + # insert the same item in each + # Start a checkpoint with some of the updates + # Create another checkpoint that should contain all data consistently + # Read from the checkpoint and make sure the data is consistent + for i in range(0, self.num_tables): + self.printVerbose(3, 'Creating table ' + str(i)) + self.session.create(self.uri + str(i), + 'key_format=S,value_format=i') + c = self.session.open_cursor(self.uri + str(i), None) + c.set_key('a') + c.set_value(0) + c.insert() + c.close() + + self.session.checkpoint() + + iterations = 1 + expected_val = 0 + for 
its in range(1, 10): + self.printVerbose(3, 'Doing iteration ' + str(its)) + + # Create a checkpoint thread + done = threading.Event() + ckpt = wtthread.checkpoint_thread(self.conn, done) + ckpt.start() + try: + expected_val += 1 + for i in range(0, self.num_tables): + c = self.session.open_cursor(self.uri + str(i), None) + c.set_key('a') + c.set_value(expected_val) + c.insert() + c.close() + finally: + done.set() + ckpt.join() + + # Execute another checkpoint, to make sure we have a consistent + # view of the data. + self.session.checkpoint() + for i in range(0, self.num_tables): + c = self.session.open_cursor( + self.uri + str(i), None, 'checkpoint=WiredTigerCheckpoint') + c.next() + self.assertEquals(c.get_value(), expected_val, + msg='Mismatch on iteration ' + str(its) +\ + ' for table ' + str(i)) + c.close() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_dump.py b/test/suite/test_dump.py index 4c7e6f667e4..6d81c102028 100644 --- a/test/suite/test_dump.py +++ b/test/suite/test_dump.py @@ -67,6 +67,31 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): scenarios = number_scenarios( multiply_scenarios('.', types, keyfmt, dumpfmt)) + # Extract the values lines from the dump output. + def value_lines(self, fname): + # mode: + # 0 == we are in the header + # 1 == next line is key + # 2 == next line is value + mode = 0 + lines = [] + for line in open(fname).readlines(): + if mode == 0: + if line == 'Data\n': + mode = 1 + elif mode == 1: + mode = 2 + else: + # This is a value line, keep it. + lines.append(line) + mode = 1 + return sorted(lines) + + def compare_dump_values(self, f1, f2): + l1 = self.value_lines(f1) + l2 = self.value_lines(f2) + self.assertEqual(l1, l2) + # Dump, re-load and do a content comparison. def test_dump(self): # Create the object. 
@@ -105,5 +130,14 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): 'load', '-n', '-f', 'dump.out'], errfilename='errfile.out') self.check_non_empty_file('errfile.out') + # If there is are indices, dump one of them and check the output. + if self.populate == complex_populate: + indexuri = 'index:' + self.name + ':indx1' + hexopt = ['-x'] if self.hex == 1 else [] + self.runWt(['-h', self.dir, 'dump'] + hexopt + [indexuri], + outfilename='dumpidx.out') + self.check_non_empty_file('dumpidx.out') + self.compare_dump_values('dump.out', 'dumpidx.out') + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_shared_cache.py b/test/suite/test_shared_cache01.py index ff40d31e6df..e6d712e61bc 100644 --- a/test/suite/test_shared_cache.py +++ b/test/suite/test_shared_cache01.py @@ -33,12 +33,12 @@ import wiredtiger, wttest from wttest import unittest from helper import key_populate, simple_populate -# test_shared_cache.py +# test_shared_cache01.py # Checkpoint tests # Test shared cache shared amongst multiple connections. -class test_shared_cache(wttest.WiredTigerTestCase): +class test_shared_cache01(wttest.WiredTigerTestCase): - uri = 'table:test_shared_cache' + uri = 'table:test_shared_cache01' # Setup fairly large items to use up cache data_str = 'abcdefghijklmnopqrstuvwxyz' * 20 @@ -89,7 +89,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.sessions = [] # Implicitly closed when closing sessions. 
# Basic test of shared cache - def test_shared_cache01(self): + def test_shared_cache_basic(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -99,7 +99,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Test of shared cache with more connections - def test_shared_cache02(self): + def test_shared_cache_more_connections(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2', 'WT_TEST3', 'WT_TEST4']) @@ -109,7 +109,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Do enough work for the shared cache to be fully allocated. - def test_shared_cache03(self): + def test_shared_cache_full(self): nops = 10000 self.openConnections(['WT_TEST1', 'WT_TEST2']) for sess in self.sessions: @@ -121,7 +121,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Switch the work between connections, to test rebalancing. - def test_shared_cache04(self): + def test_shared_cache_rebalance(self): # About 100 MB of data with ~250 byte values. nops = 200000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -132,7 +132,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Add a new connection once the shared cache is already established. - def test_shared_cache05(self): + def test_shared_cache_late_join(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -147,7 +147,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Close a connection and keep using other connections. 
- def test_shared_cache06(self): + def test_shared_cache_leaving(self): nops = 10000 self.openConnections(['WT_TEST1', 'WT_TEST2', 'WT_TEST3']) @@ -163,7 +163,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): # Test verbose output @unittest.skip("Verbose output handling") - def test_shared_cache07(self): + def test_shared_cache_verbose(self): nops = 1000 self.openConnections( ['WT_TEST1', 'WT_TEST2'], extra_opts="verbose=[shared_cache]") @@ -174,7 +174,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Test opening a connection outside of the shared cache - def test_shared_cache08(self): + def test_shared_cache_mixed(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -185,7 +185,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Test default config values - def test_shared_cache09(self): + def test_shared_cache_defaults(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=(name=pool,size=200M)') @@ -194,21 +194,8 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.add_records(sess, 0, nops) self.closeConnections() - # Test reconfigure API - def test_shared_cache10(self): - nops = 1000 - self.openConnections(['WT_TEST1', 'WT_TEST2']) - - for sess in self.sessions: - sess.create(self.uri, "key_format=S,value_format=S") - self.add_records(sess, 0, nops) - - connection = self.conns[0] - connection.reconfigure("shared_cache=(name=pool,size=300M)") - self.closeConnections() - # Test default config values - def test_shared_cache11(self): + def test_shared_cache_defaults2(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=(name=pool)') diff --git a/test/suite/test_shared_cache02.py b/test/suite/test_shared_cache02.py new file mode 100644 index 00000000000..3806e9d0cda --- /dev/null +++ b/test/suite/test_shared_cache02.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, 
Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# If unittest2 is available, use it in preference to (the old) unittest + +import os +import shutil +import wiredtiger, wttest +from wttest import unittest +from helper import key_populate, simple_populate + +# test_shared_cache02.py +# Shared cache tests +# Test shared cache shared amongst multiple connections. 
+class test_shared_cache02(wttest.WiredTigerTestCase): + + uri = 'table:test_shared_cache02' + # Setup fairly large items to use up cache + data_str = 'abcdefghijklmnopqrstuvwxyz' * 20 + + # Add a set of records + def add_records(self, session, start, stop): + cursor = session.open_cursor(self.uri, None, "overwrite") + for i in range(start, stop+1): + cursor.set_key("%010d KEY------" % i) + cursor.set_value("%010d VALUE "% i + self.data_str) + self.assertEqual(cursor.insert(), 0) + cursor.close() + + # Disable default setup/shutdown steps - connections are managed manually. + def setUpSessionOpen(self, conn): + return None + + def close_conn(self): + return None + + def setUpConnectionOpen(self, dir): + return None + + def openConnections( + self, + connections, + pool_opts = ',shared_cache=(name=pool,size=200M,chunk=10M,reserve=30M),', + extra_opts = '', + add=0): + if add == 0: + self.conns = [] + self.sessions = [] + # Open the set of connections. + for name in connections: + shutil.rmtree(name, True) + os.mkdir(name) + next_conn = wiredtiger.wiredtiger_open( + name, + 'create,error_prefix="' + self.shortid() + ': "' + + pool_opts + extra_opts) + self.conns.append(next_conn) + self.sessions.append(next_conn.open_session(None)) + return None + + def closeConnections(self): + for tmp_conn in self.conns: + tmp_conn.close() + self.conns = [] + self.sessions = [] # Implicitly closed when closing sessions. 
+ + # Test reconfigure API + def test_shared_cache_reconfig01(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2']) + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + connection.reconfigure("shared_cache=(name=pool,size=300M)") + self.closeConnections() + + # Test reconfigure that grows the usage over quota fails + def test_shared_cache_reconfig02(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + # Reconfigure to over-subscribe, call should fail with an error + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: connection.reconfigure("shared_cache=(name=pool,reserve=40M)"), + '/Shared cache unable to accommodate this configuration/') + # TODO: Ensure that the reserve size wasn't updated. + # cursor = self.sessions[0].open_cursor('config:', None, None) + # value = cursor['connection'] + # self.assertTrue(value.find('reserve') != -1) + + self.closeConnections() + + # Test reconfigure that would grow the usage over quota if the + # previous reserve size isn't taken into account + def test_shared_cache_reconfig03(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + + connection.reconfigure("shared_cache=(name=pool,reserve=30M)"), + + # TODO: Ensure that the reserve size was updated. 
+ # cursor = self.sessions[0].open_cursor('config:', None, None) + # value = cursor['connection'] + # self.assertTrue(value.find('reserve') != -1) + + self.closeConnections() + + # Test reconfigure that switches to using a shared cache + # previous reserve size isn't taken into account + def test_shared_cache_reconfig03(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts = ',') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + self.conns[0].reconfigure("shared_cache=(name=pool,reserve=20M)"), + self.conns[1].reconfigure("shared_cache=(name=pool,reserve=20M)"), + + # TODO: Ensure that the reserve size was updated. + # cursor = self.sessions[0].open_cursor('config:', None, None) + # value = cursor['connection'] + # self.assertTrue(value.find('reserve') != -1) + + self.closeConnections() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_stat01.py b/test/suite/test_stat01.py index 0f072a7c473..0b778d63b9d 100644 --- a/test/suite/test_stat01.py +++ b/test/suite/test_stat01.py @@ -28,6 +28,8 @@ import helper, wiredtiger, wttest from wiredtiger import stat +from helper import key_populate, simple_populate +from wtscenario import multiply_scenarios, number_scenarios # test_stat01.py # Statistics operations @@ -36,17 +38,23 @@ class test_stat01(wttest.WiredTigerTestCase): Test statistics """ - tablename = 'test_stat01.wt' - uri = 'file:' + tablename - config = 'key_format=S,' +\ - 'allocation_size=512,internal_page_max=16K,leaf_page_max=128K' + config = 'internal_page_max=4K,leaf_page_max=8K' nentries = 25 + types = [ + ('file', dict(uri='file:test_stat01.wt')), + ('table', dict(uri='table:test_stat01.wt')) + ] + keyfmt = [ + ('recno', dict(keyfmt='r')), + ('string', dict(keyfmt='S')), + ] + scenarios = number_scenarios(multiply_scenarios('.', types, keyfmt)) + # Override WiredTigerTestCase, we have extensions. 
def setUpConnectionOpen(self, dir): conn = wiredtiger.wiredtiger_open(dir, - 'create,statistics=(fast),' + - 'error_prefix="%s: "' % self.shortid()) + 'create,statistics=(all),' + 'error_prefix="%s: "' % self.shortid()) return conn def statstr_to_int(self, str): @@ -57,17 +65,17 @@ class test_stat01(wttest.WiredTigerTestCase): parts = str.rpartition('(') return int(parts[2].rstrip(')')) - def check_stats(self, statcursor, mincount, lookfor): - """ - Do a quick check of the entries in the the stats cursor, - There should be at least 'mincount' entries, - and the 'lookfor' string should appear - """ + # Do a quick check of the entries in the the stats cursor, the "lookfor" + # string should appear with a minimum value of least "min". + def check_stats(self, statcursor, min, lookfor): stringclass = ''.__class__ intclass = (0).__class__ - # make sure statistics basically look right - count = 0 + + # Reset the cursor, we're called multiple times. + statcursor.reset() + found = False + foundval = 0 for id, desc, valstr, val in statcursor: self.assertEqual(type(desc), stringclass) self.assertEqual(type(valstr), stringclass) @@ -75,68 +83,76 @@ class test_stat01(wttest.WiredTigerTestCase): self.assertEqual(val, self.statstr_to_int(valstr)) self.printVerbose(2, ' stat: \'' + desc + '\', \'' + valstr + '\', ' + str(val)) - count += 1 if desc == lookfor: found = True - self.assertTrue(count > mincount) + foundval = val + self.assertTrue(found, 'in stats, did not see: ' + lookfor) + self.assertTrue(foundval >= min) + # Test simple connection statistics. def test_basic_conn_stats(self): - self.printVerbose(2, 'overall database stats:') + # Build an object and force some writes. + config = self.config + ',key_format=' + self.keyfmt + simple_populate(self, self.uri, config, 1000) + self.session.checkpoint(None) + + # See that we can get a specific stat value by its key and verify its + # entry is self-consistent. 
allstat_cursor = self.session.open_cursor('statistics:', None, None) self.check_stats(allstat_cursor, 10, 'block-manager: blocks written') - # See that we can get a specific stat value by its key, - # and verify that its entry is self-consistent values = allstat_cursor[stat.conn.block_write] self.assertEqual(values[0], 'block-manager: blocks written') val = self.statstr_to_int(values[1]) self.assertEqual(val, values[2]) allstat_cursor.close() + # Test simple object statistics. def test_basic_data_source_stats(self): - self.session.create(self.uri, self.config) + # Build an object. + config = self.config + ',key_format=' + self.keyfmt + self.session.create(self.uri, config) cursor = self.session.open_cursor(self.uri, None, None) value = "" - for i in range(0, self.nentries): - key = str(i) - value = value + key + value # size grows exponentially - cursor.set_key(key) + for i in range(1, self.nentries): + value = value + 1000 * "a" + cursor.set_key(key_populate(cursor, i)) cursor.set_value(value) cursor.insert() cursor.close() - self.printVerbose(2, 'data source specific stats:') - cursor = self.session.open_cursor( - 'statistics:' + self.uri, None, None) + # Force the object to disk, otherwise we can't check the overflow count. + self.reopen_conn() + + # See that we can get a specific stat value by its key and verify its + # entry is self-consistent. 
+ cursor = self.session.open_cursor('statistics:' + self.uri, None, None) + self.check_stats(cursor, 8192, 'btree: maximum leaf page size') + self.check_stats(cursor, 4096, 'btree: maximum internal page size') self.check_stats(cursor, 10, 'btree: overflow pages') - # See that we can get a specific stat value by its key, - # and verify that its entry is self-consistent values = cursor[stat.dsrc.btree_overflow] self.assertEqual(values[0], 'btree: overflow pages') val = self.statstr_to_int(values[1]) self.assertEqual(val, values[2]) cursor.close() - def test_missing_file_stats(self): - self.assertRaises(wiredtiger.WiredTigerError, lambda: - self.session.open_cursor('statistics:file:DoesNotExist')) - + # Test simple per-checkpoint statistics. def test_checkpoint_stats(self): - nentries = 0 - last_size = 0 for name in ('first', 'second', 'third'): - helper.simple_populate(self, self.uri, self.config, nentries) - nentries += self.nentries + config = self.config + ',key_format=' + self.keyfmt + helper.simple_populate(self, self.uri, config, self.nentries) self.session.checkpoint('name=' + name) cursor = self.session.open_cursor( 'statistics:' + self.uri, None, 'checkpoint=' + name) - size = cursor[stat.dsrc.btree_overflow][1] - self.assertTrue(size >= last_size) - last_size = size + self.assertEqual( + cursor[stat.dsrc.btree_entries][2], self.nentries + 1) cursor.close() - self.session.truncate(self.uri, None, None) + + def test_missing_file_stats(self): + self.assertRaises(wiredtiger.WiredTigerError, lambda: + self.session.open_cursor('statistics:file:DoesNotExist')) if __name__ == '__main__': wttest.run() |