summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2015-01-16 15:23:00 -0500
committerKeith Bostic <keith@wiredtiger.com>2015-01-16 15:23:00 -0500
commit51a92facb691706bee4b6c573e8bda070a62351d (patch)
tree42bb9c6f5e16d4661cdf1c143e88339ecd94dff4 /src
parent24ca383872e0512a3ae54efd9f4f2de29eac0d23 (diff)
parent38b6b25fb7e825b234a17ad1fb9269c5f48cb129 (diff)
downloadmongo-51a92facb691706bee4b6c573e8bda070a62351d.tar.gz
Merge branch 'develop' into cursor-reconfigure
Diffstat (limited to 'src')
-rw-r--r--src/btree/bt_debug.c21
-rw-r--r--src/btree/bt_delete.c10
-rw-r--r--src/btree/bt_page.c15
-rw-r--r--src/btree/bt_split.c305
-rw-r--r--src/btree/bt_walk.c9
-rw-r--r--src/conn/conn_log.c105
-rw-r--r--src/conn/conn_open.c3
-rw-r--r--src/docs/images/wtstats.pngbin0 -> 128334 bytes
-rw-r--r--src/docs/performance.dox3
-rw-r--r--src/docs/spell.ok2
-rw-r--r--src/docs/statistics.dox4
-rw-r--r--src/docs/wtstats.dox47
-rw-r--r--src/evict/evict_lru.c13
-rw-r--r--src/include/btmem.h5
-rw-r--r--src/include/btree.i97
-rw-r--r--src/include/cache.i59
-rw-r--r--src/include/connection.h12
-rw-r--r--src/include/cursor.i12
-rw-r--r--src/include/flags.h19
-rw-r--r--src/include/stat.h1
-rw-r--r--src/include/txn.i10
-rw-r--r--src/include/wiredtiger.in202
-rw-r--r--src/include/wt_internal.h2
-rw-r--r--src/log/log.c37
-rw-r--r--src/lsm/lsm_cursor.c2
-rw-r--r--src/session/session_api.c7
-rw-r--r--src/support/stat.c3
-rw-r--r--src/txn/txn.c9
-rw-r--r--src/txn/txn_log.c14
29 files changed, 673 insertions, 355 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 4de94277364..af9f6a669f2 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -408,11 +408,13 @@ __debug_tree_shape_info(WT_PAGE *page)
v = page->memory_footprint;
if (v >= WT_GIGABYTE)
- snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE);
+ snprintf(buf, sizeof(buf),
+ "(%p %" PRIu64 "G)", page, v / WT_GIGABYTE);
else if (v >= WT_MEGABYTE)
- snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE);
+ snprintf(buf, sizeof(buf),
+ "(%p %" PRIu64 "M)", page, v / WT_MEGABYTE);
else
- snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v);
+ snprintf(buf, sizeof(buf), "(%p %" PRIu64 ")", page, v);
return (buf);
}
@@ -429,16 +431,16 @@ __debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level)
session = ds->session;
if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) {
- __dmsg(ds, "%*s" "I" "%s\n",
- level, " ", __debug_tree_shape_info(page));
+ __dmsg(ds, "%*s" "I" "%d %s\n",
+ level * 3, " ", level, __debug_tree_shape_info(page));
WT_INTL_FOREACH_BEGIN(session, page, ref) {
if (ref->state == WT_REF_MEM)
__debug_tree_shape_worker(
- ds, ref->page, level + 3);
+ ds, ref->page, level + 1);
} WT_INTL_FOREACH_END;
} else
- __dmsg(ds, "%*s" "L" "%s\n",
- level, " ", __debug_tree_shape_info(page));
+ __dmsg(ds, "%*s" "L" " %s\n",
+ level * 3, " ", __debug_tree_shape_info(page));
}
/*
@@ -458,8 +460,7 @@ __wt_debug_tree_shape(
if (page == NULL)
page = S2BT(session)->root.page;
- WT_WITH_PAGE_INDEX(session,
- __debug_tree_shape_worker(ds, page, 0));
+ WT_WITH_PAGE_INDEX(session, __debug_tree_shape_worker(ds, page, 1));
__dmsg_wrapup(ds);
return (0);
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index c97ea176c97..622dfb1b294 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -207,6 +207,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
int skip;
+ if (ref->state != WT_REF_DELETED)
+ return (0);
+
/*
* Deleted pages come from two sources: either it's a fast-delete as
* described above, or the page has been emptied by other operations
@@ -225,11 +228,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* the page could switch to an in-memory state at any time. Lock down
* the structure, just to be safe.
*/
+ if (ref->page_del == NULL)
+ return (1);
+
if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
return (0);
- skip = ref->page_del == NULL ||
- __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
+ skip = (ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid));
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (skip);
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 181ffdb3736..561e1c19218 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -37,8 +37,11 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
page->type != WT_PAGE_ROW_LEAF)
return (0);
- /* Eviction may be turned off, although that's rare. */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ /*
+ * Eviction may be turned off (although that's rare), or we may be in
+ * the middle of a checkpoint.
+ */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || btree->checkpointing)
return (0);
/*
@@ -128,7 +131,13 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
force_attempts < 10 &&
__evict_force_check(session, page)) {
++force_attempts;
- WT_RET(__wt_page_release(session, ref, flags));
+ if ((ret = __wt_page_release_busy(
+ session, ref, flags)) == EBUSY) {
+ /* If forced eviction fails, stall. */
+ ret = 0;
+ wait_cnt += 1000;
+ } else
+ WT_RET(ret);
WT_STAT_FAST_CONN_INCR(
session, page_forcible_evict_blocked);
break;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 911a38e4be6..69dbfb42354 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -9,15 +9,6 @@
#include "wt_internal.h"
/*
- * Tuning; global variables to allow the binary to be patched, we don't yet have
- * any real understanding of what might be useful to surface to applications.
- */
-static u_int __split_deepen_max_internal_image = 100;
-static u_int __split_deepen_min_child = 10;
-static u_int __split_deepen_per_child = 100;
-static u_int __split_deepen_split_child = 100;
-
-/*
* Track allocation increments, matching the cache calculations, which add an
* estimate of allocation overhead to every object.
*/
@@ -177,45 +168,57 @@ __split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s)
}
/*
+ * Tuning; global variables to allow the binary to be patched, we don't yet have
+ * any real understanding of what might be useful to surface to applications.
+ */
+static u_int __split_deepen_min_child = 10000;
+static u_int __split_deepen_per_child = 100;
+
+/*
* __split_should_deepen --
* Return if we should deepen the tree.
*/
static int
-__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
+__split_should_deepen(
+ WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp)
{
WT_PAGE_INDEX *pindex;
+ WT_PAGE *page;
- /*
- * Splits are based on either the number of child pages that will be
- * created by the split (splitting an internal page that will be slow
- * to search), or by the memory footprint of the parent page (avoiding
- * an internal page that will eat up all of the cache and put eviction
- * pressure on the system).
- */
+ *childrenp = 0;
+
+ page = ref->page;
pindex = WT_INTL_INDEX_COPY(page);
/*
* Deepen the tree if the page's memory footprint is larger than the
- * maximum size for a page in memory. We need an absolute minimum
- * number of entries in order to split the page: if there is a single
- * huge key, splitting won't help.
+ * maximum size for a page in memory (presumably putting eviction
+ * pressure on the cache).
*/
- if (page->memory_footprint > S2BT(session)->maxmempage &&
- pindex->entries >= __split_deepen_min_child)
- return (1);
+ if (page->memory_footprint < S2BT(session)->maxmempage)
+ return (0);
/*
- * Deepen the tree if the page's memory footprint is at least N
- * times the maximum internal page size chunk in the backing file and
- * the split will result in at least N children in the newly created
- * intermediate layer.
+ * Ensure the page has enough entries to make it worth splitting and
+ * we get a significant payback (in the case of a set of large keys,
+ * splitting won't help).
*/
- if (page->memory_footprint >
- __split_deepen_max_internal_image * S2BT(session)->maxintlpage &&
- pindex->entries >=
- (__split_deepen_per_child * __split_deepen_split_child))
+ if (pindex->entries > __split_deepen_min_child) {
+ *childrenp = pindex->entries / __split_deepen_per_child;
return (1);
+ }
+ /*
+ * The root is a special-case: if it's putting cache pressure on the
+ * system, split it even if there are only a few entries, we can't
+ * push it out of memory. Sanity check: if the root page is too big
+ * with less than 100 keys, there are huge keys and/or a too-small
+ * cache, there's not much to do.
+ */
+ if (__wt_ref_is_root(ref) && pindex->entries > 100) {
+ *childrenp = pindex->entries / 10;
+ return (1);
+ }
return (0);
}
@@ -254,12 +257,13 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
}
/*
- * __split_ref_instantiate --
- * Instantiate key/address pairs in memory in service of a split.
+ * __split_ref_deepen_move --
+ * Move a WT_REF from a parent to a child in service of a split to deepen
+ * the tree, including updating the accounting information.
*/
static int
-__split_ref_instantiate(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+__split_ref_deepen_move(WT_SESSION_IMPL *session,
+ WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
WT_ADDR *addr;
WT_CELL_UNPACK unpack;
@@ -276,8 +280,6 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
* of child pages, and so we can no longer reference the block image
* that remains with the page being split.
*
- * Track how much memory the parent is losing and the child gaining.
- *
* No locking is required to update the WT_REF structure because we're
* the only thread splitting the parent page, and there's no way for
* readers to race with our updates of single pointers. The changes
@@ -286,13 +288,13 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
*
* Row-store keys, first.
*/
- if (page->type == WT_PAGE_ROW_INT) {
+ if (parent->type == WT_PAGE_ROW_INT) {
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
- __wt_ref_key(page, ref, &key, &size);
+ __wt_ref_key(parent, ref, &key, &size);
WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
ref->key.ikey = ikey;
} else {
- WT_RET(__split_ovfl_key_cleanup(session, page, ref));
+ WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
WT_MEMSIZE_ADD(*parent_decrp,
sizeof(WT_IKEY) + ikey->size);
}
@@ -304,12 +306,8 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
* address has been instantiated, there's no work to do. Otherwise,
* get the address from the on-page cell.
*/
- if ((addr = ref->addr) == NULL)
- return (0);
- if (__wt_off_page(page, addr))
- WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp,
- sizeof(WT_ADDR) + addr->size);
- else {
+ addr = ref->addr;
+ if (addr != NULL && !__wt_off_page(parent, addr)) {
__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
@@ -321,8 +319,11 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
addr->type =
unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
ref->addr = addr;
- WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size);
}
+
+ /* And finally, the WT_REF itself. */
+ WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));
+
return (0);
}
@@ -383,7 +384,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
* Split an internal page in-memory, deepening the tree.
*/
static int
-__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
{
WT_DECL_RET;
WT_PAGE *child;
@@ -391,7 +392,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
WT_REF **alloc_refp;
WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
size_t child_incr, parent_decr, parent_incr, size;
- uint32_t children, chunk, i, j, remain, slots;
+ uint32_t chunk, i, j, remain, slots;
int panic;
void *p;
@@ -401,13 +402,6 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
pindex = WT_INTL_INDEX_COPY(parent);
- /*
- * Create N children, unless we are dealing with a large page without
- * many entries, in which case split into the minimum number of pages.
- */
- children = WT_MAX(pindex->entries / __split_deepen_per_child,
- __split_deepen_min_child);
-
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
@@ -506,12 +500,9 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
child_incr = 0;
child_pindex = WT_INTL_INDEX_COPY(child);
for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
- WT_ERR(__split_ref_instantiate(session,
+ WT_ERR(__split_ref_deepen_move(session,
parent, *parent_refp, &parent_decr, &child_incr));
*child_refp++ = *parent_refp++;
-
- WT_MEMSIZE_TRANSFER(
- parent_decr, child_incr, sizeof(WT_REF));
}
__wt_cache_page_inmem_incr(session, child, child_incr);
}
@@ -604,9 +595,10 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
* be using the new index.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_MEMSIZE_ADD(parent_decr, size);
WT_ERR(__split_safe_free(session, 0, pindex, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+#if 0
/*
* Adjust the parent's memory footprint. This may look odd, but we
* have already taken the allocation overhead into account, and an
@@ -615,6 +607,19 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
*/
__wt_cache_page_inmem_incr(session, parent, parent_incr);
__wt_cache_page_inmem_decr(session, parent, parent_decr);
+#else
+ /*
+ * XXX
+ * The code to track page sizes is fundamentally flawed in the face of
+ * splits: for example, we don't add in an overhead allocation constant
+ * when allocating WT_REF structures as pages are created, but the
+ * calculations during split assume that correction. For now, ignore
+ * our carefully calculated values and force the internal page size to
+ * 5% of its current value.
+ */
+ size = parent->memory_footprint - (parent->memory_footprint / 20);
+ __wt_cache_page_inmem_decr(session, parent, size);
+#endif
if (0) {
err: __wt_free_ref_index(session, parent, alloc_index, 1);
@@ -770,13 +775,11 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
* the confusion.
*/
WT_RET(__wt_calloc_one(session, &addr));
- WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR));
ref->addr = addr;
addr->size = multi->addr.size;
addr->type = multi->addr.type;
WT_RET(__wt_strndup(session,
multi->addr.addr, addr->size, &addr->addr));
- WT_MEMSIZE_ADD(incr, addr->size);
} else
WT_RET(__split_multi_inmem(session, page, ref, multi));
@@ -814,17 +817,20 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
int exclusive, int ref_discard)
{
WT_DECL_RET;
+ WT_IKEY *ikey;
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
- WT_REF **alloc_refp, *parent_ref;
+ WT_REF **alloc_refp, *next_ref, *parent_ref;
size_t size;
- uint32_t i, j, parent_entries, result_entries;
+ uint32_t children, i, j;
+ uint32_t deleted_entries, parent_entries, result_entries;
int complete, hazard, locked;
parent = NULL; /* -Wconditional-uninitialized */
- alloc_index = NULL;
+ alloc_index = pindex = NULL;
parent_ref = NULL;
complete = hazard = locked = 0;
+ parent_entries = 0;
/*
* Get a page-level lock on the parent to single-thread splits into the
@@ -865,7 +871,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
pindex = WT_INTL_INDEX_COPY(parent);
parent_entries = pindex->entries;
- result_entries = (parent_entries - 1) + new_entries;
+
+ /*
+ * Remove any refs to deleted pages while we are splitting, we have
+ * the internal page locked down, and are copying the refs into a new
+ * array anyway. Switch them to the special split state, so that any
+ * reading thread will restart.
+ */
+ for (i = 0, deleted_entries = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+ if (next_ref->state == WT_REF_DELETED &&
+ next_ref->page_del == NULL &&
+ WT_ATOMIC_CAS4(next_ref->state,
+ WT_REF_DELETED, WT_REF_SPLIT))
+ deleted_entries++;
+ }
+
+ /*
+ * The final entry count consists of: The original count, plus any
+ * new pages, less any refs we are removing because they only
+ * contained deleted items, less 1 for the page being replaced.
+ */
+ result_entries = (parent_entries + new_entries) - (deleted_entries + 1);
/*
* Allocate and initialize a new page index array for the parent, then
@@ -877,8 +905,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_MEMSIZE_ADD(parent_incr, size);
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i)
- if (pindex->index[i] == ref)
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
*alloc_refp++ = ref_new[j];
@@ -890,8 +919,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
ref_new[j] = NULL;
}
- else
- *alloc_refp++ = pindex->index[i];
+ else if (next_ref->state != WT_REF_SPLIT)
+ /* Skip refs we have marked for deletion. */
+ *alloc_refp++ = next_ref;
+ }
/*
* Update the parent page's index: this update makes the split visible
@@ -926,6 +957,36 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
complete = 1;
/*
+ * Now that the new page is in place it's OK to free any deleted
+ * refs we encountered modulo the regular safe free semantics.
+ */
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ /* If we set the ref to split to mark it for delete */
+ if (next_ref != ref && next_ref->state == WT_REF_SPLIT) {
+ /*
+ * We're discarding a deleted reference.
+ * Free any resources it holds.
+ */
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(
+ session, parent, next_ref));
+ ikey = __wt_ref_key_instantiated(next_ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(
+ session, 0, ikey, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ }
+ }
+
+ WT_TRET(__split_safe_free(
+ session, 0, next_ref, sizeof(WT_REF)));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+ }
+ }
+
+ /*
* We can't free the previous page index, there may be threads using it.
* Add it to the session discard list, to be freed when it's safe.
*/
@@ -978,11 +1039,30 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Do the check here because we've just grown the parent page and
* are holding it locked.
*/
- if (ret == 0 && !exclusive && __split_should_deepen(session, parent))
+ if (ret == 0 && !exclusive &&
+ !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) &&
+ __split_should_deepen(session, parent_ref, &children)) {
+ /*
+ * XXX
+ * Temporary hack to avoid a bug where the root page is split
+ * even when it's no longer doing any good.
+ */
+ uint64_t __a, __b;
+ __a = parent->memory_footprint;
WT_WITH_PAGE_INDEX(session,
- ret = __split_deepen(session, parent));
+ ret = __split_deepen(session, parent, children));
+ __b = parent->memory_footprint;
+ if (__b * 2 >= __a)
+ F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN);
+ }
-err: if (locked)
+err: if (!complete)
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state == WT_REF_SPLIT)
+ next_ref->state = WT_REF_DELETED;
+ }
+ if (locked)
F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
if (hazard)
@@ -1018,15 +1098,16 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_PAGE *page, *right;
WT_REF *child, *split_ref[2] = { NULL, NULL };
WT_UPDATE *upd;
- size_t page_decr, parent_incr, right_incr, size;
+ size_t page_decr, parent_decr, parent_incr, right_incr;
int i;
*splitp = 0;
btree = S2BT(session);
page = ref->page;
+ ikey = NULL;
right = NULL;
- page_decr = parent_incr = right_incr = 0;
+ page_decr = parent_decr = parent_incr = right_incr = 0;
/*
* Check for pages with append-only workloads. A common application
@@ -1127,9 +1208,19 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_ERR(__wt_row_ikey(session, 0,
WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
&child->key.ikey));
+
+ /*
+ * We're swapping WT_REFs in the parent, adjust the accounting, and
+ * row store pages may have instantiated keys.
+ */
WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF));
- WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY));
- WT_MEMSIZE_ADD(parent_incr, WT_INSERT_KEY_SIZE(moved_ins));
+ WT_MEMSIZE_ADD(
+ parent_incr, sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+ if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT)
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ WT_MEMSIZE_ADD(
+ parent_decr, sizeof(WT_IKEY) + ikey->size);
/* The new page is dirty by definition. */
WT_ERR(__wt_page_modify_init(session, right));
@@ -1151,14 +1242,11 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
*/
for (i = 0; i < WT_SKIP_MAXDEPTH && ins_head->tail[i] == moved_ins; ++i)
;
- size = ((size_t)i - 1) * sizeof(WT_INSERT *);
- size += sizeof(WT_INSERT) + WT_INSERT_KEY_SIZE(moved_ins);
+ WT_MEMSIZE_TRANSFER(page_decr, right_incr, sizeof(WT_INSERT) +
+ (size_t)i * sizeof(WT_INSERT *) + WT_INSERT_KEY_SIZE(moved_ins));
for (upd = moved_ins->upd; upd != NULL; upd = upd->next)
- size += sizeof(WT_UPDATE) + upd->size;
- WT_MEMSIZE_ADD(right_incr, size);
- WT_MEMSIZE_ADD(page_decr, size);
- __wt_cache_page_inmem_decr(session, page, page_decr);
- __wt_cache_page_inmem_incr(session, right, right_incr);
+ WT_MEMSIZE_TRANSFER(
+ page_decr, right_incr, sizeof(WT_UPDATE) + upd->size);
/*
* Allocation operations completed, move the last insert list item from
@@ -1245,10 +1333,23 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
#endif
/*
- * Split into the parent.
+ * Save the transaction ID when the split happened. Application
+ * threads will not try to forcibly evict the page again until
+ * all concurrent transactions commit.
+ */
+ page->modify->inmem_split_txn = __wt_txn_new_id(session);
+
+ /* Update the page accounting. */
+ __wt_cache_page_inmem_decr(session, page, page_decr);
+ __wt_cache_page_inmem_incr(session, right, right_incr);
+
+ /*
+ * Split into the parent. After this, the original page is no
+ * longer locked, so we cannot safely look at it.
*/
+ page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, 0, parent_incr, 0, 0)) != 0) {
+ session, ref, split_ref, 2, parent_decr, parent_incr, 0, 0)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1271,13 +1372,6 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
WT_ERR(ret);
}
- /*
- * Save the transaction ID when the split happened. Application
- * threads will not try to forcibly evict the page again until
- * all concurrent transactions commit.
- */
- page->modify->inmem_split_txn = __wt_txn_new_id(session);
-
/* Let our caller know that we split. */
*splitp = 1;
@@ -1289,13 +1383,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
* structure and instantiated key, there may be threads using them.
* Add them to the session discard list, to be freed once we know it's
* safe.
- *
- * After the split, we're going to discard the WT_REF, account for the
- * change in memory footprint. Row store pages have keys that may be
- * instantiated, check for that.
*/
- if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) &&
- (ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ if (ikey != NULL)
WT_TRET(__split_safe_free(
session, 0, ikey, sizeof(WT_IKEY) + ikey->size));
WT_TRET(__split_safe_free(session, 0, ref, sizeof(WT_REF)));
@@ -1380,7 +1469,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_REF **ref_new;
- size_t ikey_size, parent_decr, parent_incr;
+ size_t parent_decr, parent_incr;
uint32_t i, new_entries;
page = ref->page;
@@ -1388,7 +1477,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
new_entries = mod->mod_multi_entries;
ikey = NULL;
- ikey_size = parent_decr = parent_incr = 0;
+ parent_decr = parent_incr = 0;
/*
* Convert the split page's multiblock reconciliation information into
@@ -1404,12 +1493,11 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
* change in memory footprint. Row store pages have keys that may be
* instantiated, check for that.
*/
- if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) &&
- (ikey = __wt_ref_key_instantiated(ref)) != NULL) {
- ikey_size = sizeof(WT_IKEY) + ikey->size;
- WT_MEMSIZE_ADD(parent_decr, ikey_size);
- }
WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+ if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT)
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ WT_MEMSIZE_ADD(
+ parent_decr, sizeof(WT_IKEY) + ikey->size);
/* Split into the parent. */
WT_ERR(__split_parent(session,
@@ -1436,7 +1524,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
* safe.
*/
if (ikey != NULL)
- WT_TRET(__split_safe_free(session, exclusive, ikey, ikey_size));
+ WT_TRET(__split_safe_free(
+ session, exclusive, ikey, sizeof(WT_IKEY) + ikey->size));
WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF)));
/*
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index c74a7177401..a2b2a6bb7c8 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -208,6 +208,12 @@ restart: /*
break;
} else if (LF_ISSET(WT_READ_TRUNCATE)) {
/*
+ * Avoid pulling a deleted page back in to try
+ * to delete it again.
+ */
+ if (__wt_delete_page_skip(session, ref))
+ break;
+ /*
* If deleting a range, try to delete the page
* without instantiating it.
*/
@@ -242,8 +248,7 @@ restart: /*
* If iterating a cursor, try to skip deleted
* pages that are visible to us.
*/
- if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ if (__wt_delete_page_skip(session, ref))
break;
}
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 2799a58f327..796b7d5147b 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -126,11 +126,13 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
/*
* If we're coming from a backup cursor we want the smaller of
* the last full log file copied in backup or the checkpoint LSN.
+ * Otherwise we want the minimum of the last log file written to
+ * disk and the checkpoint LSN.
*/
if (backup_file != 0)
min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file);
else
- min_lognum = log->ckpt_lsn.file;
+ min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file);
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"log_archive: archive to log number %" PRIu32, min_lognum));
@@ -276,6 +278,70 @@ err:
}
/*
+ * __log_close_server --
+ * The log close server thread.
+ */
+static void *
+__log_close_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *close_fh;
+ WT_LOG *log;
+ WT_LSN close_end_lsn, close_lsn;
+ WT_SESSION_IMPL *session;
+ int locked;
+
+ session = arg;
+ conn = S2C(session);
+ log = conn->log;
+ locked = 0;
+ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
+ /*
+ * If there is a log file to close, fsync and close it.
+ */
+ if ((close_fh = log->log_close_fh) != NULL) {
+ /*
+ * We've copied the file handle, clear out the one in
+ * log structure to allow it to be set again.
+ */
+ log->log_close_fh = NULL;
+ /*
+ * Set the close_end_lsn to the LSN immediately after
+ * ours. That is, the beginning of the next log file.
+ * We need to know the LSN file number of our own close
+ * in case earlier calls are still in progress and the
+ * next one to move the sync_lsn into the next file for
+ * later syncs.
+ */
+ WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
+ &close_lsn.file));
+ close_lsn.offset = 0;
+ close_end_lsn = close_lsn;
+ close_end_lsn.file++;
+ WT_ERR(__wt_fsync(session, close_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ WT_ERR(__wt_close(session, close_fh));
+ log->sync_lsn = close_end_lsn;
+ WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ } else
+ /* Wait until the next event. */
+ WT_ERR(__wt_cond_wait(session,
+ conn->log_close_cond, 10000));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "log close server error");
+ }
+ if (locked)
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ return (NULL);
+}
+
+/*
* __log_server --
* The log server thread.
*/
@@ -292,7 +358,7 @@ __log_server(void *arg)
conn = S2C(session);
log = conn->log;
locked = 0;
- while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
* Perform log pre-allocation.
*/
@@ -320,7 +386,7 @@ __log_server(void *arg)
}
if (0) {
-err: __wt_err(session, ret, "log archive server error");
+err: __wt_err(session, ret, "log server error");
}
if (locked)
(void)__wt_writeunlock(session, log->log_archive_lock);
@@ -384,7 +450,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
/*
* __wt_logmgr_open --
- * Start the log subsystem and archive server thread.
+ * Start the log service threads.
*/
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
@@ -394,14 +460,33 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
conn = S2C(session);
/* If no log thread services are configured, we're done. */
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
- !FLD_ISSET(conn->log_flags,
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ return (0);
+
+ /*
+ * Start the log close thread. It is not configurable.
+ * If logging is enabled, this thread runs.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "log-close-server", 0, 0, &conn->log_close_session));
+ WT_RET(__wt_cond_alloc(conn->log_close_session,
+ "log close server", 0, &conn->log_close_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(conn->log_close_session,
+ &conn->log_close_tid, __log_close_server, conn->log_close_session));
+ conn->log_close_tid_set = 1;
+
+ /* If no log thread services are configured, we're done. */
+ if (!FLD_ISSET(conn->log_flags,
(WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
return (0);
/*
* If a log server thread exists, the user may have reconfigured
- * archiving ore pre-allocation. Signal the thread. Otherwise the
+ * archiving or pre-allocation. Signal the thread. Otherwise the
* user wants archiving and/or allocation and we need to start up
* the thread.
*/
@@ -455,6 +540,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_tid_set = 0;
}
WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
+ if (conn->log_close_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->log_close_cond));
+ WT_TRET(__wt_thread_join(session, conn->log_close_tid));
+ conn->log_close_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->log_close_cond));
WT_TRET(__wt_log_close(session));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index b425376d6ae..ab873cc36a9 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
* Tell internal server threads to run: this must be set before opening
* any sessions.
*/
- F_SET(conn, WT_CONN_SERVER_RUN);
+ F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN);
/* WT_SESSION_IMPL array. */
WT_RET(__wt_calloc(session,
@@ -130,6 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
WT_TRET(__wt_txn_checkpoint_log(
session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
+ F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
WT_TRET(__wt_logmgr_destroy(session));
/* Free memory for collators, compressors, data sources. */
diff --git a/src/docs/images/wtstats.png b/src/docs/images/wtstats.png
new file mode 100644
index 00000000000..f65a2871b6f
--- /dev/null
+++ b/src/docs/images/wtstats.png
Binary files differ
diff --git a/src/docs/performance.dox b/src/docs/performance.dox
index 5b9d6c40e7b..2284e1e1d4f 100644
--- a/src/docs/performance.dox
+++ b/src/docs/performance.dox
@@ -24,4 +24,7 @@ investigate performance and tune their WiredTiger applications.
<h2>Simulating workloads</h2>
- @subpage wtperf
+<h2>Visualizing performance</h2>
+- @subpage wtstats
+
*/
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 2fd7e5f0ad2..56d1aa1170f 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -76,6 +76,7 @@ WiredTigerException
WiredTigerLog
WiredTigerPanicException
WiredTigerRollbackException
+WiredTigerStat
WiredTigerTestCase
Za
aR
@@ -442,6 +443,7 @@ writelock
writelocks
wrlock
wtperf
+wtstats
xa
yieldcpu
zlib
diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox
index 067cf342111..7fdc4125254 100644
--- a/src/docs/statistics.dox
+++ b/src/docs/statistics.dox
@@ -153,4 +153,8 @@ A Python script that parses the default logging output and uses the
Portable Network Graphics (PNG) format graphs is included in the
WiredTiger distribution in the file \c tools/statlog.py.
+@m_if{c}
+To interactively examine statistics results, see @ref wtstats.
+@m_endif
+
*/
diff --git a/src/docs/wtstats.dox b/src/docs/wtstats.dox
new file mode 100644
index 00000000000..1a792849124
--- /dev/null
+++ b/src/docs/wtstats.dox
@@ -0,0 +1,47 @@
+/*! @page wtstats Visualizing performance with wtstats
+
+The WiredTiger distribution includes the \b wtstats tool that can be used to
+examine information generated using statistics logging (see @ref
+statistics_log).
+
+After running an application with statistics logging configured, the
+statistics output files will be in the database home directory. By default,
+these are named \c WiredTigerStat.* . In the database home directory, run
+this command, replacing \c \<wiredtiger\> with the path to the
+WiredTiger installation directory:
+\code{.sh}
+python <wiredtiger>/tools/wtstats.py WiredTigerStat.*
+\endcode
+
+Another way to process all the stats files in a directory is:
+
+\code{.sh}
+python <wiredtiger>/tools/wtstats.py <directory>
+\endcode
+In either case, a \c wtstats.html file will be generated in the \e current
+directory that you can open in your browser to examine statistics.
+
+Additional options are available, use <tt>wtstats.py --help</tt>
+to display them.
+
+Here is a sample of what is displayed using \c wtstats.html:
+
+\image html wtstats.png "wtstats.html"
+
+Some things to note about the interface:
+
+- The left sidebar has statistics groups that can each be expanded
+to show individual statistics. Clicking on a circle toggles whether an
+individual statistic or statistics group is displayed or not.
+
+- The search box at the upper left can be used to search for statistics
+matching a string.
+
+- Hovering over values in the graph will show what the value is, and what
+statistic is being shown.
+
+- The graph can be panned using two fingered scroll or mouse wheel.
+
+- Scaling of the entire graph can be changed using the buttons at the right top.
+
+*/
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 60a5f82f233..a4ae0aaf55b 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -437,7 +437,7 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_EVICT_WORKER *worker;
int loop;
uint32_t flags;
- uint64_t bytes_inuse, pages_evicted;
+ uint64_t bytes_inuse, dirty_target_size, pages_evicted, target_size;
conn = S2C(session);
cache = conn->cache;
@@ -465,9 +465,16 @@ __evict_pass(WT_SESSION_IMPL *session)
if (loop > 10)
LF_SET(WT_EVICT_PASS_AGGRESSIVE);
- /* Start a worker if we have capacity and the cache is full. */
+ /*
+ * Start a worker if we have capacity and we haven't reached
+ * the eviction targets.
+ */
bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_inuse > conn->cache_size &&
+ target_size = (conn->cache_size * cache->eviction_target) / 100;
+ dirty_target_size =
+ (conn->cache_size * cache->eviction_dirty_target) / 100;
+ if ((bytes_inuse > target_size ||
+ cache->bytes_dirty > dirty_target_size) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
"Starting evict worker: %"PRIu32"\n",
diff --git a/src/include/btmem.h b/src/include/btmem.h
index e1fc72677c5..dd10e522412 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -550,9 +550,10 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
-#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing */
+#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */
+#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
+#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
};
diff --git a/src/include/btree.i b/src/include/btree.i
index a333e4af565..d30ee46486a 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -165,65 +165,6 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __wt_cache_read_gen --
- * Get the current read generation number.
- */
-static inline uint64_t
-__wt_cache_read_gen(WT_SESSION_IMPL *session)
-{
- return (S2C(session)->cache->read_gen);
-}
-
-/*
- * __wt_cache_read_gen_incr --
- * Increment the current read generation number.
- */
-static inline void
-__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
-{
- ++S2C(session)->cache->read_gen;
-}
-
-/*
- * __wt_cache_read_gen_set --
- * Get the read generation to store in a page.
- */
-static inline uint64_t
-__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
-{
- /*
- * We return read-generations from the future (where "the future" is
- * measured by increments of the global read generation). The reason
- * is because when acquiring a new hazard pointer for a page, we can
- * check its read generation, and if the read generation isn't less
- * than the current global generation, we don't bother updating the
- * page. In other words, the goal is to avoid some number of updates
- * immediately after each update we have to make.
- */
- return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
-}
-
-/*
- * __wt_cache_pages_inuse --
- * Return the number of pages in use.
- */
-static inline uint64_t
-__wt_cache_pages_inuse(WT_CACHE *cache)
-{
- return (cache->pages_inmem - cache->pages_evict);
-}
-
-/*
- * __wt_cache_bytes_inuse --
- * Return the number of bytes in use.
- */
-static inline uint64_t
-__wt_cache_bytes_inuse(WT_CACHE *cache)
-{
- return (cache->bytes_inmem - cache->bytes_evict);
-}
-
-/*
* __wt_page_evict_soon --
* Set a page to be evicted as soon as possible.
*/
@@ -917,16 +858,16 @@ __wt_ref_info(WT_SESSION_IMPL *session,
}
/*
- * __wt_page_release --
- * Release a reference to a page.
+ * __wt_page_release_busy --
+ * Release a reference to a page, fail if busy during forced eviction.
*/
static inline int
-__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+__wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- int locked;
+ int locked, too_big;
btree = S2BT(session);
@@ -938,6 +879,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (0);
page = ref->page;
+ too_big = (page->memory_footprint < btree->maxmempage) ? 0 : 1;
+
/*
* Attempt to evict pages with the special "oldest" read generation.
*
@@ -970,12 +913,19 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (ret);
(void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
- if ((ret = __wt_evict_page(session, ref)) == 0)
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
- else {
+ if ((ret = __wt_evict_page(session, ref)) == 0) {
+ if (too_big)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
+ else
+ /*
+ * If the page isn't too big, we are evicting it because
+ * it had a chain of deleted entries that make traversal
+ * expensive.
+ */
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_force_delete);
+ } else {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
- if (ret == EBUSY)
- ret = 0;
}
(void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
@@ -983,6 +933,17 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
}
/*
+ * __wt_page_release --
+ * Release a reference to a page.
+ */
+static inline int
+__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_RET_BUSY_OK(__wt_page_release_busy(session, ref, flags));
+ return (0);
+}
+
+/*
* __wt_page_swap_func --
* Swap one page's hazard pointer for another one when hazard pointer
* coupling up/down the tree.
diff --git a/src/include/cache.i b/src/include/cache.i
index b997781272a..ee969255241 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -7,6 +7,65 @@
*/
/*
+ * __wt_cache_read_gen --
+ * Get the current read generation number.
+ */
+static inline uint64_t
+__wt_cache_read_gen(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->cache->read_gen);
+}
+
+/*
+ * __wt_cache_read_gen_incr --
+ * Increment the current read generation number.
+ */
+static inline void
+__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
+{
+ ++S2C(session)->cache->read_gen;
+}
+
+/*
+ * __wt_cache_read_gen_set --
+ * Get the read generation to store in a page.
+ */
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * We return read-generations from the future (where "the future" is
+ * measured by increments of the global read generation). The reason
+ * is because when acquiring a new hazard pointer for a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
* __wt_eviction_check --
* Wake the eviction server if necessary.
*/
diff --git a/src/include/connection.h b/src/include/connection.h
index c8a3ae6e291..c5723882489 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -279,10 +279,14 @@ struct __wt_connection_impl {
#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */
uint32_t log_flags; /* Global logging configuration */
- WT_CONDVAR *log_cond; /* Log archive wait mutex */
- WT_SESSION_IMPL *log_session; /* Log archive session */
- wt_thread_t log_tid; /* Log archive thread */
- int log_tid_set; /* Log archive thread set */
+ WT_CONDVAR *log_cond; /* Log server wait mutex */
+ WT_SESSION_IMPL *log_session; /* Log server session */
+ wt_thread_t log_tid; /* Log server thread */
+ int log_tid_set; /* Log server thread set */
+ WT_CONDVAR *log_close_cond;/* Log close thread wait mutex */
+ WT_SESSION_IMPL *log_close_session;/* Log close thread session */
+ wt_thread_t log_close_tid; /* Log close thread */
+ int log_close_tid_set;/* Log close thread set */
WT_LOG *log; /* Logging structure */
WT_COMPRESSOR *log_compressor;/* Logging compressor */
wt_off_t log_file_max; /* Log file max size */
diff --git a/src/include/cursor.i b/src/include/cursor.i
index ae6aafdd638..8fa9790e096 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -176,11 +176,23 @@ static inline int
__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
{
WT_SESSION_IMPL *session;
+ WT_TXN *txn;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ txn = &session->txn;
if (reenter)
WT_RET(__curfile_leave(cbt));
+
+
+ /*
+ * If a transaction is running in this thread but has not yet
+ * allocated an ID or taken a snapshot, check if the cache is full
+ * now: if we have to block for eviction, this is the best time.
+ */
+ if (F_ISSET(txn, TXN_RUNNING) &&
+ !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ WT_RET(__wt_cache_full_check(session));
+
if (!F_ISSET(cbt, WT_CBT_ACTIVE))
WT_RET(__curfile_enter(cbt));
__wt_txn_cursor_op(session);
diff --git a/src/include/flags.h b/src/include/flags.h
index c7e74885a35..9664fce3f9f 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -6,15 +6,16 @@
#define WT_CONN_CKPT_SYNC 0x00000002
#define WT_CONN_EVICTION_RUN 0x00000004
#define WT_CONN_LEAK_MEMORY 0x00000008
-#define WT_CONN_LSM_MERGE 0x00000010
-#define WT_CONN_PANIC 0x00000020
-#define WT_CONN_SERVER_ASYNC 0x00000040
-#define WT_CONN_SERVER_CHECKPOINT 0x00000080
-#define WT_CONN_SERVER_LSM 0x00000100
-#define WT_CONN_SERVER_RUN 0x00000200
-#define WT_CONN_SERVER_STATISTICS 0x00000400
-#define WT_CONN_SERVER_SWEEP 0x00000800
-#define WT_CONN_WAS_BACKUP 0x00001000
+#define WT_CONN_LOG_SERVER_RUN 0x00000010
+#define WT_CONN_LSM_MERGE 0x00000020
+#define WT_CONN_PANIC 0x00000040
+#define WT_CONN_SERVER_ASYNC 0x00000080
+#define WT_CONN_SERVER_CHECKPOINT 0x00000100
+#define WT_CONN_SERVER_LSM 0x00000200
+#define WT_CONN_SERVER_RUN 0x00000400
+#define WT_CONN_SERVER_STATISTICS 0x00000800
+#define WT_CONN_SERVER_SWEEP 0x00001000
+#define WT_CONN_WAS_BACKUP 0x00002000
#define WT_EVICTING 0x00000001
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
diff --git a/src/include/stat.h b/src/include/stat.h
index cbd22c7b9d0..6efb9970065 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -164,6 +164,7 @@ struct __wt_connection_stats {
WT_STATS cache_eviction_dirty;
WT_STATS cache_eviction_fail;
WT_STATS cache_eviction_force;
+ WT_STATS cache_eviction_force_delete;
WT_STATS cache_eviction_force_fail;
WT_STATS cache_eviction_hazard;
WT_STATS cache_eviction_internal;
diff --git a/src/include/txn.i b/src/include/txn.i
index 745a8f75a99..656181790ed 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -227,6 +227,16 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
txn = &session->txn;
WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
+
+ /*
+ * If the running transaction has not yet allocated an ID or taken
+ * a snapshot, check if the cache is full now: if we have to block
+ * for eviction, this is the best time to do it.
+ */
+ if (F_ISSET(txn, TXN_RUNNING) &&
+ !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ WT_RET(__wt_cache_full_check(session));
+
if (!F_ISSET(txn, TXN_HAS_ID)) {
conn = S2C(session);
txn_global = &conn->txn_global;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 80b917e37cb..5f6818ebba5 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -3185,206 +3185,208 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1030
/*! cache: pages evicted because they exceeded the in-memory maximum */
#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1031
+/*! cache: pages evicted because they had chains of deleted items */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1032
/*! cache: failed eviction of pages that exceeded the in-memory maximum */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1032
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1033
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1033
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1034
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1034
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1035
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1035
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1036
/*! cache: eviction server candidate queue empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1036
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1037
/*! cache: eviction server candidate queue not empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1037
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1038
/*! cache: eviction server evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1038
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1039
/*! cache: eviction server populating queue, but not evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1039
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1040
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1040
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1041
/*! cache: pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1041
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1042
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1042
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1043
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1043
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1044
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1045
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1045
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1046
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1046
+#define WT_STAT_CONN_CACHE_READ 1047
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1047
+#define WT_STAT_CONN_CACHE_WRITE 1048
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1048
+#define WT_STAT_CONN_COND_WAIT 1049
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1049
+#define WT_STAT_CONN_CURSOR_CREATE 1050
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1050
+#define WT_STAT_CONN_CURSOR_INSERT 1051
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1051
+#define WT_STAT_CONN_CURSOR_NEXT 1052
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1052
+#define WT_STAT_CONN_CURSOR_PREV 1053
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1053
+#define WT_STAT_CONN_CURSOR_REMOVE 1054
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1054
+#define WT_STAT_CONN_CURSOR_RESET 1055
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1055
+#define WT_STAT_CONN_CURSOR_SEARCH 1056
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1056
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1057
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1057
+#define WT_STAT_CONN_CURSOR_UPDATE 1058
/*! data-handle: connection dhandles swept */
-#define WT_STAT_CONN_DH_CONN_HANDLES 1058
+#define WT_STAT_CONN_DH_CONN_HANDLES 1059
/*! data-handle: connection candidate referenced */
-#define WT_STAT_CONN_DH_CONN_REF 1059
+#define WT_STAT_CONN_DH_CONN_REF 1060
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_CONN_SWEEPS 1060
+#define WT_STAT_CONN_DH_CONN_SWEEPS 1061
/*! data-handle: connection time-of-death sets */
-#define WT_STAT_CONN_DH_CONN_TOD 1061
+#define WT_STAT_CONN_DH_CONN_TOD 1062
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1062
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1063
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1063
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1064
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1064
+#define WT_STAT_CONN_FILE_OPEN 1065
/*! log: log buffer size increases */
-#define WT_STAT_CONN_LOG_BUFFER_GROW 1065
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1066
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1066
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1067
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1067
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1068
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1068
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1069
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1069
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1070
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1070
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1071
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1071
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1072
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1072
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1073
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1073
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1074
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1074
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1075
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1075
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1076
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1076
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1077
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1077
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1078
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1078
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1079
/*! log: log read operations */
-#define WT_STAT_CONN_LOG_READS 1079
+#define WT_STAT_CONN_LOG_READS 1080
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1080
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1081
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1081
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1082
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1082
+#define WT_STAT_CONN_LOG_SCANS 1083
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1083
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1084
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1084
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1085
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1085
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1086
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1086
+#define WT_STAT_CONN_LOG_SLOT_RACES 1087
/*! log: slots selected for switching that were unavailable */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1087
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1088
/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1088
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1089
/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1089
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1090
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1090
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1091
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1091
+#define WT_STAT_CONN_LOG_SYNC 1092
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1092
+#define WT_STAT_CONN_LOG_WRITES 1093
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1093
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1094
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1094
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1095
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1095
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1096
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1096
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1097
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1097
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1098
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1098
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1099
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1099
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1100
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1100
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1101
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1101
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1102
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1102
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1103
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1103
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1104
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1104
+#define WT_STAT_CONN_MEMORY_FREE 1105
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1105
+#define WT_STAT_CONN_MEMORY_GROW 1106
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1106
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1107
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1107
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1108
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1108
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1109
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1109
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1110
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1110
+#define WT_STAT_CONN_PAGE_SLEEP 1111
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1111
+#define WT_STAT_CONN_READ_IO 1112
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1112
+#define WT_STAT_CONN_REC_PAGES 1113
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1113
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1114
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1114
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1115
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1115
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1116
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1116
+#define WT_STAT_CONN_RWLOCK_READ 1117
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1117
+#define WT_STAT_CONN_RWLOCK_WRITE 1118
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1118
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1119
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1119
+#define WT_STAT_CONN_SESSION_OPEN 1120
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1120
+#define WT_STAT_CONN_TXN_BEGIN 1121
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1121
+#define WT_STAT_CONN_TXN_CHECKPOINT 1122
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1122
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1123
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1123
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1124
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1124
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1125
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1125
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1126
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1126
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1127
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1127
+#define WT_STAT_CONN_TXN_COMMIT 1128
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1128
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1129
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1129
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1130
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1130
+#define WT_STAT_CONN_TXN_ROLLBACK 1131
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1131
+#define WT_STAT_CONN_WRITE_IO 1132
/*!
* @}
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 138b64a6e27..1b3a9b62626 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -322,13 +322,13 @@ struct __wt_update;
#include "misc.i"
#include "intpack.i" /* required by cell.i, packing.i */
#include "packing.i"
+#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
#include "mutex.i" /* required by btree.i */
#include "txn.i" /* required by btree.i */
#include "btree.i" /* required by cursor.i */
-#include "cache.i" /* required by cursor.i */
#include "cursor.i"
#include "bitstring.i"
diff --git a/src/log/log.c b/src/log/log.c
index 944e748a6a8..c48fc7536b2 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -240,6 +240,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
if (log->log_close_fh != NULL)
F_SET(slot, SLOT_CLOSEFH);
}
+
/*
* Checkpoints can be configured based on amount of log written.
* Add in this log record to the sum and if needed, signal the
@@ -857,9 +858,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_FH *close_fh;
WT_LOG *log;
- WT_LSN sync_lsn;
+ WT_LSN close_end_lsn, close_lsn, sync_lsn;
size_t write_size;
int locked;
WT_DECL_SPINLOCK_ID(id); /* Must appear last */
@@ -872,12 +872,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* If we're going to have to close our log file, make a local copy
* of the file handle structure.
*/
- close_fh = NULL;
- if (F_ISSET(slot, SLOT_CLOSEFH)) {
- close_fh = log->log_close_fh;
- log->log_close_fh = NULL;
- F_CLR(slot, SLOT_CLOSEFH);
- }
+ WT_INIT_LSN(&close_lsn);
+ WT_INIT_LSN(&close_end_lsn);
/* Write the buffered records */
if (F_ISSET(slot, SLOT_BUFFERED)) {
@@ -895,13 +891,22 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
__wt_yield();
log->write_lsn = slot->slot_end_lsn;
+ if (F_ISSET(slot, SLOT_CLOSEFH))
+ WT_ERR(__wt_cond_signal(session, conn->log_close_cond));
+
/*
* Try to consolidate calls to fsync to wait less. Acquire a spin lock
* so that threads finishing writing to the log will wait while the
* current fsync completes and advance log->sync_lsn.
*/
while (F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) {
- if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+ /*
+ * We have to wait until earlier log files have finished their
+ * sync operations. The most recent one will set the LSN to the
+ * beginning of our file.
+ */
+ if (log->sync_lsn.file < slot->slot_end_lsn.file ||
+ __wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
WT_ERR(__wt_cond_wait(
session, log->log_sync_cond, 10000));
continue;
@@ -909,10 +914,10 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
locked = 1;
/*
- * Record the current end of log after we grabbed the lock.
+ * Record the current end of our update after the lock.
* That is how far our calls can guarantee.
*/
- sync_lsn = log->write_lsn;
+ sync_lsn = slot->slot_end_lsn;
/*
* Check if we have to sync the parent directory. Some
* combinations of sync flags may result in the log file
@@ -956,16 +961,6 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
WT_ERR(__wt_buf_grow(session,
&slot->slot_buf, slot->slot_buf.memsize * 2));
}
- /*
- * If we have a file to close, close it now. First fsync so
- * that a later sync will be assured all earlier transactions
- * in earlier log files are also on disk.
- */
- if (close_fh) {
- WT_ERR(__wt_fsync(session, close_fh));
- WT_ERR(__wt_close(session, close_fh));
- }
-
err: if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
if (ret != 0 && slot->slot_error == 0)
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 031a4e88467..3f14e035a9b 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -171,8 +171,6 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
lsm_tree->nchunks != 0)
goto open;
- WT_RET(__wt_cache_full_check(session));
-
if (clsm->dsk_gen != lsm_tree->dsk_gen &&
lsm_tree->nchunks != 0)
goto open;
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 3ab5e0acab1..8ee143133ae 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -736,13 +736,6 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config)
if (F_ISSET(&session->txn, TXN_RUNNING))
WT_ERR_MSG(session, EINVAL, "Transaction already running");
- /*
- * There is no transaction active in this thread; check if the cache is
- * full, if we have to block for eviction, this is the best time to do
- * it.
- */
- WT_ERR(__wt_cache_full_check(session));
-
ret = __wt_txn_begin(session, cfg);
err: API_END_RET(session, ret);
diff --git a/src/support/stat.c b/src/support/stat.c
index f4ae082add3..223d62d0559 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -376,6 +376,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
"cache: pages currently held in the cache";
stats->cache_eviction_force.desc =
"cache: pages evicted because they exceeded the in-memory maximum";
+ stats->cache_eviction_force_delete.desc =
+ "cache: pages evicted because they had chains of deleted items";
stats->cache_eviction_app.desc =
"cache: pages evicted by application threads";
stats->cache_read.desc = "cache: pages read into cache";
@@ -554,6 +556,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->cache_eviction_dirty.v = 0;
stats->cache_eviction_deepen.v = 0;
stats->cache_eviction_force.v = 0;
+ stats->cache_eviction_force_delete.v = 0;
stats->cache_eviction_app.v = 0;
stats->cache_read.v = 0;
stats->cache_eviction_fail.v = 0;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index fd80efd5ebd..5b8f11a88a5 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -361,8 +361,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/* If we are logging, write a commit log record. */
if (ret == 0 && txn->mod_count > 0 &&
FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) &&
- !F_ISSET(session, WT_SESSION_NO_LOGGING))
+ !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
+ /*
+ * We are about to block on I/O writing the log.
+ * Release our snapshot in case it is keeping data pinned.
+ * This is particularly important for checkpoints.
+ */
+ __wt_txn_release_snapshot(session);
ret = __wt_txn_log_commit(session, cfg);
+ }
/*
* If anything went wrong, roll back.
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index f66bd7e09c8..789be2ceef4 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -270,6 +270,7 @@ __wt_txn_checkpoint_log(
{
WT_DECL_ITEM(logrec);
WT_DECL_RET;
+ WT_ITEM *ckpt_snapshot, empty;
WT_LSN *ckpt_lsn;
WT_TXN *txn;
uint8_t *end, *p;
@@ -319,21 +320,26 @@ __wt_txn_checkpoint_log(
*/
if (!txn->full_ckpt) {
txn->ckpt_nsnapshot = 0;
+ WT_CLEAR(empty);
+ ckpt_snapshot = &empty;
*ckpt_lsn = S2C(session)->log->alloc_lsn;
- }
+ } else
+ ckpt_snapshot = txn->ckpt_snapshot;
/* Write the checkpoint log record. */
WT_ERR(__wt_struct_size(session, &recsize, fmt,
rectype, ckpt_lsn->file, ckpt_lsn->offset,
- txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ txn->ckpt_nsnapshot, ckpt_snapshot));
WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
WT_ERR(__wt_struct_pack(session,
(uint8_t *)logrec->data + logrec->size, recsize, fmt,
rectype, ckpt_lsn->file, ckpt_lsn->offset,
- txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ txn->ckpt_nsnapshot, ckpt_snapshot));
logrec->size += (uint32_t)recsize;
- WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+ WT_ERR(__wt_log_write(session, logrec, lsnp,
+ F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ?
+ WT_LOG_FSYNC : 0));
/*
* If this full checkpoint completed successfully and there is