Merge branch 'develop' into checkpoint-directio

author: Michael Cahill <michael.cahill@wiredtiger.com> 2014-02-07 18:16:22 +1100
committer: Michael Cahill <michael.cahill@wiredtiger.com> 2014-02-07 18:16:22 +1100
commit: 3bcd2a96e6546419a871dba4a35a2e2a3453adb9 (patch)
tree: d93f25e4d576e47adbf78b352c910e7354d68639 /src
parent: 3b6d36874f716625c3f8c867f9185c829931472e (diff)
parent: 0f319b1107960bdeb7d617d1797dd992029bb1df (diff)
download: mongo-3bcd2a96e6546419a871dba4a35a2e2a3453adb9.tar.gz
24 files changed, 534 insertions, 424 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index f5b0180b2a6..d57162c06a9 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -246,7 +246,7 @@ __evict_worker(WT_SESSION_IMPL *session)
 		    "Eviction pass with: Max: %" PRIu64
 		    " In use: %" PRIu64 " Dirty: %" PRIu64 " Internal: %s",
 		    bytes_max, bytes_inuse, dirty_inuse,
-		    F_ISSET(cache, WT_EVICT_INTERNAL) ? "yes" : "no");
+		    LF_ISSET(WT_EVICT_PASS_INTERNAL) ? "yes" : "no");
 
 		/*
 		 * When the cache is full, track whether pages are being
@@ -858,7 +858,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
 	    btree->evict_page->ref->state == WT_REF_EVICT_WALK);
 
 	walk_flags = WT_TREE_EVICT;
-	if (F_ISSET(cache, WT_EVICT_INTERNAL))
+	if (LF_ISSET(WT_EVICT_PASS_INTERNAL))
 		walk_flags |= WT_TREE_SKIP_LEAF;
 	/*
 	 * Get some more eviction candidate pages.
@@ -887,7 +887,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
 		if (WT_PAGE_IS_ROOT(page))
 			continue;
 
-		/* Look for a split-merge (grand)parent page to merge. */
+		/*
+		 * Look for a split-merge (grand)parent page to merge.
+		 *
+		 * Only look for a parent at exactly the right height above: if
+		 * the stack is deep enough, we'll find it eventually, and we
+		 * don't want to do too much work on every level.
+		 */
 		levels = 0;
 		if (__wt_btree_mergeable(page))
 			for (levels = 1;
@@ -900,85 +906,96 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
 			continue;
 
 		/*
-		 * Only look for a parent at exactly the right height above: if
-		 * the stack is deep enough, we'll find it eventually, and we
-		 * don't want to do too much work on every level.
-		 *
+		 * Use the EVICT_LRU flag to avoid putting pages onto the list
+		 * multiple times.
+		 */
+		if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+			continue;
+
+		/*
 		 * !!!
-		 * Don't restrict ourselves to only the top-most page (that is,
-		 * don't require that page->parent is not mergeable).  If there
-		 * is a big, busy enough split-merge tree, the top-level merge
-		 * will only happen if we can lock the whole subtree
-		 * exclusively.  Consider smaller merges in case locking the
-		 * whole tree fails.
+		 * In normal operation, don't restrict ourselves to only the
+		 * top-most page (that is, don't require that page->parent is
+		 * not mergeable).  If there is a big, busy enough split-merge
+		 * tree, the top-level merge will only happen if we can lock
+		 * the whole subtree exclusively.  Consider smaller merges in
+		 * case locking the whole tree fails.
 		 */
-		if (levels != 0 && levels != WT_MERGE_STACK_MIN)
+		if (levels != 0) {
+			if (levels < WT_MERGE_STACK_MIN)
+				continue;
+
+			/*
+			 * Concentrate near the top of a stack -- with forced
+			 * eviction, stacks of split-merge pages can get very
+			 * deep, and merging near the bottom isn't helpful.
+			 */
+			if (LF_ISSET(WT_EVICT_PASS_INTERNAL) &&
+			    __wt_btree_mergeable(page->parent) &&
+			    __wt_btree_mergeable(page->parent->parent))
+				continue;
+
+			/* The remaining checks don't apply to merges. */
+			goto add;
+		} else if (LF_ISSET(WT_EVICT_PASS_INTERNAL))
 			continue;
 
 		/*
-		 * If this page has never been considered for eviction, set its
-		 * read generation to a little bit in the future and move on,
-		 * give readers a chance to start updating the read generation.
+		 * If this page has never been considered for eviction,
+		 * set its read generation to a little bit in the
+		 * future and move on, give readers a chance to start
+		 * updating the read generation.
 		 */
 		if (page->read_gen == WT_READ_GEN_NOTSET) {
-			page->read_gen = __wt_cache_read_gen_set(session);
+			page->read_gen =
+			    __wt_cache_read_gen_set(session);
 			continue;
 		}
 
 		/*
-		 * Use the EVICT_LRU flag to avoid putting pages onto the list
-		 * multiple times.
+		 * If the file is being checkpointed, there's a period
+		 * of time where we can't discard any page with a
+		 * modification structure because it might race with
+		 * the checkpointing thread.
+		 *
+		 * During this phase, there is little point trying to
+		 * evict dirty pages: we might be lucky and find an
+		 * internal page that has not yet been checkpointed,
+		 * but much more likely is that we will waste effort
+		 * considering dirty leaf pages that cannot be evicted
+		 * because they have modifications more recent than the
+		 * checkpoint.
 		 */
-		if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+		modified = __wt_page_is_modified(page);
+		if (modified && btree->checkpointing)
 			continue;
 
-		/* The following checks apply to eviction but not merges. */
-		if (levels == 0) {
-			/*
-			 * If the file is being checkpointed, there's a period
-			 * of time where we can't discard any page with a
-			 * modification structure because it might race with
-			 * the checkpointing thread.
-			 *
-			 * During this phase, there is little point trying to
-			 * evict dirty pages: we might be lucky and find an
-			 * internal page that has not yet been checkpointed,
-			 * but much more likely is that we will waste effort
-			 * considering dirty leaf pages that cannot be evicted
-			 * because they have modifications more recent than the
-			 * checkpoint.
-			 */
-			modified = __wt_page_is_modified(page);
-			if (modified && btree->checkpointing)
-				continue;
-
-			/* Optionally ignore clean pages. */
-			if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
-				continue;
+		/* Optionally ignore clean pages. */
+		if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+			continue;
 
-			/*
-			 * If the oldest transaction hasn't changed since the
-			 * last time this page was written, it's unlikely that
-			 * we can make progress.  Similarly, if the most recent
-			 * update on the page is not yet globally visible,
-			 * eviction will fail.  These heuristics attempt to
-			 * avoid repeated attempts to evict the same page.
-			 *
-			 * That said, if eviction is stuck, or the file is
-			 * being checkpointed, try anyway: maybe a transaction
-			 * that were running last time we wrote the page has
-			 * since rolled back, or we can help get the checkpoint
-			 * completed sooner.
-			 */
-			if (modified && !F_ISSET(cache, WT_EVICT_STUCK) &&
-			    (page->modify->disk_snap_min ==
-			    S2C(session)->txn_global.oldest_id ||
-			    !__wt_txn_visible_all(session,
-			    page->modify->update_txn)))
-				continue;
-		}
+		/*
+		 * If the oldest transaction hasn't changed since the
+		 * last time this page was written, it's unlikely that
+		 * we can make progress.  Similarly, if the most recent
+		 * update on the page is not yet globally visible,
+		 * eviction will fail.  These heuristics attempt to
+		 * avoid repeated attempts to evict the same page.
+		 *
+		 * That said, if eviction is stuck, or the file is
+		 * being checkpointed, try anyway: maybe a transaction
+		 * that was running last time we wrote the page has
+		 * since rolled back, or we can help get the checkpoint
+		 * completed sooner.
+		 */
+		if (modified && !F_ISSET(cache, WT_EVICT_STUCK) &&
+		    (page->modify->disk_snap_min ==
+		    S2C(session)->txn_global.oldest_id ||
+		    !__wt_txn_visible_all(session,
+		    page->modify->update_txn)))
+			continue;
 
-		WT_ASSERT(session, evict->page == NULL);
+add:		WT_ASSERT(session, evict->page == NULL);
 		__evict_init_candidate(session, evict, page);
 		++evict;
 
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 5c1f45a8030..f6cc4cc6fb3 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -11,7 +11,7 @@ static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
 static int __btree_get_last_recno(WT_SESSION_IMPL *);
 static int __btree_page_sizes(WT_SESSION_IMPL *);
 static int __btree_preload(WT_SESSION_IMPL *);
-static int __btree_tree_open_empty(WT_SESSION_IMPL *, int);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
 
 static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
 static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
@@ -102,7 +102,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
 		    ckpt.raw.data, ckpt.raw.size,
 		    root_addr, &root_addr_size, readonly));
 		if (creation || root_addr_size == 0)
-			WT_ERR(__btree_tree_open_empty(session, creation));
+			WT_ERR(__btree_tree_open_empty(
+			    session, creation, readonly));
 		else {
 			WT_ERR(__wt_btree_tree_open(
 			    session, root_addr, root_addr_size));
@@ -355,7 +356,7 @@ err:		__wt_buf_free(session, &dsk);
  *	Create an empty in-memory tree.
  */
 static int
-__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly)
 {
 	WT_BTREE *btree;
 	WT_DECL_RET;
@@ -423,23 +424,31 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
 	 * the root page dirty to force a write, and without reconciling the
 	 * leaf page we won't realize there's no records to write, we'll write
 	 * a root page, which isn't correct for an empty tree.
-	 *    Earlier versions of this code kept the leaf page clean, but with
-	 * the "empty" flag set in the leaf page's modification structure; in
-	 * that case, checkpoints works (forced reconciliation of a root with
-	 * a single "empty" page wouldn't write any blocks). That version had
+	 *
+	 * Earlier versions of this code kept the leaf page clean, but with the
+	 * "empty" flag set in the leaf page's modification structure; in that
+	 * case, checkpoints work (forced reconciliation of a root with a
+	 * single "empty" page wouldn't write any blocks). That version had
 	 * memory leaks because the eviction code didn't correctly handle pages
 	 * that were "clean" (and so never reconciled), yet "modified" with an
 	 * "empty" flag.  The goal of this code is to mimic a real tree that
 	 * simply has no records, for whatever reason, and trust reconciliation
 	 * to figure out it's empty and not write any blocks.
-	 *    We do not set the tree's modified flag because the checkpoint code
-	 * skips unmodified files in closing checkpoints (checkpoints that don't
-	 * require a write unless the file is actually dirty).  There's no need
-	 * to reconcile this file unless the application does a real checkpoint
-	 * or it's actually modified.
+	 *
+	 * We do not set the tree's modified flag because the checkpoint code
+	 * skips unmodified files in closing checkpoints (checkpoints that
+	 * don't require a write unless the file is actually dirty).  There's
+	 * no need to reconcile this file unless the application does a real
+	 * checkpoint or it's actually modified.
+	 *
+	 * Only do this for a live tree, not for checkpoints.  If we open an
+	 * empty checkpoint, the leaf page cannot be dirty or eviction may try
+	 * to write it, which will fail because checkpoints are read-only.
 	 */
-	WT_ERR(__wt_page_modify_init(session, leaf));
-	__wt_page_only_modify_set(session, leaf);
+	if (!readonly) {
+		WT_ERR(__wt_page_modify_init(session, leaf));
+		__wt_page_only_modify_set(session, leaf);
+	}
 
 	btree->root_page = root;
 
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index f7d146c42e2..0713989af58 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -441,32 +441,32 @@ ckpt:		WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
 	 */
 	if (__wt_page_is_modified(page) &&
 	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
-		ret = __wt_rec_write(session, page,
-		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);
-
-		/*
-		 * Update the page's modification reference, reconciliation
-		 * might have changed it.
-		 */
-		mod = page->modify;
-
 		/*
-		 * If reconciliation failed due to active modifications and
-		 * the page is a lot larger than the maximum allowed, it is
-		 * likely that we are having trouble reconciling it due to
-		 * contention, attempt to split the page in memory.
+		 * If the page is larger than the maximum allowed, attempt to
+		 * split the page in memory before evicting it.  The in-memory
+		 * split checks for left and right splits, and prevents the
+		 * tree deepening unnecessarily.
 		 *
 		 * Note, we won't be here if recursively descending a tree of
 		 * pages: dirty row-store leaf pages can't be merged into their
 		 * parents, which means if top wasn't true in this test, we'd
 		 * have returned busy before attempting reconciliation.
 		 */
-		if (ret == EBUSY &&
-		    page->type == WT_PAGE_ROW_LEAF &&
+		if (page->type == WT_PAGE_ROW_LEAF &&
+		    !F_ISSET_ATOMIC(page, WT_PAGE_WAS_SPLIT) &&
 		    __wt_eviction_force_check(session, page)) {
 			*inmem_split = 1;
 			return (0);
 		}
+
+		ret = __wt_rec_write(session, page,
+		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);
+
+		/*
+		 * Update the page's modification reference, reconciliation
+		 * might have changed it.
+		 */
+		mod = page->modify;
 		if (ret == EBUSY) {
 			/* Give up if there are unwritten changes */
 			WT_VERBOSE_RET(session, evict,
diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c
index 7599fa8cb84..cf8ef88c5ac 100644
--- a/src/btree/rec_merge.c
+++ b/src/btree/rec_merge.c
@@ -307,14 +307,9 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
 	if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
 		return (EBUSY);
 
-	/*
-	 * Don't allow split merges to generate arbitrarily large pages.
-	 * Ideally we would choose a size based on the internal_page_max
-	 * setting for the btree, but we don't have the correct btree handle
-	 * available.
-	 */
-	if (visit_state.refcnt > WT_MERGE_MAX_REFS)
-		return (EBUSY);
+	/* Pages cannot grow larger than 2**32, but that should never happen. */
+	if (visit_state.refcnt > UINT32_MAX)
+		return (ENOMEM);
 
 	/*
 	 * Now we either collapse the internal pages into one split-merge page,
@@ -332,17 +327,19 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
 		 * In the normal case where there are live children spread
 		 * through the subtree, create two child pages.
 		 *
-		 * Handle the case where the only live child is first / last
-		 * specially: put the live child into the top-level page.
+		 * Handle the case where the live children are all near the
+		 * beginning / end specially: put the last live child into the
+		 * top-level page, to avoid getting much deeper during
+		 * append-only workloads.
 		 *
 		 * Set SPLIT_MERGE on the internal pages if there are any live
 		 * children: they can't be evicted, so there is no point
 		 * permanently deepening the tree.
 		 */
-		if (visit_state.first_live == visit_state.last_live &&
-		    (visit_state.first_live == 0 ||
-		    visit_state.first_live == refcnt - 1))
-			split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
+		if (visit_state.last_live <= refcnt / 10)
+			split = 1;
+		else if (visit_state.first_live >= (9 * refcnt) / 10)
+			split = refcnt - 1;
 		else
 			split = (refcnt + 1) / 2;
 
@@ -370,7 +367,7 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
 		else {
 			WT_ERR(__wt_btree_new_modified_page(
 			    session, page_type, split,
-			    visit_state.first_live < split, &lchild));
+			    split < WT_MERGE_FULL_PAGE, &lchild));
 			visit_state.first = lchild;
 		}
 
@@ -380,8 +377,8 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
 			visit_state.second_ref = &newtop->u.intl.t[1];
 		} else {
 			WT_ERR(__wt_btree_new_modified_page(
-			    session, page_type,
-			    refcnt - split, visit_state.last_live >= split,
+			    session, page_type, refcnt - split,
+			    refcnt - split < WT_MERGE_FULL_PAGE,
 			    &rchild));
 			visit_state.second = rchild;
 			visit_state.second_ref =
@@ -389,17 +386,15 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
 		}
 	} else {
 		/*
-		 * Create a new split-merge page for small merges, or if the
-		 * page above is a split merge page.  When we do a big enough
-		 * merge, we create a real page at the top and don't consider
-		 * it as a merge candidate again.  Over time with an insert
-		 * workload the tree will grow deeper, but that's inevitable,
-		 * and this keeps individual merges small.
+		 * Create a new split-merge page for small merges.  When we do
+		 * a big enough merge, we create a real page at the top and
+		 * don't consider it as a merge candidate again.  Over time
+		 * with an insert workload the tree will grow deeper, but
+		 * that's inevitable, and this keeps individual merges small.
 		 */
 		WT_ERR(__wt_btree_new_modified_page(
 		    session, page_type, refcnt,
-		    refcnt < WT_MERGE_FULL_PAGE ||
-		    __wt_btree_mergeable(top->parent),
+		    refcnt < WT_MERGE_FULL_PAGE,
 		    &newtop));
 
 		visit_state.first = newtop;
diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c
index 1ea5c1093d5..99e9aebc14f 100644
--- a/src/btree/rec_track.c
+++ b/src/btree/rec_track.c
@@ -382,13 +382,13 @@ __ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 /*
  * __ovfl_reuse_skip_search --
- *	Return the first matching value in the overflow reuse list.
+ *	Return the first, not in-use, matching value in the overflow reuse list.
  */
 static WT_OVFL_REUSE *
 __ovfl_reuse_skip_search(
     WT_OVFL_REUSE **head, const void *value, size_t value_size)
 {
-	WT_OVFL_REUSE **e;
+	WT_OVFL_REUSE **e, *next;
 	size_t len;
 	int cmp, i;
 
@@ -404,13 +404,29 @@ __ovfl_reuse_skip_search(
 		}
 
 		/*
-		 * Return any exact matches: we don't care in what search level
-		 * we found a match.
+		 * Values are not unique, and it's possible to have long lists
+		 * of identical overflow items.  (We've seen it in benchmarks.)
+		 * Move through a list of identical items at the current level
+		 * as long as the next one is in-use, otherwise, drop down a
+		 * level.   When at the bottom level, return items if reusable,
+		 * else NULL.
 		 */
 		len = WT_MIN((*e)->value_size, value_size);
 		cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
-		if (cmp == 0 && (*e)->value_size == value_size)
-			return (*e);
+		if (cmp == 0 && (*e)->value_size == value_size) {
+			if (i == 0)
+				return (F_ISSET(*e,
+				    WT_OVFL_REUSE_INUSE) ? NULL : *e);
+			if ((next = (*e)->next[i]) == NULL ||
+			    !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
+			    next->value_size != len || memcmp(
+			    WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
+				--i;		/* Drop down a level */
+				--e;
+			} else			/* Keep going at this level */
+				e = &(*e)->next[i];
+			continue;
+		}
 
 		/*
 		 * If the skiplist value is larger than the search value, or
@@ -612,28 +628,19 @@ __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
 	head = page->modify->ovfl_track->ovfl_reuse;
 
 	/*
-	 * The search function returns the first matching record in the list,
-	 * which may be the first of many, overflow records may be identical.
-	 * Find one without the in-use flag set and put it back into service.
+	 * The search function returns the first matching record in the list
+	 * which does not have the in-use flag set, or NULL.
 	 */
 	if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
 		return (0);
-	do {
-		if (!F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
-			*addrp = WT_OVFL_REUSE_ADDR(reuse);
-			*addr_sizep = reuse->addr_size;
-			F_SET(reuse, WT_OVFL_REUSE_INUSE);
 
-			if (WT_VERBOSE_ISSET(session, overflow))
-				WT_RET(__ovfl_reuse_verbose(
-				    session, page, reuse, "reclaim"));
-			return (1);
-		}
-	} while ((reuse = reuse->next[0]) != NULL &&
-	    reuse->value_size == value_size &&
-	    memcmp(WT_OVFL_REUSE_VALUE(reuse), value, value_size) == 0);
+	*addrp = WT_OVFL_REUSE_ADDR(reuse);
+	*addr_sizep = reuse->addr_size;
+	F_SET(reuse, WT_OVFL_REUSE_INUSE);
 
-	return (0);
+	if (WT_VERBOSE_ISSET(session, overflow))
+		WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
+	return (1);
 }
 
 /*
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 66ce4c089e8..81a4ec7a025 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -1616,10 +1616,10 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
 		 * We can't compress the first 64B of the block (it must be
 		 * written without compression), and a possible split point
 		 * may appear in that 64B; keep it simple, ignore the first
-		 * 1KB of data, anybody splitting a smaller than 1KB piece
-		 * (as calculated before compression), is doing us wrong.
+		 * allocation size of data, anybody splitting smaller than
+		 * that (as calculated before compression), is doing it wrong.
 		 */
-		if ((len = WT_PTRDIFF(cell, dsk)) > 1024)
+		if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
 			r->raw_offsets[++slots] =
 			    WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
 
@@ -1677,12 +1677,19 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
 	 * compression function.
 	 */
 	memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
-	WT_ERR(compressor->compress_raw(compressor, wt_session,
+	ret = compressor->compress_raw(compressor, wt_session,
 	    r->page_size_max, btree->split_pct,
 	    WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
 	    r->raw_offsets, slots,
 	    (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
-	    result_len, final, &result_len, &result_slots));
+	    result_len, final, &result_len, &result_slots);
+	if (ret == EAGAIN) {
+		ret = 0;
+		if (!final)
+			goto more_rows;
+		result_slots = 0;
+	}
+	WT_ERR(ret);
 	dst->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
 
 	if (result_slots != 0) {
@@ -1701,11 +1708,14 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
 		 * There may be a remnant in the working buffer that didn't get
 		 * compressed; copy it down to the start of the working buffer
 		 * and update the starting record number, free space and so on.
+		 * !!!
+		 * Note use of memmove, the source and destination buffers can
+		 * overlap.
 		 */
 		len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
 		    r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
 		dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
-		(void)memcpy(dsk_start, (uint8_t *)r->first_free - len, len);
+		(void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
 
 		r->entries -= r->raw_entries[result_slots - 1];
 		r->first_free = dsk_start + len;
diff --git a/src/docs/compression.dox b/src/docs/compression.dox
index 59f03f4e8ef..92f5c27f25e 100644
--- a/src/docs/compression.dox
+++ b/src/docs/compression.dox
@@ -1,48 +1,36 @@
 /*! @page compression Compressors
 
 This section explains how to configure WiredTiger's builtin support for
-the bzip2 and snappy compression engines.
+the zlib, snappy and bzip2 compression engines.
 
-@section compression_bzip2 Using bzip2 compression
+@section compression_zlib Using zlib compression
 
-To use the builtin support for
-<a href="http://www.bzip.org/">Julian Seward's bzip2</a>
-compression, first check that bzip2 is installed in include and library
-directories searched by the compiler.  Once bzip2 is installed, you can
-enable bzip2 using the \c --enable-bzip2 option to configure.
+To use the builtin support for Greg Roelofs' and Mark Adler's
+<a href="http://www.zlib.net/">zlib</a>
+compression, first check that zlib is installed in include and library
+directories searched by the compiler.  Once zlib is installed, you can
+enable zlib using the \c --enable-zlib option to configure.
 
-If bzip2 is installed in a location not normally searched by the
-compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS
-to indicate these locations.  For example, with the bzip2 includes and
+If zlib is installed in a location not normally searched by the compiler
+toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS to
+indicate these locations.  For example, with the zlib includes and
 libraries installed in \c /usr/local/include and \c /usr/local/lib, you
-should run configure as follows:
+would run configure with the following additional arguments:
 
 @code
-cd build_posix
-../configure --enable-bzip2 CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/include"
+--enable-zlib CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib"
 @endcode
 
-When opening the WiredTiger database, load the bzip2 shared library as
-an extension.  For example, with the bzip2 library installed in
+When opening the WiredTiger database, load the zlib shared library as
+an extension.  For example, with the WiredTiger library installed in
 \c /usr/local/lib, you would use the following extension:
 
-@snippet ex_all.c Configure bzip2 extension
+@snippet ex_all.c Configure zlib extension
 
 Finally, when creating the WiredTiger object, set \c block_compressor
-to \c bzip2:
-
-@snippet ex_all.c Create a bzip2 compressed table
-
-If necessary, you can confirm the compressor is working by running the
-compression part of the test suite:
+to \c zlib:
 
-@code
-cd build_posix
-python ../test/suite/run.py compress
-@endcode
-
-Review the test output to verify the bzip2 part of the test passes and
-was not skipped.
+@snippet ex_all.c Create a zlib compressed table
 
 @section compression_snappy Using snappy compression
 
@@ -56,15 +44,14 @@ If snappy is installed in a location not normally searched by the
 compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS
 to indicate these locations.  For example, with the snappy includes and
 libraries installed in \c /usr/local/include and \c /usr/local/lib, you
-should run configure as follows:
+would run configure with the following additional arguments:
 
 @code
-cd build_posix
-../configure --enable-snappy CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/include"
+--enable-snappy CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib"
 @endcode
 
 When opening the WiredTiger database, load the snappy shared library as
-an extension.  For example, with the snappy library installed in
+an extension.  For example, with the WiredTiger library installed in
 \c /usr/local/lib, you would use the following extension:
 
 @snippet ex_all.c Configure snappy extension
@@ -74,16 +61,34 @@ to \c snappy:
 
 @snippet ex_all.c Create a snappy compressed table
 
-If necessary, you can confirm the compressor is working by running the
-compression part of the test suite:
+@section compression_bzip2 Using bzip2 compression
+
+To use the builtin support for
+<a href="http://www.bzip.org/">Julian Seward's bzip2</a>
+compression, first check that bzip2 is installed in include and library
+directories searched by the compiler.  Once bzip2 is installed, you can
+enable bzip2 using the \c --enable-bzip2 option to configure.
+
+If bzip2 is installed in a location not normally searched by the
+compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS
+to indicate these locations.  For example, with the bzip2 includes and
+libraries installed in \c /usr/local/include and \c /usr/local/lib, you
+would run configure with the following additional arguments:
 
 @code
-cd build_posix
-python ../test/suite/run.py compress
+--enable-bzip2 CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib"
 @endcode
 
-Review the test output to verify the snappy part of the test passes and
-was not skipped.
+When opening the WiredTiger database, load the bzip2 shared library as
+an extension.  For example, with the WiredTiger library installed in
+\c /usr/local/lib, you would use the following extension:
+
+@snippet ex_all.c Configure bzip2 extension
+
+Finally, when creating the WiredTiger object, set \c block_compressor
+to \c bzip2:
+
+@snippet ex_all.c Create a bzip2 compressed table
 
 @section compression_upgrading Upgrading compression engines
 
diff --git a/src/docs/helium.dox b/src/docs/helium.dox
new file mode 100644
index 00000000000..cd6b47fb968
--- /dev/null
+++ b/src/docs/helium.dox
@@ -0,0 +1,125 @@
+/*! @page helium WiredTiger Helium support
+
+WiredTiger supports Levyx Inc.'s Helium Data Store volumes as a data source.
+
+To configure one or more Helium volumes as WiredTiger data sources, take
+the following steps.
+
+@section helium_build Building the WiredTiger Helium Support
+
+To build the Helium support, use the configuration option \c --with-helium=DIR.
+For example:
+
+@code
+% cd wiredtiger
+% ls /usr/local/lib/Helium
+Helium Programmer's Reference.pdf	libhe.a
+README.TXT				libhe.so
+he.h
+% ./configure --with-helium=/usr/local/lib/Helium && make
+@endcode
+
+@section helium_load Loading the WiredTiger Helium Support
+
+Next, add code to your application to load the Helium shared library.
+
+The following example loads the Helium shared library, configuring and
+naming two separate Helium volumes.  The first volume is named \c dev1,
+the second volume is named \c dev2.  Volume \c dev1 has two underlying
+physical Helium devices, \c /dev/disk3s1 and \c /dev/disk4s1.  Volume
+\c dev2 has a single underlying physical Helium device, \c /dev/disk5s1.
+
+@code
+#define	HELIUM_LIBRARY_PATH	"test/helium/.libs/libwiredtiger_helium.so"
+ret = connection->load_extension(connection, HELIUM_LIBRARY_PATH,
+    "config=["
+    "dev1=[helium_devices=[\"he://.//dev/disk3s1,/dev/disk4s1\"],"
+    "helium_o_volume_truncate=1],"
+    "dev2=[helium_devices=[\"he://.//dev/disk5s1\"],"
+    "helium_o_volume_truncate=1]]");
+@endcode
+
+The \c helium_devices configuration string takes a WiredTiger string
+which is a comma-separated list of Helium devices.  (Note the quoting
+required for that to be possible.)
+
+In this example, both Helium volumes are configured to be truncated when
+first opened, and all previously existing contents discarded.
+
+When configuring a Helium volume, the following non-standard configuration
+strings are supported:
+
+<table>
+@hrow{String, Type, Meaning}
+@row{helium_devices, list, WiredTiger URI to Helium volume mapping}
+@row{helium_env_read_cache_size, int, struct he_env read_cache_size value}
+@row{helium_env_write_cache_size, int, struct he_env write_cache_size value}
+@row{helium_o_volume_truncate, boolean, HE_O_VOLUME_TRUNCATE flag}
+</table>
+
+With the exception of the configuration string \c helium_devices (which
+is WiredTiger specific), see the Helium documentation for details on
+their use.
+
+@section helium_objects Creating WiredTiger objects on Helium volumes
+
+When creating WiredTiger objects on Helium volumes, the volume names are
+used as part of the URI specified to WiredTiger methods such as
+WT_SESSION::create or WT_SESSION::rename, separated from the object name
+by a single slash character.
+
+Additionally, the \c helium \c type configuration string must be included.
+
+The following example creates a table named \c access on the Helium
+volume \c dev1, and then opens a cursor on the table:
+
+@code
+WT_CURSOR *cursor;
+WT_SESSION *session;
+
+/* Create the access table. */
+ret = session->create(
+    session, "table:dev1/access", "key_format=S,value_format=S,type=helium");
+
+/* Open a cursor on the access table. */
+ret = session->open_cursor(session, "table:dev1/access", NULL, NULL, &cursor);
+@endcode
+
+When calling WT_SESSION::create to create an object on a Helium volume,
+the following additional configuration strings are supported:
+
+<table>
+@hrow{String, Type, Meaning}
+@row{helium_o_compress, boolean, HE_I_COMPRESS flag}
+@row{helium_o_truncate, boolean, HE_O_TRUNCATE flag}
+</table>
+
+See the Helium device documentation for details on their use.
+
+For example, creating and truncating a table could be done as follows:
+
+@code
+WT_SESSION *session;
+
+/* Create and truncate the access table. */
+ret = session->create(session, "table:dev1/access",
+    "key_format=S,value_format=S,type=helium,helium_o_truncate=1");
+@endcode
+
+@section helium_notes Helium notes
+
+- Helium volumes do not support hot backup.
+- Helium volumes do not support named checkpoints.
+- Helium volumes do not support compression of any kind.
+- Helium volumes do not support bulk load as a special case, and configuring
+cursors for bulk load has no effect.
+- Inserting a new record after the current maximum record in a fixed-length
+bit field column-store (that is, a store with an 'r' type key and 't' type
+value) does not implicitly create the missing records.
+
+@section helium_limitations Helium limitations
+
+- WiredTiger transactions cannot include operations on both Helium volumes
+and other stores; this will be corrected in a future release.
+
+*/
diff --git a/src/docs/hot_backup.dox b/src/docs/hot_backup.dox
index 0971eca948a..9c0326bcb17 100644
--- a/src/docs/hot_backup.dox
+++ b/src/docs/hot_backup.dox
@@ -10,15 +10,15 @@ To perform a hot backup:
 1. Open a cursor on the backup data source, which begins the process of
 a hot backup.
 
-2. Copy each file returned by the WT_CURSOR::next method into a
-different directory.
+2. Copy each file returned by the WT_CURSOR::next method to the hot
+backup location, for example, a different directory.
 
 3. Close the cursor; the cursor must not be closed until all of the
 files have been copied.
 
-The directory to which the files are copied may subsequently be
-specified as an directory to the ::wiredtiger_open function and accessed
-as a WiredTiger database home.
+A directory to which the files are copied may subsequently be specified
+as a directory to the ::wiredtiger_open function and accessed as a
+WiredTiger database home.
 
 Notes:
 
diff --git a/src/docs/memrata.dox b/src/docs/memrata.dox
deleted file mode 100644
index c915f0c59ea..00000000000
--- a/src/docs/memrata.dox
+++ /dev/null
@@ -1,129 +0,0 @@
-/*! @page memrata WiredTiger Memrata support
-
-WiredTiger supports Memrata KVS devices as a data-source.
-
-To configure one or more Memrata KVS devices as WiredTiger data sources,
-take the following steps.
-
-@section memrata_build Building the WiredTiger Memrata Support
-
-To build the Memrata support, add a link in the WiredTiger build
-directory to the installed location of the Memrata software.  For
-example:
-
-@code
-% cd wiredtiger
-% ls /usr/local/memrata
-kvs.h		libkvs.a	libkvs.so
-kvs.h.4.2	libkvs.a.4.2	libkvs.so.4.2
-% ln -s /usr/local/memrata memrata
-% ./configure && make
-@endcode
-
-@section memrata_load Loading the WiredTiger Memrata Support
-
-Second, change your application to load the Memrata shared library.  The
-following example loads the Memrata shared library, configuring and
-naming two separate Memrata device pools.  The first device pool is
-named \c dev1, the second device pool is named \c dev2.  Device pool \c
-dev1 has two underlying Memrata devices, \c /dev/ssd0 and \c /dev/ssd1.
-Device pool \c dev2 has a single underlying Memrata device, \c
-/dev/ssd2.
-
-@code
-#define	MEMRATA_LIBRARY_PATH	"test/memrata/.libs/libwiredtiger_memrata.so""
-ret = connection->load_extension(connection, MEMRATA_LIBRARY_PATH,
-    "config=["
-    "dev1=[kvs_devices=[/dev/ssd0,/dev/ssd1],kvs_open_o_truncate=1],"
-    "dev2=[kvs_devices=[/dev/ssd2],kvs_open_o_truncate=1]]");
-@endcode
-
-The \c kvs_devices configuration string takes a WiredTiger configuration
-list, that is, a comma-separated list of Memrata devices.
-
-In this example, both device pools are configured to be truncated (that
-is, all previously existing contents discarded), when they are configured.
-
-When loading a Memrata device, the following additional configuration strings
-are supported:
-
-<table>
-@hrow{String, Type}
-@row{kvs_devices, list of lists}
-@row{kvs_parallelism, int}
-@row{kvs_granularity, int}
-@row{kvs_avg_key_len, int}
-@row{kvs_avg_val_len, int}
-@row{kvs_write_bufs, int}
-@row{kvs_read_bufs, int}
-@row{kvs_commit_timeout, int}
-@row{kvs_reclaim_threshold, int}
-@row{kvs_reclaim_period, int}
-@row{kvs_open_o_debug, boolean}
-@row{kvs_open_o_truncate, boolean}
-</table>
-
-With the exception of the configuration string \c kvs_devices (which is
-WiredTiger specific), see the Memrata device documentation for details
-on their use.
-
-@section memrata_objects Creating Memrata-backed objects
-
-The device pool names are used as part of the URI specified to WiredTiger
-methods such as WT_SESSION::create or WT_SESSION::rename, separated from
-the object name by a single slash character.
-
-Additionally, the \c memrata \c type configuration string must be included.
-
-The following example creates a Memrata table named \c access in the
-device pool \c dev1, and then opens a cursor on the table:
-
-@code
-WT_CURSOR *cursor;
-WT_SESSION *session;
-
-/* Create the access table. */
-ret = session->create(
-    session, "table:dev1/access", "key_format=S,value_format=S,type=memrata");
-
-/* Open a cursor on the access table. */
-ret = session->open_cursor(session, "table:dev1/access", NULL, NULL, &cursor);
-@endcode
-
-When creating a Memrata-backed object with the WT_SESSION::create method,
-the following additional configuration strings are supported:
-
-<table>
-@hrow{String, Type}
-@row{kvs_open_o_debug, boolean}
-@row{kvs_open_o_truncate, boolean}
-</table>
-
-See the Memrata device documentation for details on their use.
-
-For example, creating and truncating a table could be done as follows:
-
-@code
-WT_SESSION *session;
-
-/* Create and truncate the access table. */
-ret = session->create(session, "table:dev1/access",
-    "key_format=S,value_format=S,type=memrata,kvs_open_o_truncate=1");
-@endcode
-
-@section memrata_notes Memrata notes
-
-- Memrata devices do not support named checkpoints.
-- Inserting a new record after the current maximum record in a fixed-length
-bit field column-store (that is, a store with an 'r' type key and 't' type
-value) does not implicitly create the missing records.
-- Memrata devices do not support bulk load as a special case, and configuring
-cursors for bulk load has no effect.
-- Memrata devices do not support compression of any kind.
-
-@section memrata_limitations Memrata limitations
-
-- WiredTiger transactions cannot include operations on both Memrata devices
-and other stores.
-
-*/
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
index 5bf5d965afc..54e641fa3a4 100644
--- a/src/docs/programming.dox
+++ b/src/docs/programming.dox
@@ -33,7 +33,7 @@ WiredTiger applications:
 @section programming_extending Extending WiredTiger
 
 - @subpage custom_data_sources
-- @subpage memrata
+- @subpage helium
 
 @section programming_admin Administering a WiredTiger database
 
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 1012eef1f93..6d24c474e19 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -1,5 +1,6 @@
 personal_ws-1.1 en 200
 APIs
+Adler's
 Atomicity
 BLOBs
 CFLAGS
@@ -12,6 +13,7 @@ DbCursor
 DbEnv
 DbMultiple
 EB
+EAGAIN
 EBUSY
 EINVAL
 EmpId
@@ -28,18 +30,21 @@ LIBS
 LSB
 LSM
 Lameter
+Levyx
 MERCHANTABILITY
 MVCC's
 Makefiles
-Memrata
 Mewhort
 NOTFOUND
 NUMA
 NoSQL
+README
 RepMgr
+Roelofs
 Rrx
 Seward's
 SiS
+TXT
 URIs
 Vv
 WiredTiger
@@ -87,6 +92,7 @@ command's
 comparator
 cond
 config
+configurign
 conn
 const
 control's
@@ -146,6 +152,7 @@ firstname
 fnv
 fput
 freelist
+fsync
 gcc
 gdbm
 getopt
@@ -181,6 +188,7 @@ lastname
 len
 li
 libdir
+libhe
 libkvs
 libtool
 libwiredtiger
@@ -207,7 +215,6 @@ maxleafpage
 memalloc
 memfree
 memp
-memrata
 metadata
 minkey
 mkdir
@@ -356,3 +363,4 @@ writelocks
 wrlock
 xa
 yieldcpu
+zlib
diff --git a/src/docs/top/Doxyfile b/src/docs/top/Doxyfile
index 59a3667b169..ed4f2eb8c3b 100644
--- a/src/docs/top/Doxyfile
+++ b/src/docs/top/Doxyfile
@@ -2,7 +2,7 @@
 
 PROJECT_NUMBER		= "Developer Site"
 OUTPUT_DIRECTORY	= ../../docs/top
-INPUT			= top license.dox
+INPUT			= top community.dox license.dox
 EXCLUDE			=
 
 GENERATE_TREEVIEW	= NO
diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox
index 821f22102d3..5481d2deae5 100644
--- a/src/docs/top/main.dox
+++ b/src/docs/top/main.dox
@@ -6,9 +6,9 @@ WiredTiger is an high performance, scalable, production quality, NoSQL,
 @section releases Releases
 
 <table>
-@row{<b>WiredTiger 2.0.1</b> (current),
-	<a href="releases/wiredtiger-2.0.1.tar.bz2"><b>[Release package]</b></a>,
-	<a href="2.0.1/index.html"><b>[Documentation]</b></a>}
+@row{<b>WiredTiger 2.1.0</b> (current),
+	<a href="releases/wiredtiger-2.1.0.tar.bz2"><b>[Release package]</b></a>,
+	<a href="2.1.0/index.html"><b>[Documentation]</b></a>}
 @row{<b>WiredTiger 1.6.6</b> (previous),
 	<a href="releases/wiredtiger-1.6.6.tar.bz2"><b>[Release package]</b></a>,
 	<a href="1.6.6/index.html"><b>[Documentation]</b></a>}
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 9c250824fee..e59b031a1ff 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -10,6 +10,26 @@ In the 2.1 release of WiredTiger WT_ITEM::size type has changed from
 resolve compile-time errors.
 </dd>
 
+<dt>WT_COMPRESSOR::compress_raw behavior</dt>
+<dd>
+In the 2.1 release of WiredTiger, the behavior of the compress_raw
+callback has changed so that it will only be retried if it returns
+\c EAGAIN.  If it returns zero and sets \c result_slotsp to zero,
+WiredTiger will assume that raw compression has failed and will fall
+back to calling WT_COMPRESSOR::compress.
+</dd>
+
+<dt>Transaction sync default setting</dt>
+<dd>
+In the 2.1 release of WiredTiger the ::wiredtiger_open \c transaction_sync
+configuration setting default value has changed from "dsync" to "fsync".
+This is due to enhancements to the group commit implementation in
+WiredTiger - which mean that greater throughput can be achieved with
+explicit "fsync" calls than by enabling "dsync" on a file handle.
+Applications that don't execute concurrent transactions may see better
+throughput with transaction_sync set to "dsync".
+</dd>
+
 @section version_20 Upgrading to Version 2.0
 <dl>
 
diff --git a/src/include/btmem.h b/src/include/btmem.h
index e4b30f03ab9..7f0bf280d5c 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -521,14 +521,9 @@ struct __wt_ref {
  * WT_MERGE_FULL_PAGE --
  * When the result of a merge contains more than this number of keys, it is
  * considered "done" and will not be merged again.
- *
- * WT_MERGE_MAX_REFS --
- * Don't complete merges that contain more than this number of keys, they tend
- * to generate pathological trees.
  */
 #define	WT_MERGE_STACK_MIN	3
 #define	WT_MERGE_FULL_PAGE	100
-#define	WT_MERGE_MAX_REFS	1000
 
 /*
  * WT_ROW --
diff --git a/src/include/btree.i b/src/include/btree.i
index fc9a73f4d9d..f09d05178ab 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -528,6 +528,41 @@ __wt_ref_info(WT_SESSION_IMPL *session, WT_PAGE *page,
 }
 
 /*
+ * __wt_eviction_force_check --
+ *	Check if a page matches the criteria for forced eviction.
+ */
+static inline int
+__wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BTREE *btree;
+
+	btree = S2BT(session);
+
+	/* Pages are usually small enough, check that first. */
+	if (page->memory_footprint < btree->maxmempage)
+		return (0);
+
+	/* Leaf pages only. */
+	if (page->type != WT_PAGE_COL_FIX &&
+	    page->type != WT_PAGE_COL_VAR &&
+	    page->type != WT_PAGE_ROW_LEAF)
+		return (0);
+
+	/* Eviction may be turned off, although that's rare. */
+	if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+		return (0);
+
+	/*
+	 * It's hard to imagine a page with a huge memory footprint that has
+	 * never been modified, but check to be sure.
+	 */
+	if (page->modify == NULL)
+		return (0);
+
+	return (1);
+}
+
+/*
  * __wt_page_release --
  *	Release a reference to a page.
  */
@@ -557,7 +592,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page)
 			return (ret);
 		}
 
-		ret = __wt_evict_page(session, page);
+		WT_TRET(__wt_evict_page(session, page));
 		if (ret == 0)
 			WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
 		else
@@ -642,43 +677,8 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
 }
 
 /*
- * __wt_eviction_force_check --
- *	Check if a page matches the criteria for forced eviction.
- */
-static inline int
-__wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
-	WT_BTREE *btree;
-
-	btree = S2BT(session);
-
-	/* Pages are usually small enough, check that first. */
-	if (page->memory_footprint < btree->maxmempage)
-		return (0);
-
-	/* Leaf pages only. */
-	if (page->type != WT_PAGE_COL_FIX &&
-	    page->type != WT_PAGE_COL_VAR &&
-	    page->type != WT_PAGE_ROW_LEAF)
-		return (0);
-
-	/* Eviction may be turned off, although that's rare. */
-	if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
-		return (0);
-
-	/*
-	 * It's hard to imagine a page with a huge memory footprint that has
-	 * never been modified, but check to be sure.
-	 */
-	if (page->modify == NULL)
-		return (0);
-
-	return (1);
-}
-
-/*
 * __wt_eviction_force_txn_check --
- *      Check if the current transaction permits forced eviction of a page.
+ *	Check if the current transaction permits forced eviction of a page.
  */
 static inline int
 __wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -702,7 +702,7 @@ __wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 /*
  * __wt_eviction_force --
- *      Forcefully evict a page, if possible.
+ *	Forcefully evict a page, if possible.
  */
 static inline int
 __wt_eviction_force(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -852,7 +852,7 @@ __wt_lex_compare_skip(
 
 /*
  * __wt_btree_mergeable --
- *      Determines whether the given page is a candidate for merging.
+ *	Determines whether the given page is a candidate for merging.
  */
 static inline int
 __wt_btree_mergeable(WT_PAGE *page)
diff --git a/src/include/stat.h b/src/include/stat.h
index 6717b4d081f..ea2a4068f96 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -182,6 +182,8 @@ struct __wt_connection_stats {
 	WT_STATS log_slot_transitions;
 	WT_STATS log_sync;
 	WT_STATS log_writes;
+	WT_STATS lsm_checkpoint_throttle;
+	WT_STATS lsm_merge_throttle;
 	WT_STATS lsm_rows_merged;
 	WT_STATS memory_allocation;
 	WT_STATS memory_free;
@@ -275,9 +277,11 @@ struct __wt_dsrc_stats {
 	WT_STATS cursor_search_near;
 	WT_STATS cursor_update;
 	WT_STATS cursor_update_bytes;
+	WT_STATS lsm_checkpoint_throttle;
 	WT_STATS lsm_chunk_count;
 	WT_STATS lsm_generation_max;
 	WT_STATS lsm_lookup_no_bloom;
+	WT_STATS lsm_merge_throttle;
 	WT_STATS rec_dictionary;
 	WT_STATS rec_overflow_key_internal;
 	WT_STATS rec_overflow_key_leaf;
diff --git a/src/include/txn.i b/src/include/txn.i
index fc0a4d2317f..cdfe697ee51 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -18,6 +18,8 @@ __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
 	WT_TXN *txn;
 
 	txn = &session->txn;
+	*opp = NULL;
+
 	WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
 	WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
 	    txn->mod_count + 1, &txn->mod));
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index be4474ed14f..b5634c9d205 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -855,7 +855,9 @@ struct __wt_session {
 	 * value can be created.  Must be larger than chunk_size., an integer
 	 * between 100MB and 10TB; default \c 5GB.}
 	 * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_size, the maximum size of the
-	 * in-memory chunk of an LSM tree., an integer between 512K and 500MB;
+	 * in-memory chunk of an LSM tree.  This limit is soft - it is possible
+	 * for chunks to be temporarily larger than this value.  This overrides
+	 * the \c memory_page_max setting., an integer between 512K and 500MB;
 	 * default \c 10MB.}
 	 * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_max, the
 	 * maximum number of chunks to include in a merge operation., an integer
@@ -872,7 +874,8 @@ struct __wt_session {
 	 * memory before being reconciled to disk.  The specified size will be
 	 * adjusted to a lower bound of <code>50 * leaf_page_max</code>. This
 	 * limit is soft - it is possible for pages to be temporarily larger
-	 * than this value., an integer between 512B and 10TB; default \c 5MB.}
+	 * than this value.  This setting is ignored for LSM trees\, see \c
+	 * chunk_size., an integer between 512B and 10TB; default \c 5MB.}
 	 * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
 	 * in bytes.  If non-zero\, schedule writes for dirty blocks belonging
 	 * to this object in the system buffer cache after that many bytes from
@@ -2133,14 +2136,19 @@ struct __wt_compressor {
 	 * set \c result_slotsp to the number of byte strings encoded and
 	 * \c result_lenp to the bytes needed for the encoded representation.
 	 *
-	 * WiredTiger repeatedly calls the callback function until all rows on
-	 * the page have been encoded.  There is no requirement the callback
-	 * encode any or all of the byte strings passed by WiredTiger.  If the
-	 * callback does not encode any of the byte strings, the callback must
-	 * set \c result_slotsp to 0.  In this case, WiredTiger will accumulate
-	 * more rows and repeat the call; if there are no more rows to
-	 * accumulate, WiredTiger writes the remaining rows without further
-	 * calls to the callback.
+	 * There is no requirement the callback encode any or all of the byte
+	 * strings passed by WiredTiger.  If the callback does not encode any
+	 * of the byte strings and compression should not be retried, the
+	 * callback should set \c result_slotsp to 0.
+	 *
+	 * If the callback does not encode any of the byte strings and
+	 * compression should be retried with additional byte strings, the
+	 * callback must return \c EAGAIN.  In that case, WiredTiger will
+	 * accumulate more rows and repeat the call.
+	 *
+	 * If there are no more rows to accumulate or the callback indicates
+	 * that it cannot be retried, WiredTiger writes the remaining rows
+	 * using \c WT_COMPRESSOR::compress.
 	 *
 	 * On entry, \c final is zero if there are more rows to be written as
 	 * part of this page (if there will be additional data provided to the
@@ -2580,42 +2588,46 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
 #define	WT_STAT_CONN_LOG_SYNC				1063
 /*! log: log write operations */
 #define	WT_STAT_CONN_LOG_WRITES				1064
+/*! sleep for LSM checkpoint throttle */
+#define	WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE		1065
+/*! sleep for LSM merge throttle */
+#define	WT_STAT_CONN_LSM_MERGE_THROTTLE			1066
 /*! rows merged in an LSM tree */
-#define	WT_STAT_CONN_LSM_ROWS_MERGED			1065
+#define	WT_STAT_CONN_LSM_ROWS_MERGED			1067
 /*! memory allocations */
-#define	WT_STAT_CONN_MEMORY_ALLOCATION			1066
+#define	WT_STAT_CONN_MEMORY_ALLOCATION			1068
 /*! memory frees */
-#define	WT_STAT_CONN_MEMORY_FREE			1067
+#define	WT_STAT_CONN_MEMORY_FREE			1069
 /*! memory re-allocations */
-#define	WT_STAT_CONN_MEMORY_GROW			1068
+#define	WT_STAT_CONN_MEMORY_GROW			1070
 /*! total read I/Os */
-#define	WT_STAT_CONN_READ_IO				1069
+#define	WT_STAT_CONN_READ_IO				1071
 /*! page reconciliation calls */
-#define	WT_STAT_CONN_REC_PAGES				1070
+#define	WT_STAT_CONN_REC_PAGES				1072
 /*! page reconciliation calls for eviction */
-#define	WT_STAT_CONN_REC_PAGES_EVICTION			1071
+#define	WT_STAT_CONN_REC_PAGES_EVICTION			1073
 /*! reconciliation failed because an update could not be included */
-#define	WT_STAT_CONN_REC_SKIPPED_UPDATE			1072
+#define	WT_STAT_CONN_REC_SKIPPED_UPDATE			1074
 /*! pthread mutex shared lock read-lock calls */
-#define	WT_STAT_CONN_RWLOCK_READ			1073
+#define	WT_STAT_CONN_RWLOCK_READ			1075
 /*! pthread mutex shared lock write-lock calls */
-#define	WT_STAT_CONN_RWLOCK_WRITE			1074
+#define	WT_STAT_CONN_RWLOCK_WRITE			1076
 /*! open cursor count */
-#define	WT_STAT_CONN_SESSION_CURSOR_OPEN		1075
+#define	WT_STAT_CONN_SESSION_CURSOR_OPEN		1077
 /*! transactions */
-#define	WT_STAT_CONN_TXN_BEGIN				1076
+#define	WT_STAT_CONN_TXN_BEGIN				1078
 /*! transaction checkpoints */
-#define	WT_STAT_CONN_TXN_CHECKPOINT			1077
+#define	WT_STAT_CONN_TXN_CHECKPOINT			1079
 /*! transaction checkpoint currently running */
-#define	WT_STAT_CONN_TXN_CHECKPOINT_RUNNING		1078
+#define	WT_STAT_CONN_TXN_CHECKPOINT_RUNNING		1080
 /*! transactions committed */
-#define	WT_STAT_CONN_TXN_COMMIT				1079
+#define	WT_STAT_CONN_TXN_COMMIT				1081
 /*! transaction failures due to cache overflow */
-#define	WT_STAT_CONN_TXN_FAIL_CACHE			1080
+#define	WT_STAT_CONN_TXN_FAIL_CACHE			1082
 /*! transactions rolled-back */
-#define	WT_STAT_CONN_TXN_ROLLBACK			1081
+#define	WT_STAT_CONN_TXN_ROLLBACK			1083
 /*! total write I/Os */
-#define	WT_STAT_CONN_WRITE_IO				1082
+#define	WT_STAT_CONN_WRITE_IO				1084
 
 /*!
  * @}
@@ -2759,43 +2771,47 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
 #define	WT_STAT_DSRC_CURSOR_UPDATE			2066
 /*! cursor-update value bytes updated */
 #define	WT_STAT_DSRC_CURSOR_UPDATE_BYTES		2067
+/*! sleep for LSM checkpoint throttle */
+#define	WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE		2068
 /*! chunks in the LSM tree */
-#define	WT_STAT_DSRC_LSM_CHUNK_COUNT			2068
+#define	WT_STAT_DSRC_LSM_CHUNK_COUNT			2069
 /*! highest merge generation in the LSM tree */
-#define	WT_STAT_DSRC_LSM_GENERATION_MAX			2069
+#define	WT_STAT_DSRC_LSM_GENERATION_MAX			2070
 /*! queries that could have benefited from a Bloom filter that did not
  * exist */
-#define	WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM		2070
+#define	WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM		2071
+/*! sleep for LSM merge throttle */
+#define	WT_STAT_DSRC_LSM_MERGE_THROTTLE			2072
 /*! reconciliation dictionary matches */
-#define	WT_STAT_DSRC_REC_DICTIONARY			2071
+#define	WT_STAT_DSRC_REC_DICTIONARY			2073
 /*! reconciliation internal-page overflow keys */
-#define	WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL		2072
+#define	WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL		2074
 /*! reconciliation leaf-page overflow keys */
-#define	WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF		2073
+#define	WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF		2075
 /*! reconciliation overflow values written */
-#define	WT_STAT_DSRC_REC_OVERFLOW_VALUE			2074
+#define	WT_STAT_DSRC_REC_OVERFLOW_VALUE			2076
 /*! reconciliation pages deleted */
-#define	WT_STAT_DSRC_REC_PAGE_DELETE			2075
+#define	WT_STAT_DSRC_REC_PAGE_DELETE			2077
 /*! reconciliation pages merged */
-#define	WT_STAT_DSRC_REC_PAGE_MERGE			2076
+#define	WT_STAT_DSRC_REC_PAGE_MERGE			2078
 /*! page reconciliation calls */
-#define	WT_STAT_DSRC_REC_PAGES				2077
+#define	WT_STAT_DSRC_REC_PAGES				2079
 /*! page reconciliation calls for eviction */
-#define	WT_STAT_DSRC_REC_PAGES_EVICTION			2078
+#define	WT_STAT_DSRC_REC_PAGES_EVICTION			2080
 /*! reconciliation failed because an update could not be included */
-#define	WT_STAT_DSRC_REC_SKIPPED_UPDATE			2079
+#define	WT_STAT_DSRC_REC_SKIPPED_UPDATE			2081
 /*! reconciliation internal pages split */
-#define	WT_STAT_DSRC_REC_SPLIT_INTERNAL			2080
+#define	WT_STAT_DSRC_REC_SPLIT_INTERNAL			2082
 /*! reconciliation leaf pages split */
-#define	WT_STAT_DSRC_REC_SPLIT_LEAF			2081
+#define	WT_STAT_DSRC_REC_SPLIT_LEAF			2083
 /*! reconciliation maximum splits for a page */
-#define	WT_STAT_DSRC_REC_SPLIT_MAX			2082
+#define	WT_STAT_DSRC_REC_SPLIT_MAX			2084
 /*! object compaction */
-#define	WT_STAT_DSRC_SESSION_COMPACT			2083
+#define	WT_STAT_DSRC_SESSION_COMPACT			2085
 /*! open cursor count */
-#define	WT_STAT_DSRC_SESSION_CURSOR_OPEN		2084
+#define	WT_STAT_DSRC_SESSION_CURSOR_OPEN		2086
 /*! update conflicts */
-#define	WT_STAT_DSRC_TXN_UPDATE_CONFLICT		2085
+#define	WT_STAT_DSRC_TXN_UPDATE_CONFLICT		2087
 /*! @} */
 /*
  * Statistics section: END
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 618257469ee..c50380b91b9 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -1115,9 +1115,18 @@ __clsm_put(WT_SESSION_IMPL *session,
 	 * don't worry about protecting access.
 	 */
 	if (++clsm->primary_chunk->count % 100 == 0 &&
-	    lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0)
+	    lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
+		WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+		    lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+		WT_STAT_FAST_CONN_INCRV(session,
+		    lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+		WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+		    lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+		WT_STAT_FAST_CONN_INCRV(session,
+		    lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
 		__wt_sleep(0,
 		    lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
+	}
 
 	/*
 	 * In LSM there are multiple btrees active at one time. The tree
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 3aec49da252..a830295908f 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -407,6 +407,10 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
 	 * Set up the config for each chunk.  If possible, avoid high latencies
 	 * from fsync by flushing the cache every 8MB (will be overridden by
 	 * any application setting).
+	 *
+	 * Also make the memory_page_max double the chunk size, so application
+	 * threads don't immediately try to force evict the chunk when the
+	 * worker thread clears the NO_EVICTION flag.
 	 */
 	tmpconfig = "";
 #ifdef HAVE_SYNC_FILE_RANGE
@@ -415,7 +419,8 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
 #endif
 	WT_ERR(__wt_scr_alloc(session, 0, &buf));
 	WT_ERR(__wt_buf_fmt(session, buf,
-	    "%s%s,key_format=u,value_format=u", tmpconfig, config));
+	    "%s%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
+	    tmpconfig, config, 2 * lsm_tree->chunk_max));
 	lsm_tree->file_config = __wt_buf_steal(session, buf);
 
 	/* Create the first chunk and flush the metadata. */
diff --git a/src/support/stat.c b/src/support/stat.c
index 621c79220a4..c0caecbe606 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -93,11 +93,14 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
 	stats->cursor_search_near.desc = "cursor search near calls";
 	stats->cursor_update.desc = "cursor update calls";
 	stats->cursor_update_bytes.desc = "cursor-update value bytes updated";
+	stats->lsm_checkpoint_throttle.desc =
+	    "sleep for LSM checkpoint throttle";
 	stats->lsm_chunk_count.desc = "chunks in the LSM tree";
 	stats->lsm_generation_max.desc =
 	    "highest merge generation in the LSM tree";
 	stats->lsm_lookup_no_bloom.desc =
 	    "queries that could have benefited from a Bloom filter that did not exist";
+	stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle";
 	stats->rec_dictionary.desc = "reconciliation dictionary matches";
 	stats->rec_overflow_key_internal.desc =
 	    "reconciliation internal-page overflow keys";
@@ -194,9 +197,11 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg)
 	stats->cursor_search_near.v = 0;
 	stats->cursor_update.v = 0;
 	stats->cursor_update_bytes.v = 0;
+	stats->lsm_checkpoint_throttle.v = 0;
 	stats->lsm_chunk_count.v = 0;
 	stats->lsm_generation_max.v = 0;
 	stats->lsm_lookup_no_bloom.v = 0;
+	stats->lsm_merge_throttle.v = 0;
 	stats->rec_dictionary.v = 0;
 	stats->rec_overflow_key_internal.v = 0;
 	stats->rec_overflow_key_leaf.v = 0;
@@ -280,9 +285,11 @@ __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
 	p->cursor_search_near.v += c->cursor_search_near.v;
 	p->cursor_update.v += c->cursor_update.v;
 	p->cursor_update_bytes.v += c->cursor_update_bytes.v;
+	p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
 	if (c->lsm_generation_max.v > p->lsm_generation_max.v)
 	    p->lsm_generation_max.v = c->lsm_generation_max.v;
 	p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
+	p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
 	p->rec_dictionary.v += c->rec_dictionary.v;
 	p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
 	p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
@@ -389,6 +396,9 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
 	    "log: consolidated slot join transitions";
 	stats->log_sync.desc = "log: log sync operations";
 	stats->log_writes.desc = "log: log write operations";
+	stats->lsm_checkpoint_throttle.desc =
+	    "sleep for LSM checkpoint throttle";
+	stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle";
 	stats->lsm_rows_merged.desc = "rows merged in an LSM tree";
 	stats->memory_allocation.desc = "memory allocations";
 	stats->memory_free.desc = "memory frees";
@@ -479,6 +489,8 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
 	stats->log_slot_transitions.v = 0;
 	stats->log_sync.v = 0;
 	stats->log_writes.v = 0;
+	stats->lsm_checkpoint_throttle.v = 0;
+	stats->lsm_merge_throttle.v = 0;
 	stats->lsm_rows_merged.v = 0;
 	stats->memory_allocation.v = 0;
 	stats->memory_free.v = 0;
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 8cafc78c11f..f4cd3a94a15 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -24,7 +24,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
 
 	/*
 	 * This function exists as a place for this comment: named checkpoints
-	 * are only supported on file objects, and not on LSM trees or Memrata
+	 * are only supported on file objects, and not on LSM trees or Helium
 	 * devices.  If a target list is configured for the checkpoint, this
 	 * function is called with each target list entry; check the entry to
 	 * make sure it's backed by a file.  If no target list is configured,
@@ -148,11 +148,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
 	WT_DATA_SOURCE *dsrc;
 
 	/*
-	 * A place-holder, to support Memrata devices: we assume calling the
+	 * A place-holder, to support Helium devices: we assume calling the
 	 * underlying data-source session checkpoint function is sufficient to
 	 * checkpoint all objects in the data source, open or closed, and we
 	 * don't attempt to optimize the checkpoint of individual targets.
+	 * Those assumptions are correct for the Helium device, but it's not
+	 * Those assumptions is correct for the Helium device, but it's not
 	 * necessarily going to be true for other data sources.
 	 *
 	 * It's not difficult to support data-source checkpoints of individual
author	Michael Cahill <michael.cahill@wiredtiger.com>	2014-02-07 18:16:22 +1100
committer	Michael Cahill <michael.cahill@wiredtiger.com>	2014-02-07 18:16:22 +1100
commit	3bcd2a96e6546419a871dba4a35a2e2a3453adb9 (patch)
tree	d93f25e4d576e47adbf78b352c910e7354d68639 /src
parent	3b6d36874f716625c3f8c867f9185c829931472e (diff)
parent	0f319b1107960bdeb7d617d1797dd992029bb1df (diff)
download	mongo-3bcd2a96e6546419a871dba4a35a2e2a3453adb9.tar.gz