author     Alex Gorrod <alexg@wiredtiger.com>  2014-12-03 13:36:04 +1100
committer  Alex Gorrod <alexg@wiredtiger.com>  2014-12-03 13:36:04 +1100
commit     0512f27b669b8b1a1f4d9f78d38708efd8cdcd87 (patch)
tree       eaf8cfd6effc5823cd6dd275a22394fb01a4f5f9  /src/reconcile
parent     2bd317c731c00ca7b82663014e5662517e344ba4 (diff)
download   mongo-0512f27b669b8b1a1f4d9f78d38708efd8cdcd87.tar.gz
Move eviction and reconciliation files into their own subdirs.
Diffstat (limited to 'src/reconcile')
-rw-r--r--  src/reconcile/rec_track.c   904
-rw-r--r--  src/reconcile/rec_write.c  5528
2 files changed, 6432 insertions, 0 deletions
diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c
new file mode 100644
index 00000000000..92282393a23
--- /dev/null
+++ b/src/reconcile/rec_track.c
@@ -0,0 +1,904 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Estimated memory cost for a structure on the overflow lists, the size of
+ * the structure plus two pointers (assume the average skip list depth is 2).
+ */
+#define WT_OVFL_SIZE(s) \
+ (sizeof(s) + 2 * sizeof(void *))
+
+/*
+ * __ovfl_track_init --
+ * Initialize the overflow tracking structure.
+ */
+static int
+__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (__wt_calloc_def(session, 1, &page->modify->ovfl_track));
+}
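
The WT_OVFL_SIZE estimate above (the structure plus two pointers, assuming an average skip-list depth of 2) is only an accounting approximation. As a rough standalone illustration of the same idea, with hypothetical names and not WiredTiger code:

#include <stddef.h>

/* Hypothetical skip-list node: fixed header plus a variable-depth pointer array. */
struct ovfl_node {
	size_t value_size;
	struct ovfl_node *next[];	/* allocated to the node's chosen depth */
};

/* Estimated footprint: the structure plus two pointers (average depth of 2). */
#define OVFL_NODE_SIZE_EST(s)	(sizeof(s) + 2 * sizeof(void *))

/* Approximate memory cost of tracking one entry with a value of this size. */
static inline size_t
ovfl_node_cost(size_t value_size)
{
	return (OVFL_NODE_SIZE_EST(struct ovfl_node) + value_size);
}
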
+
+/*
+ * __ovfl_discard_verbose --
+ * Dump information about a discard overflow record.
+ */
+static int
+__ovfl_discard_verbose(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag)
+{
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 512, &tmp));
+
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "discard: %s%s%p %s",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(session, unpack->data, unpack->size, tmp)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_discard_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_discard_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_OVFL_TRACK *track;
+ size_t i;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp)
+ (void)__ovfl_discard_verbose(session, page, *cellp, "dump");
+}
+#endif
+
+/*
+ * __ovfl_discard_wrapup --
+ * Resolve the page's overflow discard list after a page is written.
+ */
+static int
+__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+ uint32_t i;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp) {
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_discard_verbose(
+ session, page, *cellp, "free"));
+
+ /* Discard each cell's overflow item. */
+ WT_RET(__wt_ovfl_discard(session, *cellp));
+ }
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+
+ return (ret);
+}
+
+/*
+ * __ovfl_discard_wrapup_err --
+ * Resolve the page's overflow discard list after an error occurs.
+ */
+static int
+__ovfl_discard_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TRACK *track;
+
+ track = page->modify->ovfl_track;
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_discard_add --
+ * Add a new entry to the page's list of overflow records that have been
+ * discarded.
+ */
+int
+__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
+{
+ WT_OVFL_TRACK *track;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ track = page->modify->ovfl_track;
+ WT_RET(__wt_realloc_def(session, &track->discard_allocated,
+ track->discard_entries + 1, &track->discard));
+ track->discard[track->discard_entries++] = cell;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_discard_verbose(session, page, cell, "add"));
+
+ return (0);
+}
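
The add path above is a grow-and-append pattern: make sure the array has room for one more slot, store the new entry, and bump the count. A minimal sketch of the same pattern using plain libc allocation (hypothetical names, not the WiredTiger allocator) is:

#include <stdlib.h>

/* Append an item, doubling the backing array whenever it fills up. */
static int
list_append(void ***listp, size_t *entriesp, size_t *allocatedp, void *item)
{
	void **list;
	size_t n;

	if (*entriesp == *allocatedp) {
		n = *allocatedp == 0 ? 8 : *allocatedp * 2;
		if ((list = realloc(*listp, n * sizeof(void *))) == NULL)
			return (-1);
		*listp = list;
		*allocatedp = n;
	}
	(*listp)[(*entriesp)++] = item;
	return (0);
}
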
+
+/*
+ * __wt_ovfl_discard_free --
+ * Free the page's list of discarded overflow record addresses.
+ */
+void
+__wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+}
+
+/*
+ * __ovfl_reuse_verbose --
+ * Dump information about a reuse overflow record.
+ */
+static int
+__ovfl_reuse_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "reuse: %s%s%p %s (%s%s%s) {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp),
+ F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "",
+ F_ISSET(reuse, WT_OVFL_REUSE_INUSE) &&
+ F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "",
+ F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "",
+ WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_reuse_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ for (reuse = head[0]; reuse != NULL; reuse = reuse->next[0])
+ (void)__ovfl_reuse_verbose(session, page, reuse, "dump");
+}
+#endif
+
+/*
+ * __ovfl_reuse_skip_search --
+ * Return the first, not in-use, matching value in the overflow reuse list.
+ */
+static WT_OVFL_REUSE *
+__ovfl_reuse_skip_search(
+ WT_OVFL_REUSE **head, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e, *next;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Values are not unique, and it's possible to have long lists
+ * of identical overflow items. (We've seen it in benchmarks.)
+ * Move through a list of identical items at the current level
+ * as long as the next one is in-use, otherwise, drop down a
+ * level. When at the bottom level, return items if reusable,
+ * else NULL.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp == 0 && (*e)->value_size == value_size) {
+ if (i == 0)
+ return (F_ISSET(*e,
+ WT_OVFL_REUSE_INUSE) ? NULL : *e);
+ if ((next = (*e)->next[i]) == NULL ||
+ !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
+ next->value_size != len || memcmp(
+ WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
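
The loop above is the standard skip-list descent: start at the highest level, keep moving right while the current entry still sorts before (or could still equal) the search value, and drop down a level otherwise. A minimal, self-contained version of the same walk over integer keys, assuming a fixed maximum depth and hypothetical types, looks like this:

#include <stddef.h>

#define SKIP_MAXDEPTH	10

struct skip_node {
	int key;
	struct skip_node *next[SKIP_MAXDEPTH];
};

/* Return the first node with a matching key, or NULL if there is none. */
static struct skip_node *
skip_search(struct skip_node **head, int key)
{
	struct skip_node **e;
	int i;

	for (i = SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL || (*e)->key > key) {
			--i;			/* Drop down a level */
			--e;
			continue;
		}
		if ((*e)->key == key)
			return (*e);
		e = &(*e)->next[i];		/* Keep going at this level */
	}
	return (NULL);
}
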
+
+/*
+ * __ovfl_reuse_skip_search_stack --
+ * Search an overflow reuse skiplist, returning an insert/remove stack.
+ */
+static void
+__ovfl_reuse_skip_search_stack(WT_OVFL_REUSE **head,
+ WT_OVFL_REUSE ***stack, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ stack[i--] = e--;
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size))
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+ }
+}
+
+/*
+ * __ovfl_reuse_wrapup --
+ * Resolve the page's overflow reuse list after a page is written.
+ */
+static int
+__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ size_t incr, decr;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that aren't in-use, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record without an in-use flag, clear
+ * the flags for the next run.
+ *
+ * As part of the pass through the lowest level, figure out how much
+ * space we added/subtracted from the page, and update its footprint.
+ * We don't get it exactly correct because we don't know the depth of
+ * the skiplist here, but it's close enough, and figuring out the
+ * memory footprint change in the reconciliation wrapup code means
+ * fewer atomic updates and less code overall.
+ */
+ incr = decr = 0;
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED))
+ incr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ F_CLR(reuse,
+ WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));
+ decr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_RET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+
+ if (incr > decr)
+ __wt_cache_page_inmem_incr(session, page, incr - decr);
+ if (decr > incr)
+ __wt_cache_page_inmem_decr(session, page, decr - incr);
+ return (0);
+}
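
The wrapup above prunes the skip list in two passes: the upper levels are walked first to unlink doomed entries (no per-element search needed), and the lowest level is then walked to unlink and free them. A stripped-down sketch of that shape, removing every node that fails a keep flag (hypothetical types), is:

#include <stdbool.h>
#include <stdlib.h>

#define SKIP_MAXDEPTH	10

struct skip_node {
	bool keep;				/* Survives this cleanup pass */
	struct skip_node *next[SKIP_MAXDEPTH];
};

/* Remove and free every node not marked keep. */
static void
skip_prune(struct skip_node **head)
{
	struct skip_node **e, *node;
	int i;

	/* First, fix up the upper-level links. */
	for (i = SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; *e != NULL;) {
			if ((*e)->keep)
				e = &(*e)->next[i];
			else
				*e = (*e)->next[i];
		}

	/* Second, unlink and free along the lowest level. */
	for (e = &head[0]; (node = *e) != NULL;) {
		if (node->keep) {
			e = &node->next[0];
			continue;
		}
		*e = node->next[0];
		free(node);
	}
}
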
+
+/*
+ * __ovfl_reuse_wrapup_err --
+ * Resolve the page's overflow reuse list after an error occurs.
+ */
+static int
+__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that were just added, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record with a just-added flag, clear the
+ * flags for the next run.
+ */
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
+ F_CLR(reuse, WT_OVFL_REUSE_INUSE);
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_TRET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_ovfl_reuse_search --
+ * Search the page's list of overflow records for a match.
+ */
+int
+__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
+ uint8_t **addrp, size_t *addr_sizep,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ *addrp = NULL;
+ *addr_sizep = 0;
+
+ if (page->modify->ovfl_track == NULL)
+ return (0);
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * The search function returns the first matching record in the list
+ * which does not have the in-use flag set, or NULL.
+ */
+ if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
+ return (0);
+
+ *addrp = WT_OVFL_REUSE_ADDR(reuse);
+ *addr_sizep = reuse->addr_size;
+ F_SET(reuse, WT_OVFL_REUSE_INUSE);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
+ return (1);
+}
+
+/*
+ * __wt_ovfl_reuse_add --
+ * Add a new entry to the page's list of overflow records tracked for
+ * reuse.
+ */
+int
+__wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse, **stack[WT_SKIP_MAXDEPTH];
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_REUSE structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_REUSE structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_REUSE) +
+ skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &reuse));
+ p = (uint8_t *)reuse +
+ sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *);
+ reuse->addr_offset = (uint8_t)WT_PTRDIFF(p, reuse);
+ reuse->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ p += addr_size;
+ reuse->value_offset = WT_PTRDIFF32(p, reuse);
+ reuse->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ F_SET(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_reuse_skip_search_stack(head, stack, value, value_size);
+ for (i = 0; i < skipdepth; ++i) {
+ reuse->next[i] = *stack[i];
+ *stack[i] = reuse;
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "add"));
+
+ return (0);
+}
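
The allocation above packs everything into a single chunk of memory: the structure, the skip-list next pointers, then the address and the value, with byte-sized offsets recording where each piece starts. A self-contained sketch of that layout (hypothetical names, simplified error handling, and assuming the header plus pointers stays under 255 bytes) is:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct packed_entry {
	uint8_t addr_offset;		/* Offset of the address from the start */
	uint8_t addr_size;		/* Address length (cookies fit in a byte) */
	uint32_t value_offset;		/* Offset of the value from the start */
	uint32_t value_size;
	struct packed_entry *next[];	/* Skip-list pointers, then addr, then value */
};

#define ENTRY_ADDR(e)	((uint8_t *)(e) + (e)->addr_offset)
#define ENTRY_VALUE(e)	((void *)((uint8_t *)(e) + (e)->value_offset))

static struct packed_entry *
packed_entry_alloc(unsigned depth,
    const uint8_t *addr, size_t addr_size, const void *value, size_t value_size)
{
	struct packed_entry *e;
	uint8_t *p;
	size_t size;

	size = sizeof(*e) + depth * sizeof(e->next[0]) + addr_size + value_size;
	if ((e = calloc(1, size)) == NULL)
		return (NULL);

	p = (uint8_t *)e + sizeof(*e) + depth * sizeof(e->next[0]);
	e->addr_offset = (uint8_t)(p - (uint8_t *)e);
	e->addr_size = (uint8_t)addr_size;
	memcpy(p, addr, addr_size);
	p += addr_size;
	e->value_offset = (uint32_t)(p - (uint8_t *)e);
	e->value_size = (uint32_t)value_size;
	memcpy(p, value, value_size);
	return (e);
}
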
+
+/*
+ * __wt_ovfl_reuse_free --
+ * Free the page's list of overflow records tracked for reuse.
+ */
+void
+__wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE *reuse;
+ WT_PAGE_MODIFY *mod;
+ void *next;
+
+ mod = page->modify;
+ if (mod == NULL || mod->ovfl_track == NULL)
+ return;
+
+ for (reuse = mod->ovfl_track->ovfl_reuse[0];
+ reuse != NULL; reuse = next) {
+ next = reuse->next[0];
+ __wt_free(session, reuse);
+ }
+}
+
+/*
+ * __ovfl_txnc_verbose --
+ * Dump information about a transaction-cached overflow record.
+ */
+static int
+__ovfl_txnc_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp),
+ txnc->current,
+ WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_txnc_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_txnc_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **head, *txnc;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ for (txnc = head[0]; txnc != NULL; txnc = txnc->next[0])
+ (void)__ovfl_txnc_verbose(session, page, txnc, "dump");
+}
+#endif
+
+/*
+ * __ovfl_txnc_skip_search --
+ * Return the first matching addr in the overflow transaction-cache list.
+ */
+static WT_OVFL_TXNC *
+__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size)
+{
+ WT_OVFL_TXNC **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ len = WT_MIN((*e)->addr_size, addr_size);
+ cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
+ if (cmp == 0 && (*e)->addr_size == addr_size)
+ return (*e);
+
+ /*
+ * If the skiplist address is larger than the search address, or
+ * they compare equally and the skiplist address is longer than
+ * the search address, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __ovfl_txnc_skip_search_stack --
+ * Search an overflow transaction-cache skiplist, returning an
+ * insert/remove stack.
+ */
+static void
+__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head,
+ WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size)
+{
+ WT_OVFL_TXNC **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ stack[i--] = e--;
+ continue;
+ }
+
+ /*
+ * If the skiplist addr is larger than the search addr, or
+ * they compare equally and the skiplist addr is longer than
+ * the search addr, drop down a level, otherwise continue on
+ * this level.
+ */
+ len = WT_MIN((*e)->addr_size, addr_size);
+ cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
+ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size))
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+ }
+}
+
+/*
+ * __ovfl_txnc_wrapup --
+ * Resolve the page's transaction-cache list.
+ */
+static int
+__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **e, **head, *txnc;
+ size_t decr;
+ int i;
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /*
+ * Discard any transaction-cache records with transaction IDs earlier
+ * than any in the system.
+ *
+ * First, walk the overflow transaction-cache skip lists (except for
+ * the lowest level), fixing up links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!__wt_txn_visible_all(session, (*e)->current)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /* Second, discard any no longer needed transaction-cache records. */
+ decr = 0;
+ for (e = &head[0]; (txnc = *e) != NULL;) {
+ if (!__wt_txn_visible_all(session, txnc->current)) {
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ decr += WT_OVFL_SIZE(WT_OVFL_TXNC) +
+ txnc->addr_size + txnc->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_txnc_verbose(session, page, txnc, "free"));
+ __wt_free(session, txnc);
+ }
+
+ if (decr != 0)
+ __wt_cache_page_inmem_decr(session, page, decr);
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_search --
+ * Search the page's list of transaction-cache overflow records for a
+ * match.
+ */
+int
+__wt_ovfl_txnc_search(
+ WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+ WT_OVFL_TXNC **head, *txnc;
+
+ if (page->modify->ovfl_track == NULL)
+ return (WT_NOTFOUND);
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ if ((txnc = __ovfl_txnc_skip_search(head, addr, addr_size)) == NULL)
+ return (WT_NOTFOUND);
+
+ store->data = WT_OVFL_TXNC_VALUE(txnc);
+ store->size = txnc->value_size;
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_add --
+ * Add a new entry to the page's list of transaction-cached overflow
+ * records.
+ */
+int
+__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_TXNC structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_TXNC) +
+ skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &txnc));
+ p = (uint8_t *)txnc +
+ sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
+ txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
+ txnc->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ p += addr_size;
+ txnc->value_offset = WT_PTRDIFF32(p, txnc);
+ txnc->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ txnc->current = __wt_txn_new_id(session);
+
+ __wt_cache_page_inmem_incr(session, page,
+ WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
+ for (i = 0; i < skipdepth; ++i) {
+ txnc->next[i] = *stack[i];
+ *stack[i] = txnc;
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_free --
+ * Free the page's list of transaction-cached overflow records.
+ */
+void
+__wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC *txnc;
+ WT_PAGE_MODIFY *mod;
+ void *next;
+
+ mod = page->modify;
+ if (mod == NULL || mod->ovfl_track == NULL)
+ return;
+
+ for (txnc = mod->ovfl_track->ovfl_txnc[0];
+ txnc != NULL; txnc = next) {
+ next = txnc->next[0];
+ __wt_free(session, txnc);
+ }
+}
+
+/*
+ * __wt_ovfl_track_wrapup --
+ * Resolve the page's overflow tracking on reconciliation success.
+ */
+int
+__wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return (0);
+
+ track = page->modify->ovfl_track;
+ if (track->discard != NULL)
+ WT_RET(__ovfl_discard_wrapup(session, page));
+
+ if (track->ovfl_reuse[0] != NULL)
+ WT_RET(__ovfl_reuse_wrapup(session, page));
+
+ if (track->ovfl_txnc[0] != NULL) {
+ WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+ ret = __ovfl_txnc_wrapup(session, page);
+ WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+ }
+ return (ret);
+}
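
The transaction-cache wrapup above is bracketed by a lock: take it, run the wrapup, always release it, and return the first error observed. The same pattern can be sketched on its own with a POSIX rwlock (not the WiredTiger lock API):

#include <pthread.h>

/* Run fn(arg) under a write lock, returning the first non-zero error code. */
static int
locked_call(pthread_rwlock_t *lock, int (*fn)(void *), void *arg)
{
	int ret, tret;

	if ((ret = pthread_rwlock_wrlock(lock)) != 0)
		return (ret);
	ret = fn(arg);
	if ((tret = pthread_rwlock_unlock(lock)) != 0 && ret == 0)
		ret = tret;			/* Keep the first error */
	return (ret);
}
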
+
+/*
+ * __wt_ovfl_track_wrapup_err --
+ * Resolve the page's overflow tracking on reconciliation error.
+ */
+int
+__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return (0);
+
+ track = page->modify->ovfl_track;
+ if (track->discard != NULL)
+ WT_RET(__ovfl_discard_wrapup_err(session, page));
+
+ if (track->ovfl_reuse[0] != NULL)
+ WT_RET(__ovfl_reuse_wrapup_err(session, page));
+
+ if (track->ovfl_txnc[0] != NULL) {
+ WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+ ret = __ovfl_txnc_wrapup(session, page);
+ WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+ }
+ return (ret);
+}
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
new file mode 100644
index 00000000000..c72447ae841
--- /dev/null
+++ b/src/reconcile/rec_write.c
@@ -0,0 +1,5528 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY;
+struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
+struct __rec_kv; typedef struct __rec_kv WT_KV;
+
+/*
+ * Reconciliation is the process of taking an in-memory page, walking each entry
+ * in the page, building a backing disk image in a temporary buffer representing
+ * that information, and writing that buffer to disk. What could be simpler?
+ *
+ * WT_RECONCILE --
+ * Information tracking a single page reconciliation.
+ */
+typedef struct {
+ WT_REF *ref; /* Page being reconciled */
+ WT_PAGE *page;
+ uint32_t flags; /* Caller's configuration */
+
+ WT_ITEM dsk; /* Temporary disk-image buffer */
+
+ /* Track whether all changes to the page are written. */
+ uint64_t max_txn;
+ uint64_t skipped_txn;
+ uint32_t orig_write_gen;
+
+ /*
+ * If page updates are skipped because they are as yet unresolved, or
+ * the page has updates we cannot discard, the page is left "dirty":
+ * the page cannot be discarded and a subsequent reconciliation will
+ * be necessary to discard the page.
+ */
+ int leave_dirty;
+
+ /*
+ * Raw compression (don't get me started, as if normal reconciliation
+ * wasn't bad enough). If an application wants absolute control over
+ * what gets written to disk, we give it a list of byte strings and it
+ * gives us back an image that becomes a file block. Because we don't
+ * know the number of items we're storing in a block until we've done
+ * a lot of work, we turn off most compression: dictionary, copy-cell,
+ * prefix and row-store internal page suffix compression are all off.
+ */
+ int raw_compression;
+ uint32_t raw_max_slots; /* Raw compression array sizes */
+ uint32_t *raw_entries; /* Raw compression slot entries */
+ uint32_t *raw_offsets; /* Raw compression slot offsets */
+ uint64_t *raw_recnos; /* Raw compression recno count */
+ WT_ITEM raw_destination; /* Raw compression destination buffer */
+
+ /*
+ * Track if reconciliation has seen any overflow items. If a leaf page
+ * with no overflow items is written, the parent page's address cell is
+ * set to the leaf-no-overflow type. This means we can delete the leaf
+ * page without reading it because we don't have to discard any overflow
+ * items it might reference.
+ *
+ * The test is per-page reconciliation, that is, once we see an
+ * overflow item on the page, all subsequent leaf pages written for the
+ * page will not be leaf-no-overflow type, regardless of whether or not
+ * they contain overflow items. In other words, leaf-no-overflow is not
+ * guaranteed to be set on every page that doesn't contain an overflow
+ * item, only that if it is set, the page contains no overflow items.
+ *
+ * The reason is because of raw compression: there's no easy/fast way to
+ * figure out if the rows selected by raw compression included overflow
+ * items, and the optimization isn't worth another pass over the data.
+ */
+ int ovfl_items;
+
+ /*
+ * Track if reconciliation of a row-store leaf page has seen empty (zero
+ * length) values. We don't write out anything for empty values, so if
+ * there are empty values on a page, we have to make two passes over the
+ * page when it's read to figure out how many keys it has, expensive in
+ * the common case of no empty values and (entries / 2) keys. Likewise,
+ * a page with only empty values is another common data set, and keys on
+ * that page will be equal to the number of entries. In both cases, set
+ * a flag in the page's on-disk header.
+ *
+ * The test is per-page reconciliation as described above for the
+ * overflow-item test.
+ */
+ int all_empty_value, any_empty_value;
+
+ /*
+ * Reconciliation gets tricky if we have to split a page, which happens
+ * when the disk image we create exceeds the page type's maximum disk
+ * image size.
+ *
+ * First, the sizes of the page we're building. If WiredTiger is doing
+ * page layout, page_size is the same as page_size_max. We accumulate
+ * the maximum page size of raw data and when we reach that size, we
+ * split the page into multiple chunks, eventually compressing those
+ * chunks. When the application is doing page layout (raw compression
+ * is configured), page_size can continue to grow past page_size_max,
+ * and we keep accumulating raw data until the raw compression callback
+ * accepts it.
+ */
+ uint32_t page_size; /* Current page size */
+ uint32_t page_size_max; /* Maximum on-disk page size */
+
+ /*
+ * Second, the split size: if we're doing the page layout, split to a
+ * smaller-than-maximum page size when a split is required so we don't
+ * repeatedly split a packed page.
+ */
+ uint32_t split_size; /* Split page size */
+
+ /*
+ * The problem with splits is that we've done a lot of work by the time we
+ * realize we're going to have to split, and we don't want to start over.
+ *
+ * To keep from having to start over when we hit the maximum page size,
+ * we track the page information when we approach a split boundary.
+ * If we eventually have to split, we walk this structure and pretend
+ * we were splitting all along. After that, we continue to append to
+ * this structure, and eventually walk it to create a new internal page
+ * that references all of our split pages.
+ */
+ struct __rec_boundary {
+ /*
+ * The start field records location in the initial split buffer,
+ * that is, the first byte of the split chunk recorded before we
+ * decide to split a page; the offset between the first byte of
+ * chunk[0] and the first byte of chunk[1] is chunk[0]'s length.
+ *
+ * Once we split a page, we stop filling in the start field, as
+ * we're writing the split chunks as we find them.
+ */
+ uint8_t *start; /* Split's first byte */
+
+ /*
+ * The recno and entries fields are the starting record number
+ * of the split chunk (for column-store splits), and the number
+ * of entries in the split chunk. These fields are used both
+ * to write the split chunk, and to create a new internal page
+ * to reference the split pages.
+ */
+ uint64_t recno; /* Split's starting record */
+ uint32_t entries; /* Split's entries */
+
+ WT_ADDR addr; /* Split's written location */
+ uint32_t size; /* Split's size */
+ uint32_t cksum; /* Split's checksum */
+ void *dsk; /* Split's disk image */
+
+ /*
+ * When busy pages get large, we need to be able to evict them
+ * even when they contain unresolved updates, or updates which
+ * cannot be evicted because of running transactions. In such
+ * cases, break the page into multiple blocks, write the blocks
+ * that can be evicted, saving lists of updates for blocks that
+ * cannot be evicted, then re-instantiate the blocks that cannot
+ * be evicted as new, in-memory pages, restoring the updates on
+ * those pages.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * The key for a row-store page; no column-store key is needed
+ * because the page's recno, stored in the recno field, is the
+ * column-store key.
+ */
+ WT_ITEM key; /* Promoted row-store key */
+
+ /*
+ * During wrapup, after reconciling the root page, we write a
+ * final block as part of a checkpoint. If raw compression
+ * was configured, that block may have already been compressed.
+ */
+ int already_compressed;
+ } *bnd; /* Saved boundaries */
+ uint32_t bnd_next; /* Next boundary slot */
+ uint32_t bnd_next_max; /* Maximum boundary slots used */
+ size_t bnd_entries; /* Total boundary slots */
+ size_t bnd_allocated; /* Bytes allocated */
+
+ /*
+ * We track the total number of page entries copied into split chunks
+ * so we can easily figure out how many entries in the current split
+ * chunk.
+ */
+ uint32_t total_entries; /* Total entries in splits */
+
+ /*
+ * And there's state information as to where in this process we are:
+ * (1) tracking split boundaries because we can still fit more split
+ * chunks into the maximum page size, (2) tracking the maximum page
+ * size boundary because we can't fit any more split chunks into the
+ * maximum page size, (3) not performing boundary checks because it's
+ * either not useful with the current page size configuration, or
+ * because we've already been forced to split.
+ */
+ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
+ SPLIT_MAX=1, /* Next: the maximum page boundary */
+ SPLIT_TRACKING_OFF=2, /* No boundary checks */
+ SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
+ bnd_state;
+
+ /*
+ * We track current information about the current record number, the
+ * number of entries copied into the temporary buffer, where we are
+ * in the temporary buffer, and how much memory remains. Those items
+ * are packaged here rather than passing pointers to stack locations
+ * around the code.
+ */
+ uint64_t recno; /* Current record number */
+ uint32_t entries; /* Current number of entries */
+ uint8_t *first_free; /* Current first free byte */
+ size_t space_avail; /* Remaining space in this chunk */
+
+ /*
+ * While reviewing updates for each page, we store skipped updates here,
+ * and then move them to per-block areas as the blocks are defined.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * We don't need to keep the 0th key around on internal pages, the
+ * search code ignores them as nothing can sort less by definition.
+ * There's some trickiness here, see the code for comments on how
+ * these fields work.
+ */
+ int cell_zero; /* Row-store internal page 0th key */
+
+ /*
+ * WT_DICTIONARY --
+ * We optionally build a dictionary of row-store values for leaf
+ * pages. Where two value cells are identical, only write the value
+ * once, the second and subsequent copies point to the original cell.
+ * The dictionary is fixed size, but organized in a skip-list to make
+ * searches faster.
+ */
+ struct __rec_dictionary {
+ uint64_t hash; /* Hash value */
+ void *cell; /* Matching cell */
+
+ u_int depth; /* Skiplist */
+ WT_DICTIONARY *next[0];
+ } **dictionary; /* Dictionary */
+ u_int dictionary_next, dictionary_slots; /* Next, max entries */
+ /* Skiplist head. */
+ WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
+
+ /*
+ * WT_KV --
+ * An on-page key/value item we're building.
+ */
+ struct __rec_kv {
+ WT_ITEM buf; /* Data */
+ WT_CELL cell; /* Cell and cell's length */
+ size_t cell_len;
+ size_t len; /* Total length of cell + data */
+ } k, v; /* Key/Value being built */
+
+ WT_ITEM *cur, _cur; /* Key/Value being built */
+ WT_ITEM *last, _last; /* Last key/value built */
+
+ int key_pfx_compress; /* If can prefix-compress next key */
+ int key_pfx_compress_conf; /* If prefix compression configured */
+ int key_sfx_compress; /* If can suffix-compress next key */
+ int key_sfx_compress_conf; /* If suffix compression configured */
+
+ int is_bulk_load; /* If it's a bulk load */
+
+ WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
+
+ int tested_ref_state; /* Debugging information */
+} WT_RECONCILE;
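
The dictionary described in the comments above comes down to: hash each candidate value, look it up in a small fixed-size table, and if an identical value was already written, point the new cell at the earlier one instead of writing another copy. A toy version of the lookup, using hypothetical names and plain chaining rather than the skip list the structure actually keeps, might be:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define DICT_SLOTS	100

struct dict_entry {
	uint64_t hash;			/* Hash of the value */
	const void *data;		/* The value itself */
	size_t size;
	const void *cell;		/* Where the value was first written */
	struct dict_entry *next;	/* Chain within a slot */
};

struct dictionary {
	struct dict_entry *slot[DICT_SLOTS];
};

/* Return the cell of a previously written identical value, or NULL. */
static const void *
dict_lookup(struct dictionary *d, uint64_t hash, const void *data, size_t size)
{
	struct dict_entry *e;

	for (e = d->slot[hash % DICT_SLOTS]; e != NULL; e = e->next)
		if (e->hash == hash && e->size == size &&
		    memcmp(e->data, data, size) == 0)
			return (e->cell);
	return (NULL);
}
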
+
+static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
+static void __rec_cell_build_addr(
+ WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
+static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
+static int __rec_cell_build_val(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, uint64_t);
+static int __rec_child_deleted(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *);
+static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_var(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
+ WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t);
+static int __rec_destroy_session(WT_SESSION_IMPL *);
+static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
+static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_row_leaf(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_row_leaf_insert(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
+static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
+static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_row_promote(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
+static int __rec_split_write(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_write_init(WT_SESSION_IMPL *,
+ WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_write_wrapup_err(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+
+static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
+static int __rec_dictionary_lookup(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
+static void __rec_dictionary_reset(WT_RECONCILE *);
+
+/*
+ * __wt_reconcile --
+ * Reconcile an in-memory page into its on-disk format, and write it.
+ */
+int
+__wt_reconcile(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_RECONCILE *r;
+ int locked;
+
+ conn = S2C(session);
+ page = ref->page;
+ mod = page->modify;
+
+ /* We shouldn't be called with a clean page; that's an error. */
+ if (!__wt_page_is_modified(page))
+ WT_RET_MSG(session, WT_ERROR,
+ "Attempt to reconcile a clean page.");
+
+ WT_RET(__wt_verbose(session,
+ WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+
+ /* Record the most recent transaction ID we will *not* write. */
+ mod->disk_snap_min = session->txn.snap_min;
+
+ /* Initialize the reconciliation structure for each new run. */
+ WT_RET(__rec_write_init(
+ session, ref, flags, salvage, &session->reconcile));
+ r = session->reconcile;
+
+ /*
+ * The compaction process looks at the page's modification information;
+ * if compaction is running, lock the page down.
+ *
+ * Otherwise, flip on the scanning flag: obsolete updates cannot be
+ * freed while reconciliation is in progress.
+ */
+ locked = 0;
+ if (conn->compact_in_memory_pass) {
+ locked = 1;
+ WT_PAGE_LOCK(session, page);
+ } else
+ for (;;) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ if (ret == 0)
+ break;
+ __wt_yield();
+ }
+
+ /* Reconcile the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (salvage != NULL)
+ ret = __rec_col_fix_slvg(session, r, page, salvage);
+ else
+ ret = __rec_col_fix(session, r, page);
+ break;
+ case WT_PAGE_COL_INT:
+ ret = __rec_col_int(session, r, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __rec_col_var(session, r, page, salvage);
+ break;
+ case WT_PAGE_ROW_INT:
+ ret = __rec_row_int(session, r, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __rec_row_leaf(session, r, page, salvage);
+ break;
+ WT_ILLEGAL_VALUE_SET(session);
+ }
+
+ /* Wrap up the page reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_wrapup(session, r, page);
+ else
+ WT_TRET(__rec_write_wrapup_err(session, r, page));
+
+ /* Release the page lock if we're holding one. */
+ if (locked)
+ WT_PAGE_UNLOCK(session, page);
+ else
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+
+ /*
+ * Clean up the boundary structures: some workloads result in millions
+ * of these structures, and if associated with some random session that
+ * got roped into doing forced eviction, they won't be discarded for the
+ * life of the session.
+ */
+ __rec_bnd_cleanup(session, r, 0);
+
+ WT_RET(ret);
+
+ /*
+ * Root pages are special: splits have to be done now, we can't put them
+ * off as the parent's problem any more.
+ */
+ if (__wt_ref_is_root(ref))
+ return (__rec_root_write(session, page, flags));
+
+ /*
+ * Otherwise, mark the page's parent dirty.
+ * Don't mark the tree dirty: if this reconciliation is in service of a
+ * checkpoint, it's cleared the tree's dirty flag, and we don't want to
+ * set it again as part of that walk.
+ */
+ return (__wt_page_parent_modify_set(session, ref, 1));
+}
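
The busy-wait in the function above (atomically try to set the scanning flag, yield and retry until it succeeds) can be sketched with C11 atomics; F_CAS_ATOMIC and __wt_yield are WiredTiger internals, so the names below are hypothetical:

#include <stdatomic.h>
#include <sched.h>

/* Spin until this thread atomically sets the flag bit. */
static void
flag_acquire(atomic_uint *flags, unsigned int bit)
{
	unsigned int old;

	for (;;) {
		old = atomic_load(flags);
		if ((old & bit) == 0 &&
		    atomic_compare_exchange_weak(flags, &old, old | bit))
			return;
		sched_yield();		/* Let the current holder finish */
	}
}

/* Clear the flag bit. */
static void
flag_release(atomic_uint *flags, unsigned int bit)
{
	atomic_fetch_and(flags, ~bit);
}
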
+
+/*
+ * __rec_root_write --
+ * Handle the write of a root page.
+ */
+static int
+__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_PAGE *next;
+ WT_PAGE_INDEX *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_REF fake_ref;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /*
+ * If a single root page was written (either an empty page or there was
+ * a 1-for-1 page swap), we've written root and checkpoint, we're done.
+ * If the root page split, write the resulting WT_REF array. We already
+ * have an infrastructure for writing pages, create a fake root page and
+ * write it instead of adding code to write blocks based on the list of
+ * blocks resulting from a multiblock reconciliation.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY: /* Page is empty */
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ return (0);
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_SPLIT,
+ "root page split -> %" PRIu32 " pages", mod->mod_multi_entries));
+
+ /*
+ * Create a new root page, initialize the array of child references,
+ * mark it dirty, then write it.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 1, &next));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 1, &next));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ pindex = WT_INTL_INDEX_COPY(next);
+ for (i = 0; i < mod->mod_multi_entries; ++i) {
+ WT_ERR(__wt_multi_to_ref(session,
+ next, &mod->mod_multi[i], &pindex->index[i], NULL));
+ pindex->index[i]->home = next;
+ }
+
+ /*
+ * We maintain a list of pages written for the root in order to free the
+ * backing blocks the next time the root is written.
+ */
+ mod->mod_root_split = next;
+
+ WT_ERR(__wt_page_modify_init(session, next));
+ __wt_page_only_modify_set(session, next);
+
+ /*
+ * Fake up a reference structure, and write the next root page.
+ */
+ __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
+ return (__wt_reconcile(session, &fake_ref, NULL, flags));
+
+err: __wt_page_out(session, &next);
+ return (ret);
+}
+
+/*
+ * __rec_raw_compression_config --
+ * Configure raw compression.
+ */
+static inline int
+__rec_raw_compression_config(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Check if raw compression configured. */
+ if (btree->compressor == NULL ||
+ btree->compressor->compress_raw == NULL)
+ return (0);
+
+ /* Only for row-store and variable-length column-store objects. */
+ if (page->type == WT_PAGE_COL_FIX)
+ return (0);
+
+ /*
+ * Raw compression cannot support dictionary compression. (Technically,
+ * we could still use the raw callback on column-store variable length
+ * internal pages with dictionary compression configured, because
+ * dictionary compression only applies to column-store leaf pages, but
+ * that seems an unlikely use case.)
+ */
+ if (btree->dictionary != 0)
+ return (0);
+
+ /* Raw compression cannot support prefix compression. */
+ if (btree->prefix_compression != 0)
+ return (0);
+
+ /*
+ * Raw compression is also turned off during salvage: we can't allow
+ * pages to split during salvage, raw compression has no point if it
+ * can't manipulate the page size.
+ */
+ if (salvage != NULL)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * __rec_write_init --
+ * Initialize the reconciliation structure.
+ */
+static int
+__rec_write_init(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_RECONCILE *r;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+ WT_RET(__wt_calloc_def(session, 1, &r));
+
+ *(WT_RECONCILE **)reconcilep = r;
+ session->reconcile_cleanup = __rec_destroy_session;
+
+ /* Connect pointers/buffers. */
+ r->cur = &r->_cur;
+ r->last = &r->_last;
+
+ /* Disk buffers need to be aligned for writing. */
+ F_SET(&r->dsk, WT_ITEM_ALIGNED);
+ }
+
+ /* Remember the configuration. */
+ r->ref = ref;
+ r->page = page;
+ r->flags = flags;
+
+ /* Track if the page can be marked clean. */
+ r->leave_dirty = 0;
+
+ /* Raw compression. */
+ r->raw_compression =
+ __rec_raw_compression_config(session, page, salvage);
+ r->raw_destination.flags = WT_ITEM_ALIGNED;
+
+ /* Track overflow items. */
+ r->ovfl_items = 0;
+
+ /* Track empty values. */
+ r->all_empty_value = 1;
+ r->any_empty_value = 0;
+
+ /* The list of cached, skipped updates. */
+ r->skip_next = 0;
+
+ /*
+ * Dictionary compression only writes repeated values once. We grow
+ * the dictionary as necessary, always using the largest size we've
+ * seen.
+ *
+ * Reset the dictionary.
+ *
+ * Sanity check the size: 100 slots is the smallest dictionary we use.
+ */
+ if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots)
+ WT_RET(__rec_dictionary_init(session,
+ r, btree->dictionary < 100 ? 100 : btree->dictionary));
+ __rec_dictionary_reset(r);
+
+ /*
+ * Suffix compression shortens internal page keys by discarding trailing
+ * bytes that aren't necessary for tree navigation. We don't do suffix
+ * compression if there is a custom collator because we don't know what
+ * bytes a custom collator might use. Some custom collators (for
+ * example, a collator implementing reverse ordering of strings), won't
+ * have any problem with suffix compression: if there's ever a reason to
+ * implement suffix compression for custom collators, we can add a
+ * setting to the collator, configured when the collator is added, that
+ * turns on suffix compression.
+ *
+ * The raw compression routines don't even consider suffix compression,
+ * but it doesn't hurt to confirm that.
+ */
+ r->key_sfx_compress_conf = 0;
+ if (btree->collator == NULL &&
+ btree->internal_key_truncate && !r->raw_compression)
+ r->key_sfx_compress_conf = 1;
+
+ /*
+ * Prefix compression discards repeated prefix bytes from row-store leaf
+ * page keys.
+ */
+ r->key_pfx_compress_conf = 0;
+ if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF)
+ r->key_pfx_compress_conf = 1;
+
+ r->salvage = salvage;
+
+ /* Save the page's write generation before reading the page. */
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Running transactions may update the page after we write it, so
+ * this is the highest ID we can be confident we will see.
+ */
+ r->skipped_txn = S2C(session)->txn_global.last_running;
+
+ return (0);
+}
+
+/*
+ * __rec_destroy --
+ * Clean up the reconciliation structure.
+ */
+static void
+__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
+{
+ WT_RECONCILE *r;
+
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL)
+ return;
+ *(WT_RECONCILE **)reconcilep = NULL;
+
+ __wt_buf_free(session, &r->dsk);
+
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ __wt_buf_free(session, &r->raw_destination);
+
+ __rec_bnd_cleanup(session, r, 1);
+
+ __wt_free(session, r->skip);
+
+ __wt_buf_free(session, &r->k.buf);
+ __wt_buf_free(session, &r->v.buf);
+ __wt_buf_free(session, &r->_cur);
+ __wt_buf_free(session, &r->_last);
+
+ __rec_dictionary_free(session, r);
+
+ __wt_free(session, r);
+}
+
+/*
+ * __rec_destroy_session --
+ * Clean up the reconciliation structure, session version.
+ */
+static int
+__rec_destroy_session(WT_SESSION_IMPL *session)
+{
+ __rec_destroy(session, &session->reconcile);
+ return (0);
+}
+
+/*
+ * __rec_bnd_cleanup --
+ * Cleanup the boundary structure information.
+ */
+static void
+__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
+{
+ WT_BOUNDARY *bnd;
+ uint32_t i, last_used;
+
+ if (r->bnd == NULL)
+ return;
+
+ /*
+ * Free the boundary structures' memory. In the case of normal cleanup,
+ * discard any memory we won't reuse in the next reconciliation; in the
+ * case of destruction, discard everything.
+ *
+ * During some big-page evictions we have seen boundary arrays that have
+ * millions of elements. That should not be a normal event, but if the
+ * memory is associated with a random session, it won't be discarded
+ * until the session is closed. If there are more than 10,000 boundary
+ * structure elements, destroy the boundary array and we'll start over.
+ */
+ if (destroy || r->bnd_entries > 10 * 1000) {
+ for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
+ __wt_free(session, bnd->addr.addr);
+ __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->skip);
+ __wt_buf_free(session, &bnd->key);
+ }
+ __wt_free(session, r->bnd);
+ r->bnd_next = 0;
+ r->bnd_entries = r->bnd_allocated = 0;
+ } else {
+ /*
+ * The boundary-next field points to the next boundary structure
+ * we were going to use, but there's no requirement that value
+ * be incremented before reconciliation updates the structure it
+ * points to, that is, there's no guarantee elements of the next
+ * boundary structure are still unchanged. Be defensive, clean
+ * up the "next" structure as well as the ones we know we used.
+ */
+ last_used = r->bnd_next;
+ if (last_used < r->bnd_entries)
+ ++last_used;
+ for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
+ __wt_free(session, bnd->addr.addr);
+ __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->skip);
+ }
+ }
+}
+
+/*
+ * __rec_skip_update_save --
+ * Save a skipped WT_UPDATE list for later restoration.
+ */
+static int
+__rec_skip_update_save(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+{
+ WT_RET(__wt_realloc_def(
+ session, &r->skip_allocated, r->skip_next + 1, &r->skip));
+ r->skip[r->skip_next].ins = ins;
+ r->skip[r->skip_next].rip = rip;
+ ++r->skip_next;
+ return (0);
+}
+
+/*
+ * __rec_skip_update_move --
+ * Move a skipped WT_UPDATE list from the per-page cache to a specific
+ * block's list.
+ */
+static int
+__rec_skip_update_move(
+ WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+{
+ WT_RET(__wt_realloc_def(
+ session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
+ bnd->skip[bnd->skip_next] = *skip;
+ ++bnd->skip_next;
+
+ skip->ins = NULL;
+ skip->rip = NULL;
+ return (0);
+}
+
+/*
+ * __rec_txn_read --
+ * Return the first visible update in a list (or NULL if none are visible),
+ * set a flag if any updates were skipped, track the maximum transaction ID on
+ * the page.
+ */
+static inline int
+__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
+{
+ WT_ITEM ovfl;
+ WT_PAGE *page;
+ WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ size_t notused;
+ uint64_t max_txn, min_txn, txnid;
+ int skipped;
+
+ *updp = NULL;
+
+ page = r->page;
+
+ /*
+ * If we're called with a WT_INSERT reference, use its WT_UPDATE
+ * list; otherwise, use the on-page row-store WT_UPDATE list.
+ */
+ upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd;
+ skipped = 0;
+
+ for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
+ upd != NULL; upd = upd->next) {
+ if ((txnid = upd->txnid) == WT_TXN_ABORTED)
+ continue;
+
+ /* Track the largest/smallest transaction IDs on the list. */
+ if (TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
+ if (TXNID_LT(txnid, min_txn))
+ min_txn = txnid;
+ if (TXNID_LT(txnid, r->skipped_txn) &&
+ !__wt_txn_visible_all(session, txnid))
+ r->skipped_txn = txnid;
+
+ /*
+ * Record whether any updates were skipped on the way to finding
+ * the first visible update.
+ *
+ * If updates were skipped before the one being written, future
+ * reads without intervening modifications to the page could
+ * see a different value; if no updates were skipped, the page
+ * can safely be marked clean and does not need to be
+ * reconciled until modified again.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
+ }
+
+ /*
+ * Track the maximum transaction ID in the page. We store this in the
+ * page at the end of reconciliation if no updates are skipped, it's
+ * used to avoid evicting clean pages from memory with changes required
+ * to satisfy a snapshot read.
+ */
+ if (TXNID_LT(r->max_txn, max_txn))
+ r->max_txn = max_txn;
+
+ /*
+ * If all updates are globally visible and no updates were skipped, the
+ * page can be marked clean and we're done, regardless of whether we're
+ * evicting or checkpointing.
+ *
+ * The oldest transaction ID may have moved while we were scanning the
+ * page, so it is possible to skip an update but then find that by the
+ * end of the scan, all updates are stable.
+ */
+ if (__wt_txn_visible_all(session, max_txn) && !skipped)
+ return (0);
+
+ /*
+ * If some updates are not globally visible, or were skipped, the page
+ * cannot be marked clean.
+ */
+ r->leave_dirty = 1;
+
+ /* If we're not evicting, we're done, we know what we'll write. */
+ if (!F_ISSET(r, WT_EVICTING))
+ return (0);
+
+ /* In some cases, there had better not be any updates we can't write. */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /*
+ * If evicting and we aren't able to save/restore the not-yet-visible
+ * updates, the page can't be evicted.
+ */
+ if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
+ return (EBUSY);
+
+ /*
+ * Evicting a page with not-yet-visible updates: save and restore the
+ * list of updates on a newly instantiated page.
+ *
+ * The order of the updates on the list matters so we can't move only
+ * the unresolved updates, we have to move the entire update list.
+ *
+ * Clear the returned update so our caller ignores the key/value pair
+ * in the case of an insert/append entry (everything we need is in the
+ * update list), and otherwise writes the original on-page key/value
+ * pair to which the update list applies.
+ */
+ *updp = NULL;
+
+ /*
+ * Handle the case where we don't want to write an original on-page value
+ * item to disk because it's been updated or removed.
+ *
+ * Here's the deal: an overflow value was updated or removed and its
+ * backing blocks freed. If any transaction in the system might still
+ * read the value, a copy was cached in page reconciliation tracking
+ * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
+ * then chose the page and we're splitting it up in order to push parts
+ * of it out of memory.
+ *
+ * We could write the original on-page value item to disk... if we had
+ * a copy. The cache may not have a copy (a globally visible update
+ * would have kept a value from ever being cached), or an update that
+	 * subsequently became globally visible could cause a cached value to
+	 * be discarded.  Either way, once there's a globally visible update,
+	 * we may not have the value.
+ *
+ * Fortunately, if there's a globally visible update we don't care about
+ * the original version, so we simply ignore it, no transaction can ever
+ * try and read it. If there isn't a globally visible update, there had
+ * better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but (1) we
+ * are planning on re-instantiating this page in memory, it isn't going
+ * to disk, and (2) the value item is eventually going to be discarded,
+ * that seems like a waste of a write. Instead, find the cached value
+ * and append it to the update list we're saving for later restoration.
+ */
+ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn)) {
+ WT_RET(__wt_ovfl_txnc_search(
+ page, vpack->data, vpack->size, &ovfl));
+ /*
+ * Create an update structure with an impossibly low transaction
+ * ID and append it to the update list we're about to save.
+ * Restoring that update list when this page is re-instantiated
+ * creates an update for the key/value pair visible to every
+ * running transaction in the system, ensuring the on-page value
+ * will be ignored.
+ */
+ WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
+ upd_ovfl->txnid = WT_TXN_NONE;
+ for (upd = upd_list; upd->next != NULL; upd = upd->next)
+ ;
+ upd->next = upd_ovfl;
+ }
+
+ return (__rec_skip_update_save(session, r, ins, rip));
+}
+
+/*
+ * CHILD_RELEASE --
+ * Macros to clean up during internal-page reconciliation, releasing the
+ * hazard pointer we're holding on child pages.
+ */
+#undef CHILD_RELEASE
+#define CHILD_RELEASE(session, hazard, ref) do { \
+ if (hazard) { \
+ hazard = 0; \
+ WT_TRET( \
+ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \
+ } \
+} while (0)
+#undef CHILD_RELEASE_ERR
+#define CHILD_RELEASE_ERR(session, hazard, ref) do { \
+ CHILD_RELEASE(session, hazard, ref); \
+ WT_ERR(ret); \
+} while (0)
+
+/*
+ * __rec_child_modify --
+ * Return if the internal page's child references any modifications.
+ */
+static int
+__rec_child_modify(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *ref, int *hazardp, int *statep)
+{
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+
+ /* We may acquire a hazard pointer our caller must release. */
+ *hazardp = 0;
+
+#define WT_CHILD_IGNORE 1 /* Deleted child: ignore */
+#define WT_CHILD_MODIFIED 2 /* Modified child */
+#define WT_CHILD_PROXY 3 /* Deleted child: proxy */
+ *statep = 0;
+
+ /*
+ * This function is called when walking an internal page to decide how
+ * to handle child pages referenced by the internal page, specifically
+ * if the child page is to be merged into its parent.
+ *
+ * Internal pages are reconciled for two reasons: first, when evicting
+ * an internal page, second by the checkpoint code when writing internal
+ * pages. During eviction, the subtree is locked down so all pages
+ * should be in the WT_REF_DISK or WT_REF_LOCKED state. During
+ * checkpoint, any eviction that might affect our review of an internal
+ * page is prohibited, however, as the subtree is not reserved for our
+ * exclusive use, there are other page states that must be considered.
+ */
+ for (;; __wt_yield())
+ switch (r->tested_ref_state = ref->state) {
+ case WT_REF_DISK:
+ /* On disk, not modified by definition. */
+ goto done;
+
+ case WT_REF_DELETED:
+ /*
+ * The child is in a deleted state.
+ *
+ * It's possible the state could change underneath us as
+ * the page is read in, and we can race between checking
+ * for a deleted state and looking at the transaction ID
+ * to see if the delete is visible to us. Lock down the
+ * structure.
+ */
+ if (!WT_ATOMIC_CAS4(
+ ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ break;
+ ret = __rec_child_deleted(session, r, ref, statep);
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ goto done;
+
+ case WT_REF_LOCKED:
+ /*
+ * Locked.
+ *
+ * If evicting, the evicted page's subtree, including
+ * this child, was selected for eviction by us and the
+ * state is stable until we reset it, it's an in-memory
+ * state. This is the expected state for a child being
+ * merged into a page (where the page was selected by
+ * the eviction server for eviction).
+ */
+ if (F_ISSET(r, WT_EVICTING))
+ goto in_memory;
+
+ /*
+ * If called during checkpoint, the child is being
+ * considered by the eviction server or the child is a
+ * fast-delete page being read. The eviction may have
+ * started before the checkpoint and so we must wait
+ * for the eviction to be resolved. I suspect we could
+ * handle fast-delete reads, but we can't distinguish
+ * between the two and fast-delete reads aren't expected
+ * to be common.
+ */
+ break;
+
+ case WT_REF_MEM:
+ /*
+ * In memory.
+ *
+ * If evicting, the evicted page's subtree, including
+ * this child, was selected for eviction by us and the
+ * state is stable until we reset it, it's an in-memory
+ * state. This is the expected state for a child being
+ * merged into a page (where the page belongs to a file
+ * being discarded from the cache during close).
+ */
+ if (F_ISSET(r, WT_EVICTING))
+ goto in_memory;
+
+ /*
+ * If called during checkpoint, acquire a hazard pointer
+ * so the child isn't evicted, it's an in-memory case.
+ *
+ * This call cannot return split/restart, dirty page
+ * eviction is shutout during checkpoint, all splits in
+ * process will have completed before we walk any pages
+ * for checkpoint.
+ */
+ if ((ret = __wt_page_in(session, ref,
+ WT_READ_CACHE | WT_READ_NO_EVICT |
+ WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ *hazardp = 1;
+ goto in_memory;
+
+ case WT_REF_READING:
+ /*
+ * Being read, not modified by definition.
+ *
+ * We should never be here during eviction, a child page
+ * in this state within an evicted page's subtree would
+			 * normally have caused eviction to fail, and exclusive
+ * eviction shouldn't ever see pages being read.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ goto done;
+
+ case WT_REF_SPLIT:
+ /*
+ * The page was split out from under us.
+ *
+ * We should never be here during eviction, a child page
+ * in this state within an evicted page's subtree would
+ * have caused eviction to fail.
+ *
+ * We should never be here during checkpoint, dirty page
+ * eviction is shutout during checkpoint, all splits in
+ * process will have completed before we walk any pages
+ * for checkpoint.
+ */
+ WT_ASSERT(session, ref->state != WT_REF_SPLIT);
+ /* FALLTHROUGH */
+
+ WT_ILLEGAL_VALUE(session);
+ }
+
+in_memory:
+ /*
+ * In-memory states: the child is potentially modified if the page's
+ * modify structure has been instantiated. If the modify structure
+ * exists and the page has actually been modified, set that state.
+ * If that's not the case, we would normally use the original cell's
+ * disk address as our reference, but, if we're forced to instantiate
+ * a deleted child page and it's never modified, we end up here with
+ * a page that has a modify structure, no modifications, and no disk
+ * address. Ignore those pages, they're not modified and there is no
+ * reason to write the cell.
+ */
+ mod = ref->page->modify;
+ if (mod != NULL && mod->flags != 0)
+ *statep = WT_CHILD_MODIFIED;
+ else if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ CHILD_RELEASE(session, *hazardp, ref);
+ }
+
+done: WT_HAVE_DIAGNOSTIC_YIELD;
+ return (ret);
+}
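+
+/*
+ * Illustrative sketch (added for clarity, not part of this change, and kept
+ * under #if 0): how CHILD_RELEASE and CHILD_RELEASE_ERR are intended to pair
+ * up around __rec_child_modify.  The function and variable names here are
+ * hypothetical; the real callers appear later in this file.
+ */
+#if 0
+static int
+__rec_child_usage_sketch(
+    WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref)
+{
+	WT_DECL_RET;
+	int hazard, state;
+
+	hazard = 0;
+	WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+
+	/* ... write a cell for the child based on "state" ... */
+
+	/*
+	 * Normal path: drop any hazard pointer we acquired, jumping to the
+	 * error label if the release itself fails.
+	 */
+	CHILD_RELEASE_ERR(session, hazard, ref);
+	return (ret);
+
+err:	/* Error path: the hazard pointer must be released here as well. */
+	CHILD_RELEASE(session, hazard, ref);
+	return (ret);
+}
+#endif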
+
+/*
+ * __rec_child_deleted --
+ * Handle pages with leaf pages in the WT_REF_DELETED state.
+ */
+static int
+__rec_child_deleted(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
+{
+ WT_BM *bm;
+ WT_PAGE_DELETED *page_del;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ bm = S2BT(session)->bm;
+ page_del = ref->page_del;
+
+ /*
+ * Internal pages with child leaf pages in the WT_REF_DELETED state are
+ * a special case during reconciliation. First, if the deletion was a
+ * result of a session truncate call, the deletion may not be visible to
+ * us. In that case, we proceed as with any change that's not visible
+ * during reconciliation by setting the skipped flag and ignoring the
+ * change for the purposes of writing the internal page.
+ *
+ * In this case, there must be an associated page-deleted structure, and
+ * it holds the transaction ID we care about.
+ */
+ if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
+ /*
+ * In some cases, there had better not be any updates we can't
+ * write.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /* If this page cannot be evicted, quit now. */
+ if (F_ISSET(r, WT_EVICTING))
+ return (EBUSY);
+ }
+
+ /*
+ * The deletion is visible to us, deal with any underlying disk blocks.
+ *
+ * First, check to see if there is an address associated with this leaf:
+ * if there isn't, we're done, the underlying page is already gone. If
+ * the page still exists, check for any transactions in the system that
+ * might want to see the page's state before it's deleted.
+ *
+ * If any such transactions exist, we cannot discard the underlying leaf
+ * page to the block manager because the transaction may eventually read
+ * it. However, this write might be part of a checkpoint, and should we
+ * recover to that checkpoint, we'll need to delete the leaf page, else
+ * we'd leak it. The solution is to write a proxy cell on the internal
+ * page ensuring the leaf page is eventually discarded.
+ *
+ * If no such transactions exist, we can discard the leaf page to the
+ * block manager and no cell needs to be written at all. We do this
+ * outside of the underlying tracking routines because this action is
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if there's ever a
+ * read into this part of the name space again, the cache read function
+ * instantiates an entirely new page.)
+ */
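+	/*
+	 * Example (added for illustration, the transaction numbers are
+	 * hypothetical): a truncate committed at transaction 100 while a
+	 * reader with an older snapshot is still running isn't yet globally
+	 * visible, so the leaf's blocks can't be freed; a proxy cell is
+	 * written instead, so a recovery to this checkpoint still discards
+	 * the leaf page.
+	 */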
+ if (ref->addr != NULL &&
+ (page_del == NULL ||
+ __wt_txn_visible_all(session, page_del->txnid))) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
+
+ /*
+ * Minor memory cleanup: if a truncate call deleted this page and we
+ * were ever forced to instantiate the page in memory, we would have
+ * built a list of updates in the page reference in order to be able
+ * to abort the truncate. It's a cheap test to make that memory go
+	 * away; we do it here because there's really nowhere else we do the
+ * checks. In short, if we have such a list, and the backing address
+ * blocks are gone, there can't be any transaction that can abort.
+ */
+ if (ref->addr == NULL && page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ /*
+ * If there's still a disk address, then we have to write a proxy
+ * record, otherwise, we can safely ignore this child page.
+ */
+ *statep = ref->addr == NULL ? WT_CHILD_IGNORE : WT_CHILD_PROXY;
+ return (0);
+}
+
+/*
+ * __rec_incr --
+ * Update the memory tracking structure for a set of new entries.
+ */
+static inline void
+__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
+{
+ /*
+ * The buffer code is fragile and prone to off-by-one errors -- check
+ * for overflow in diagnostic mode.
+ */
+ WT_ASSERT(session, r->space_avail >= size);
+ WT_ASSERT(session,
+ WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size));
+
+ r->entries += v;
+ r->space_avail -= size;
+ r->first_free += size;
+}
+
+/*
+ * __rec_copy_incr --
+ * Copy a key/value cell and buffer pair into the new image.
+ */
+static inline void
+__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
+{
+ size_t len;
+ uint8_t *p, *t;
+
+ /*
+ * If there's only one chunk of data to copy (because the cell and data
+ * are being copied from the original disk page), the cell length won't
+ * be set, the WT_ITEM data/length will reference the data to be copied.
+ *
+ * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
+ * the copy in-line.
+ */
+ for (p = (uint8_t *)r->first_free,
+ t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
+ *p++ = *t++;
+
+ /* The data can be quite large -- call memcpy. */
+ if (kv->buf.size != 0)
+ memcpy(p, kv->buf.data, kv->buf.size);
+
+ WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
+ __rec_incr(session, r, 1, kv->len);
+}
+
+/*
+ * __rec_dict_replace --
+ * Check for a dictionary match.
+ */
+static int
+__rec_dict_replace(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val)
+{
+ WT_DICTIONARY *dp;
+ uint64_t offset;
+
+ /*
+ * We optionally create a dictionary of values and only write a unique
+ * value once per page, using a special "copy" cell for all subsequent
+ * copies of the value. We have to do the cell build and resolution at
+ * this low level because we need physical cell offsets for the page.
+ *
+ * Sanity check: short-data cells can be smaller than dictionary-copy
+ * cells. If the data is already small, don't bother doing the work.
+ * This isn't just work avoidance: on-page cells can't grow as a result
+ * of writing a dictionary-copy cell, the reconciliation functions do a
+ * split-boundary test based on the size required by the value's cell;
+ * if we grow the cell after that test we'll potentially write off the
+ * end of the buffer's memory.
+ */
+ if (val->buf.size <= WT_INTPACK32_MAXSIZE)
+ return (0);
+ WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
+ if (dp == NULL)
+ return (0);
+
+ /*
+ * If the dictionary cell reference is not set, we're creating a new
+ * entry in the dictionary, update its location.
+ *
+ * If the dictionary cell reference is set, we have a matching value.
+ * Create a copy cell instead.
+ */
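+	/*
+	 * Worked example (added for illustration, the offsets are
+	 * hypothetical): if the first copy of a value was written at page
+	 * offset 1000 and an identical value is found once the page has grown
+	 * to offset 4000, the copy cell packs the backward distance 3000
+	 * rather than a second copy of the value.
+	 */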
+ if (dp->cell == NULL)
+ dp->cell = r->first_free;
+ else {
+ offset = WT_PTRDIFF(r->first_free, dp->cell);
+ val->len = val->cell_len =
+ __wt_cell_pack_copy(&val->cell, rle, offset);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ }
+ return (0);
+}
+
+/*
+ * __rec_key_state_update --
+ * Update prefix and suffix compression based on the last key.
+ */
+static inline void
+__rec_key_state_update(WT_RECONCILE *r, int ovfl_key)
+{
+ WT_ITEM *a;
+
+ /*
+ * If writing an overflow key onto the page, don't update the "last key"
+ * value, and leave the state of prefix compression alone. (If we are
+ * currently doing prefix compression, we have a key state which will
+ * continue to work, we're just skipping the key just created because
+ * it's an overflow key and doesn't participate in prefix compression.
+ * If we are not currently doing prefix compression, we can't start, an
+ * overflow key doesn't give us any state.)
+ *
+ * Additionally, if we wrote an overflow key onto the page, turn off the
+ * suffix compression of row-store internal node keys. (When we split,
+ * "last key" is the largest key on the previous page, and "cur key" is
+ * the first key on the next page, which is being promoted. In some
+ * cases we can discard bytes from the "cur key" that are not needed to
+ * distinguish between the "last key" and "cur key", compressing the
+ * size of keys on internal nodes. If we just built an overflow key,
+ * we're not going to update the "last key", making suffix compression
+ * impossible for the next key. Alternatively, we could remember where
+ * the last key was on the page, detect it's an overflow key, read it
+ * from disk and do suffix compression, but that's too much work for an
+ * unlikely event.)
+ *
+ * If we're not writing an overflow key on the page, update the last-key
+ * value and turn on both prefix and suffix compression.
+ */
+ if (ovfl_key)
+ r->key_sfx_compress = 0;
+ else {
+ a = r->cur;
+ r->cur = r->last;
+ r->last = a;
+
+ r->key_pfx_compress = r->key_pfx_compress_conf;
+ r->key_sfx_compress = r->key_sfx_compress_conf;
+ }
+}
+
+/*
+ * Macros to convert fixed-length entries to/from bytes.
+ */
+#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
+ ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
+#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
+ ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
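+
+/*
+ * Worked example (added for illustration, and assuming WT_ALIGN(n, v) rounds
+ * n up to a multiple of v): with a 3-bit fixed-length field,
+ * WT_FIX_BYTES_TO_ENTRIES(btree, 100) evaluates to (100 * 8) / 3 = 266 and
+ * WT_FIX_ENTRIES_TO_BYTES(btree, 100) evaluates to WT_ALIGN(100 * 3, 8) = 304.
+ */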
+
+/*
+ * __rec_leaf_page_max --
+ * Figure out the maximum leaf page size for the reconciliation.
+ */
+static inline uint32_t
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ uint32_t page_size;
+
+ btree = S2BT(session);
+ page = r->page;
+
+ page_size = 0;
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Fixed-length objects are a problem, if
+ * there's a big missing range, we could theoretically have to
+ * write large numbers of missing objects.
+ */
+ page_size = (uint32_t)WT_ALIGN(WT_FIX_ENTRIES_TO_BYTES(btree,
+ r->salvage->take + r->salvage->missing), btree->allocsize);
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Variable-length objects aren't usually a
+ * problem because we can write any number of deleted records
+		 * in a single page entry because of the RLE; we just need to
+		 * ensure that the additional entry fits.
+ */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ default:
+ /*
+ * Row-store pages can't grow, salvage never does anything
+ * other than reduce the size of a page read from disk.
+ */
+ break;
+ }
+
+ /*
+ * Default size for variable-length column-store and row-store pages
+ * during salvage is the maximum leaf page size.
+ */
+ if (page_size < btree->maxleafpage)
+ page_size = btree->maxleafpage;
+
+ /*
+ * The page we read from the disk should be smaller than the page size
+ * we just calculated, check out of paranoia.
+ */
+ if (page_size < page->dsk->mem_size)
+ page_size = page->dsk->mem_size;
+
+ /*
+ * Salvage is the backup plan: don't let this fail.
+ */
+ return (page_size * 2);
+}
+
+/*
+ * __rec_split_bnd_init --
+ * Initialize a single boundary structure.
+ */
+static void
+__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
+{
+ bnd->start = NULL;
+
+ bnd->recno = 0;
+ bnd->entries = 0;
+
+ __wt_free(session, bnd->addr.addr);
+ WT_CLEAR(bnd->addr);
+ bnd->size = 0;
+ bnd->cksum = 0;
+ __wt_free(session, bnd->dsk);
+
+ __wt_free(session, bnd->skip);
+ bnd->skip_next = 0;
+ bnd->skip_allocated = 0;
+
+ /* Ignore the key, we re-use that memory in each new reconciliation. */
+
+ bnd->already_compressed = 0;
+}
+
+/*
+ * __rec_split_bnd_grow --
+ * Grow the boundary array as necessary.
+ */
+static int
+__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /*
+ * Make sure there's enough room for another boundary. The calculation
+ * is +2, because when filling in the current boundary's information,
+ * we save the start point of the next boundary (for example, a record
+ * number or key), in the (current + 1) slot.
+ *
+ * For the same reason, we're always initializing one ahead.
+ */
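+	/*
+	 * Concretely (added for illustration): if r->bnd_next is 2, filling
+	 * in boundary 2 records the start of boundary 3 in r->bnd[3], so the
+	 * array needs bnd_next + 2 = 4 slots (0 through 3), with slot 3
+	 * initialized ahead of time.
+	 */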
+ WT_RET(__wt_realloc_def(
+ session, &r->bnd_allocated, r->bnd_next + 2, &r->bnd));
+ r->bnd_entries = r->bnd_allocated / sizeof(r->bnd[0]);
+
+ __rec_split_bnd_init(session, &r->bnd[r->bnd_next + 1]);
+
+ return (0);
+}
+
+/*
+ * __rec_split_init --
+ * Initialization for the reconciliation split functions.
+ */
+static int
+__rec_split_init(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t corrected_page_size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * The maximum leaf page size governs when an in-memory leaf page splits
+ * into multiple on-disk pages; however, salvage can't be allowed to
+ * split, there's no parent page yet. If we're doing salvage, override
+ * the caller's selection of a maximum page size, choosing a page size
+ * that ensures we won't split.
+ */
+ if (r->salvage != NULL)
+ max = __rec_leaf_page_max(session, r);
+
+ /*
+ * Set the page sizes. If we're doing the page layout, the maximum page
+ * size is the same as the page size. If the application is doing page
+ * layout (raw compression is configured), we accumulate some amount of
+ * additional data because we don't know how well it will compress, and
+ * we don't want to increment our way up to the amount of data needed by
+ * the application to successfully compress to the target page size.
+ */
+ r->page_size = r->page_size_max = max;
+ if (r->raw_compression)
+ r->page_size *= 10;
+
+ /*
+ * Ensure the disk image buffer is large enough for the max object, as
+ * corrected by the underlying block manager.
+ */
+ corrected_page_size = r->page_size;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size));
+
+ /*
+ * Clear the disk page's header and block-manager space, set the page
+ * type (the type doesn't change, and setting it later would require
+ * additional code in a few different places).
+ */
+ dsk = r->dsk.mem;
+ memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
+ dsk->type = page->type;
+
+ /*
+ * If we have to split, we want to choose a smaller page size for the
+ * split pages, because otherwise we could end up splitting one large
+ * packed page over and over. We don't want to pick the minimum size
+ * either, because that penalizes an application that did a bulk load
+ * and subsequently inserted a few items into packed pages. Currently
+ * defaulted to 75%, but I have no empirical evidence that's "correct".
+ *
+ * The maximum page size may be a multiple of the split page size (for
+ * example, there's a maximum page size of 128KB, but because the table
+ * is active and we don't want to split a lot, the split size is 20KB).
+ * The maximum page size may NOT be an exact multiple of the split page
+ * size.
+ *
+	 * It's a lot of work to build these pages; we don't want to start over
+ * when we reach the maximum page size (it's painful to restart after
+ * creating overflow items and compacted data, for example, as those
+ * items have already been written to disk). So, the loop calls the
+ * helper functions when approaching a split boundary, and we save the
+ * information at that point. That allows us to go back and split the
+ * page at the boundary points if we eventually overflow the maximum
+ * page size.
+ *
+ * Finally, all this doesn't matter for fixed-size column-store pages,
+ * raw compression, and salvage. Fixed-size column store pages can
+ * split under (very) rare circumstances, but they're allocated at a
+ * fixed page size, never anything smaller. In raw compression, the
+ * underlying compression routine decides when we split, so it's not
+ * our problem. In salvage, as noted above, we can't split at all.
+ */
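+	/*
+	 * Illustrative numbers (added for clarity, the sizes are
+	 * hypothetical): with a 128KB maximum page size and a 75% split
+	 * percentage, boundary information is saved at roughly 96KB
+	 * intervals, so a page that overflows the 128KB maximum can be
+	 * rewritten as ~96KB chunks instead of restarting from scratch.
+	 */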
+ if (r->raw_compression || r->salvage != NULL) {
+ r->split_size = 0;
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
+ else if (page->type == WT_PAGE_COL_FIX) {
+ r->split_size = r->page_size_max;
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else {
+ r->split_size = __wt_split_page_size(btree, r->page_size_max);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /* Initialize the first boundary. */
+ r->bnd_next = 0;
+ WT_RET(__rec_split_bnd_grow(session, r));
+ __rec_split_bnd_init(session, &r->bnd[0]);
+ r->bnd[0].recno = recno;
+ r->bnd[0].start = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /*
+ * If the maximum page size is the same as the split page size, either
+ * because of the object type or application configuration, there isn't
+ * any need to maintain split boundaries within a larger page.
+ *
+ * No configuration for salvage here, because salvage can't split.
+ */
+ if (r->raw_compression)
+ r->bnd_state = SPLIT_TRACKING_RAW;
+ else if (max == r->split_size)
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ else
+ r->bnd_state = SPLIT_BOUNDARY;
+
+ /* Initialize the entry counters. */
+ r->entries = r->total_entries = 0;
+
+ /* Initialize the starting record number. */
+ r->recno = recno;
+
+ /* New page, compression off. */
+ r->key_pfx_compress = r->key_sfx_compress = 0;
+
+ return (0);
+}
+
+/*
+ * __rec_is_checkpoint --
+ * Return if we're writing a checkpoint.
+ */
+static int
+__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+{
+ /*
+ * Check to see if we're going to create a checkpoint.
+ *
+ * This function exists as a place to hang this comment.
+ *
+ * Any time we write the root page of the tree without splitting we are
+ * creating a checkpoint (and have to tell the underlying block manager
+ * so it creates and writes the additional information checkpoints
+ * require). However, checkpoints are completely consistent, and so we
+ * have to resolve information about the blocks we're expecting to free
+ * as part of the checkpoint, before writing the checkpoint. In short,
+ * we don't do checkpoint writes here; clear the boundary information as
+ * a reminder and create the checkpoint during wrapup.
+ */
+ if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ bnd->addr.addr = NULL;
+ bnd->addr.size = 0;
+ bnd->addr.type = 0;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote_cell --
+ * Get a key from a cell for the purposes of promotion.
+ */
+static int
+__rec_split_row_promote_cell(
+ WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_ITEM *key)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack;
+
+ btree = S2BT(session);
+ kpack = &_kpack;
+
+ /*
+	 * The cell had better have a zero-length prefix and not be a copy
+	 * cell; the first cell on a page cannot refer to an earlier cell on
+	 * the page.
+ */
+ cell = WT_PAGE_HEADER_BYTE(btree, dsk);
+ __wt_cell_unpack(cell, kpack);
+ WT_ASSERT(session,
+ kpack->prefix == 0 && kpack->raw != WT_CELL_VALUE_COPY);
+
+ WT_RET(__wt_cell_data_copy(session, dsk->type, kpack, key));
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote --
+ * Key promotion for a row-store.
+ */
+static int
+__rec_split_row_promote(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, uint8_t type)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(update);
+ WT_DECL_RET;
+ WT_ITEM *max;
+ WT_UPD_SKIPPED *skip;
+ size_t cnt, len, size;
+ uint32_t i;
+ const uint8_t *pa, *pb;
+ int cmp;
+
+ /*
+ * For a column-store, the promoted key is the recno and we already have
+ * a copy. For a row-store, it's the first key on the page, a variable-
+ * length byte string, get a copy.
+ *
+ * This function is called from the split code at each split boundary,
+ * but that means we're not called before the first boundary, and we
+ * will eventually have to get the first key explicitly when splitting
+ * a page.
+ *
+ * For the current slot, take the last key we built, after doing suffix
+ * compression. The "last key we built" describes some process: before
+ * calling the split code, we must place the last key on the page before
+ * the boundary into the "last" key structure, and the first key on the
+ * page after the boundary into the "current" key structure, we're going
+ * to compare them for suffix compression.
+ *
+ * Suffix compression is a hack to shorten keys on internal pages. We
+ * only need enough bytes in the promoted key to ensure searches go to
+ * the correct page: the promoted key has to be larger than the last key
+ * on the leaf page preceding it, but we don't need any more bytes than
+ * that. In other words, we can discard any suffix bytes not required
+ * to distinguish between the key being promoted and the last key on the
+ * leaf page preceding it. This can only be done for the first level of
+ * internal pages, you cannot repeat suffix truncation as you split up
+ * the tree, it loses too much information.
+ *
+ * Note #1: if the last key on the previous page was an overflow key,
+ * we don't have the in-memory key against which to compare, and don't
+ * try to do suffix compression. The code for that case turns suffix
+ * compression off for the next key, we don't have to deal with it here.
+ */
+ if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress)
+ return (__wt_buf_set(session, key, r->cur->data, r->cur->size));
+
+ btree = S2BT(session);
+ WT_RET(__wt_scr_alloc(session, 0, &update));
+
+ /*
+ * Note #2: if we skipped updates, an update key may be larger than the
+ * last key stored in the previous block (probable for append-centric
+ * workloads). If there are skipped updates, check for one larger than
+ * the last key and smaller than the current key.
+ */
+ max = r->last;
+ for (i = r->skip_next; i > 0; --i) {
+ skip = &r->skip[i - 1];
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, skip->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(skip->ins);
+ update->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Compare against the current key, it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
+
+ /* Compare against the last key, it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
+
+ /*
+ * The skipped updates are in key-sort order so the entry we're
+ * looking for is either the last one or the next-to-last one
+ * in the list. Once we've compared an entry against the last
+ * key on the page, we're done.
+ */
+ break;
+ }
+
+ /*
+ * The largest key on the last block must sort before the current key,
+ * so we'll either find a larger byte value in the current key, or the
+ * current key will be a longer key, and the interesting byte is one
+ * past the length of the shorter key.
+ */
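+	/*
+	 * Worked example (added for illustration, the keys are hypothetical):
+	 * if the last key on the previous block is "apple" and the current
+	 * key is "approach", the keys first differ at the fourth byte, so
+	 * only the prefix "appr" is promoted to the internal page; that's
+	 * enough to separate the two blocks.
+	 */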
+ pa = max->data;
+ pb = r->cur->data;
+ len = WT_MIN(max->size, r->cur->size);
+ size = len + 1;
+ for (cnt = 1; len > 0; ++cnt, --len, ++pa, ++pb)
+ if (*pa != *pb) {
+ if (size != cnt) {
+ WT_STAT_FAST_DATA_INCRV(session,
+ rec_suffix_compression, size - cnt);
+ size = cnt;
+ }
+ break;
+ }
+ ret = __wt_buf_set(session, key, r->cur->data, size);
+
+err: __wt_scr_free(&update);
+ return (ret);
+}
+
+/*
+ * __rec_split --
+ * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
+ * has 3 doubled letters in a row? Sweet-tooth does, too.)
+ */
+static int
+__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_BOUNDARY *last, *next;
+ WT_PAGE_HEADER *dsk;
+ uint32_t len;
+
+ /*
+	 * We should never split during salvage; if we do, we're about to drop
+	 * core because there's no parent page.
+ */
+ if (r->salvage != NULL)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(r->page->type));
+
+ /*
+ * Handle page-buffer size tracking; we have to do this work in every
+ * reconciliation loop, and I don't want to repeat the code that many
+ * times.
+ */
+ btree = S2BT(session);
+ dsk = r->dsk.mem;
+
+ /* Hitting a page boundary resets the dictionary, in all cases. */
+ __rec_dictionary_reset(r);
+
+ /*
+ * There are 3 cases we have to handle.
+ *
+ * #1
+ * About to cross a split boundary: save current boundary information
+ * and return.
+ *
+ * #2
+ * About to cross the maximum boundary: use saved boundary information
+ * to write all of the split pages.
+ *
+ * #3
+ * About to cross a split boundary, but we've either already done the
+ * split thing when we approached the maximum boundary, in which
+ * case we write the page and keep going, or we were never tracking
+ * split boundaries at all.
+ *
+ * Cases #1 and #2 are the hard ones: we're called when we're about to
+ * cross each split boundary, and we save information away so we can
+ * split if we have to. We're also called when we're about to cross
+ * the maximum page boundary: in that case, we do the actual split and
+ * clean up all the previous boundaries, then keep going.
+ */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY: /* Case #1 */
+ /*
+ * Save the information about where we are when the split would
+ * have happened.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /* Set the number of entries for the just finished chunk. */
+ last->entries = r->entries - r->total_entries;
+ r->total_entries = r->entries;
+
+ /* Set the key for the next chunk. */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /*
+ * Set the starting buffer address and clear the entries (the
+ * latter not required, but cleaner).
+ */
+ next->start = r->first_free;
+ next->entries = 0;
+
+ /*
+ * Set the space available to another split-size chunk, if we
+ * have one. If we don't have room for another split chunk,
+ * add whatever space remains in the maximum page size, and
+ * hope it's enough.
+ */
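+		/*
+		 * Illustrative numbers (added for clarity, the sizes are
+		 * hypothetical): with a 128KB maximum page and a 96KB split
+		 * size, the first boundary leaves roughly 32KB in the maximum
+		 * page, not enough for another 96KB chunk, so we move to
+		 * SPLIT_MAX and offer only the remainder.
+		 */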
+ len = WT_PTRDIFF32(r->first_free, dsk);
+ if (len + r->split_size <= r->page_size)
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ else {
+ r->bnd_state = SPLIT_MAX;
+ r->space_avail = r->page_size -
+ (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ }
+ break;
+ case SPLIT_MAX: /* Case #2 */
+ /*
+ * It didn't all fit into a single page.
+ *
+ * Cycle through the saved split-point information, writing the
+ * split chunks we have tracked.
+ */
+ WT_RET(__rec_split_fixup(session, r));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ break;
+ case SPLIT_TRACKING_OFF: /* Case #3 */
+ /*
+ * It didn't all fit, but either we've already noticed it and
+ * are now processing the rest of the page at the split-size
+ * boundaries, or the split size was the same as the page size,
+ * so we never bothered with saving split-point information.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /*
+ * Set the key for the next chunk (before writing the block, a
+ * key range is needed in that code).
+ */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /* Clear the entries (not required, but cleaner). */
+ next->entries = 0;
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = last->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ WT_RET(__rec_split_write(session, r, last, &r->dsk, 0));
+
+ /*
+ * Set the caller's entry count and buffer information for the
+ * next chunk. We only get here if we're not splitting or have
+ * already split, so it's split-size chunks from here on out.
+ */
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ break;
+ case SPLIT_TRACKING_RAW:
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_raw_worker --
+ * Handle the raw compression page reconciliation bookkeeping.
+ */
+static int
+__rec_split_raw_worker(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *last, *next;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COMPRESSOR *compressor;
+ WT_DECL_RET;
+ WT_ITEM *dst, *write_ref;
+ WT_PAGE_HEADER *dsk, *dsk_dst;
+ WT_SESSION *wt_session;
+ size_t corrected_page_size, len, result_len;
+ uint64_t recno;
+ uint32_t entry, i, result_slots, slots;
+ int last_block;
+ uint8_t *dsk_start;
+
+ wt_session = (WT_SESSION *)session;
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ unpack = &_unpack;
+ compressor = btree->compressor;
+ dst = &r->raw_destination;
+ dsk = r->dsk.mem;
+
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next];
+ next = last + 1;
+
+ /*
+ * Build arrays of offsets and cumulative counts of cells and rows in
+ * the page: the offset is the byte offset to the possible split-point
+ * (adjusted for an initial chunk that cannot be compressed), entries
+ * is the cumulative page entries covered by the byte offset, recnos is
+ * the cumulative rows covered by the byte offset.
+ */
+ if (r->entries >= r->raw_max_slots) {
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ r->raw_max_slots = 0;
+
+ i = r->entries + 100;
+ WT_RET(__wt_calloc_def(session, i, &r->raw_entries));
+ WT_RET(__wt_calloc_def(session, i, &r->raw_offsets));
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
+ r->raw_max_slots = i;
+ }
+
+ /*
+ * We're going to walk the disk image, which requires setting the
+ * number of entries.
+ */
+ dsk->u.entries = r->entries;
+
+ /*
+ * We track the record number at each column-store split point, set an
+ * initial value.
+ */
+ recno = 0;
+ if (dsk->type == WT_PAGE_COL_VAR)
+ recno = last->recno;
+
+ entry = slots = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++entry;
+
+ /*
+ * Row-store pages can split at keys, but not at values,
+ * column-store pages can split at values.
+ */
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk->type == WT_PAGE_COL_INT) {
+ recno = unpack->v;
+ break;
+ }
+ if (dsk->type == WT_PAGE_COL_VAR) {
+ recno += __wt_cell_rle(unpack);
+ break;
+ }
+ r->raw_entries[slots] = entry;
+ continue;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We can't compress the first 64B of the block (it must be
+ * written without compression), and a possible split point
+ * may appear in that 64B; keep it simple, ignore the first
+ * allocation size of data, anybody splitting smaller than
+		 * that (as calculated before compression) is doing it wrong.
+ */
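+		/*
+		 * For example (added for illustration): with a 4KB allocation
+		 * size, no candidate split offset is recorded for cells that
+		 * begin within the first 4KB of the uncompressed image.
+		 */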
+ if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
+
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ r->raw_recnos[slots] = recno;
+ r->raw_entries[slots] = entry;
+ }
+
+ /*
+ * If we haven't managed to find at least one split point, we're done,
+ * don't bother calling the underlying compression function.
+ */
+ if (slots == 0) {
+ result_len = 0;
+ result_slots = 0;
+ goto no_slots;
+ }
+
+ /* The slot at array's end is the total length of the data. */
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP);
+
+ /*
+ * Allocate a destination buffer. If there's a pre-size function, use
+ * it to determine the destination buffer's minimum size, otherwise the
+ * destination buffer is documented to be at least the maximum object
+ * size.
+ *
+ * The destination buffer really only needs to be large enough for the
+ * target block size, corrected for the requirements of the underlying
+ * block manager. If the target block size is 8KB, that's a multiple
+ * of 512B and so the underlying block manager is fine with it. But...
+ * we don't control what the pre_size method returns us as a required
+ * size, and we don't want to document the compress_raw method has to
+ * skip bytes in the buffer because that's confusing, so do something
+ * more complicated. First, find out how much space the compress_raw
+ * function might need, either the value returned from pre_size, or the
+ * maximum object size. Add the compress-skip bytes, and then correct
+ * that value for the underlying block manager. As a result, we have
+ * a destination buffer that's the right "object" size when calling the
+ * compress_raw method, and there are bytes in the header just for us.
+ */
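+	/*
+	 * Illustrative numbers (added for clarity, the sizes are
+	 * hypothetical): if pre_size reports 32KB, we size the buffer at 32KB
+	 * plus WT_BLOCK_COMPRESS_SKIP bytes, let the block manager round that
+	 * up, and pass compress_raw a destination that starts
+	 * WT_BLOCK_COMPRESS_SKIP bytes into the buffer, leaving the header
+	 * bytes for our own use.
+	 */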
+ if (compressor->pre_size == NULL)
+ result_len = r->page_size_max;
+ else
+ WT_RET(compressor->pre_size(compressor, wt_session,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ (size_t)r->raw_offsets[slots], &result_len));
+ corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, dst, corrected_page_size));
+
+ /*
+ * Copy the header bytes into the destination buffer, then call the
+ * compression function.
+ */
+ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
+ ret = compressor->compress_raw(compressor, wt_session,
+ r->page_size_max, btree->split_pct,
+ WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ r->raw_offsets, slots,
+ (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ result_len, no_more_rows, &result_len, &result_slots);
+ switch (ret) {
+ case EAGAIN:
+ /*
+ * The compression function wants more rows; accumulate and
+ * retry.
+ *
+ * Reset the resulting slots count, just in case the compression
+ * function modified it before giving up.
+ */
+ result_slots = 0;
+ break;
+ case 0:
+ /*
+ * If the compression function returned zero result slots, it's
+ * giving up and we write the original data. (This is a pretty
+ * bad result: we've not done compression on a block much larger
+ * than the maximum page size, but once compression gives up,
+ * there's not much else we can do.)
+ *
+ * If the compression function returned non-zero result slots,
+ * we were successful and have a block to write.
+ */
+ if (result_slots == 0) {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ /*
+ * If there are no more rows, we can write the original
+ * data from the original buffer.
+ */
+ if (no_more_rows)
+ break;
+
+ /*
+ * Copy the original data to the destination buffer, as
+ * if the compression function simply copied it. Take
+ * all but the last row of the original data (the last
+ * row has to be set as the key for the next block).
+ */
+ result_slots = slots - 1;
+ result_len = r->raw_offsets[result_slots];
+ WT_RET(__wt_buf_grow(
+ session, dst, result_len + WT_BLOCK_COMPRESS_SKIP));
+ memcpy((uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ result_len);
+
+ /*
+ * Mark it as uncompressed so the standard compression
+ * function is called before the buffer is written.
+ */
+ last->already_compressed = 0;
+ } else {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_ok);
+
+ /*
+ * If there are more rows and the compression function
+ * consumed all of the current data, there are problems:
+ * First, with row-store objects, we're potentially
+ * skipping updates, we must have a key for the next
+ * block so we know with what block a skipped update is
+ * associated. Second, if the compression function
+ * compressed all of the data, we're not pushing it
+ * hard enough (unless we got lucky and gave it exactly
+ * the right amount to work with, which is unlikely).
+ * Handle both problems by accumulating more data any
+ * time we're not writing the last block and compression
+ * ate all of the rows.
+ */
+ if (result_slots == slots && !no_more_rows)
+ result_slots = 0;
+ else
+ last->already_compressed = 1;
+ }
+ break;
+ default:
+ return (ret);
+ }
+
+no_slots:
+ /*
+ * Check for the last block we're going to write: if no more rows and
+ * we failed to compress anything, or we compressed everything, it's
+ * the last block.
+ */
+ last_block = no_more_rows &&
+ (result_slots == 0 || result_slots == slots);
+
+ if (result_slots != 0) {
+ /*
+ * We have a block, finalize the header information.
+ */
+ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst = dst->mem;
+ dsk_dst->recno = last->recno;
+ dsk_dst->mem_size =
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst->u.entries = r->raw_entries[result_slots - 1];
+
+ /*
+ * There is likely a remnant in the working buffer that didn't
+ * get compressed; copy it down to the start of the buffer and
+ * update the starting record number, free space and so on.
+ * !!!
+ * Note use of memmove, the source and destination buffers can
+ * overlap.
+ */
+ len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
+
+ r->entries -= r->raw_entries[result_slots - 1];
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+
+ /*
+ * Set the key for the next block (before writing the block, a
+ * key range is needed in that code).
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ next->recno = r->raw_recnos[result_slots];
+ break;
+ case WT_PAGE_COL_VAR:
+ next->recno = r->raw_recnos[result_slots - 1];
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ next->recno = 0;
+ if (!last_block) {
+ /*
+ * Confirm there was uncompressed data remaining
+ * in the buffer, we're about to read it for the
+ * next chunk's initial key.
+ */
+ WT_ASSERT(session, len > 0);
+ WT_RET(__rec_split_row_promote_cell(
+ session, dsk, &next->key));
+ }
+ break;
+ }
+ write_ref = dst;
+ } else if (no_more_rows) {
+ /*
+ * Compression failed and there are no more rows to accumulate,
+ * write the original buffer instead.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ dsk->recno = last->recno;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->u.entries = r->entries;
+
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ write_ref = &r->dsk;
+ last->already_compressed = 0;
+ } else {
+ /*
+ * Compression failed, there are more rows to accumulate and the
+ * compression function wants to try again; increase the size of
+ * the "page" and try again after we accumulate some more rows.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary);
+
+ len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ corrected_page_size = r->page_size * 2;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+ r->page_size *= 2;
+ r->first_free = (uint8_t *)r->dsk.mem + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ return (0);
+ }
+
+ /* We have a block, update the boundary counter. */
+ ++r->bnd_next;
+
+ /*
+ * If we are writing the whole page in our first/only attempt, it might
+ * be a checkpoint (checkpoints are only a single page, by definition).
+ * Further, checkpoints aren't written here, the wrapup functions do the
+ * write, and they do the write from the original buffer location. If
+ * it's a checkpoint and the block isn't in the right buffer, copy it.
+ *
+ * If it's not a checkpoint, write the block.
+ */
+ if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (write_ref == dst)
+ WT_RET(__wt_buf_set(
+ session, &r->dsk, dst->mem, dst->size));
+ } else
+ WT_RET(
+ __rec_split_write(session, r, last, write_ref, last_block));
+ return (0);
+}
+
+/*
+ * __rec_raw_decompress --
+ * Decompress a raw-compressed image.
+ */
+static int
+__rec_raw_decompress(
+ WT_SESSION_IMPL *session, const void *image, size_t size, void *retp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER const *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ dsk = image;
+
+ /*
+ * We skipped an update and we can't write a block, but unfortunately,
+ * the block has already been compressed. Decompress the block so we
+ * can subsequently re-instantiate it in memory.
+ */
+ WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp));
+ memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(btree->compressor,
+ &session->iface,
+ (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP,
+ size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
+ dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+ WT_ERR(__wt_illegal_value(session, btree->dhandle->name));
+
+ WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp));
+ WT_ASSERT(session, __wt_verify_dsk_image(
+ session, "[raw evict split]", tmp->data, dsk->mem_size, 0) == 0);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_raw --
+ * Raw compression split routine.
+ */
+static inline int
+__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ return (__rec_split_raw_worker(session, r, 0));
+}
+
+/*
+ * __rec_split_finish_std --
+ * Finish processing a page, standard version.
+ */
+static int
+__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_PAGE_HEADER *dsk;
+
+ /* Adjust the boundary information based on our split status. */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY:
+ case SPLIT_MAX:
+ /*
+ * We never split, the reconciled page fit into a maximum page
+ * size. Change the first boundary slot to represent the full
+ * page (the first boundary slot is largely correct, just update
+ * the number of entries).
+ */
+ r->bnd_next = 0;
+ break;
+ case SPLIT_TRACKING_OFF:
+ /*
+ * If we have already split, or aren't tracking boundaries, put
+ * the remaining data in the next boundary slot.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ break;
+ case SPLIT_TRACKING_RAW:
+ /*
+ * We were configured for raw compression, but never actually
+ * wrote anything.
+ */
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We only arrive here with no entries to write if the page was entirely
+ * empty, and if the page is empty, we merge it into its parent during
+ * the parent's reconciliation. A page with skipped updates isn't truly
+ * empty, continue on.
+ */
+ if (r->entries == 0 && r->skip_next == 0)
+ return (0);
+
+ /* Set the boundary reference and increment the count. */
+ bnd = &r->bnd[r->bnd_next++];
+ bnd->entries = r->entries;
+
+ /* Finalize the header information. */
+ dsk = r->dsk.mem;
+ dsk->recno = bnd->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+
+ /* If this is a checkpoint, we're done, otherwise write the page. */
+ return (
+ __rec_is_checkpoint(r, bnd) ? 0 :
+ __rec_split_write(session, r, bnd, &r->dsk, 1));
+}
+
+/*
+ * __rec_split_finish --
+ * Finish processing a page.
+ */
+static int
+__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /* We're done reconciling - write the final page */
+ if (r->raw_compression && r->entries != 0) {
+ while (r->entries != 0)
+ WT_RET(__rec_split_raw_worker(session, r, 1));
+ } else
+ WT_RET(__rec_split_finish_std(session, r));
+
+ return (0);
+}
+
+/*
+ * __rec_split_fixup --
+ * Fix up after crossing the maximum page boundary.
+ */
+static int
+__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ uint32_t i, len;
+ uint8_t *dsk_start;
+
+ /*
+ * When we overflow physical limits of the page, we walk the list of
+ * split chunks we've created and write those pages out, then update
+ * the caller's information.
+ */
+ btree = S2BT(session);
+
+ /*
+ * The data isn't laid out on a page boundary or nul padded; copy it to
+ * a clean, aligned, padded buffer before writing it.
+ *
+ * Allocate a scratch buffer to hold the new disk image. Copy the
+ * WT_PAGE_HEADER header onto the scratch buffer, most of the header
+ * information remains unchanged between the pages.
+ */
+ WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp));
+ dsk = tmp->mem;
+ memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
+
+ /*
+ * For each split chunk we've created, update the disk image and copy
+ * it into place.
+ */
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
+ /* Copy the page contents to the temporary buffer. */
+ len = WT_PTRDIFF32((bnd + 1)->start, bnd->start);
+ memcpy(dsk_start, bnd->start, len);
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = bnd->recno;
+ dsk->u.entries = bnd->entries;
+ dsk->mem_size =
+ tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
+ WT_ERR(__rec_split_write(session, r, bnd, tmp, 0));
+ }
+
+ /*
+ * There is probably a remnant in the working buffer that didn't get
+ * written; copy it down to the beginning of the working buffer, and
+ * update the starting record number.
+ *
+ * Confirm the remnant is no larger than the available split buffer.
+ *
+ * Fix up our caller's information.
+ */
+ len = WT_PTRDIFF32(r->first_free, bnd->start);
+ if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
+ WT_PANIC_ERR(session, EINVAL,
+ "Reconciliation remnant too large for the split buffer");
+
+ dsk = r->dsk.mem;
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, bnd->start, len);
+
+ r->entries -= r->total_entries;
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_write --
+ * Write a disk block out for the split helper functions.
+ */
+static int
+__rec_split_write(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int last_block)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+ WT_UPD_SKIPPED *skip;
+ size_t addr_size;
+ uint32_t bnd_slot, i, j;
+ int cmp;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ dsk = buf->mem;
+ page = r->page;
+ mod = page->modify;
+
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Set the zero-length value flag in the page header. */
+ if (dsk->type == WT_PAGE_ROW_LEAF) {
+ F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
+
+ if (r->entries != 0 && r->all_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_ALL);
+ if (r->entries != 0 && !r->any_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
+ }
+
+ /* Initialize the address (set the page type for the parent). */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ bnd->addr.type = WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ bnd->addr.type = r->ovfl_items ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ bnd->addr.type = WT_ADDR_INT;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ bnd->size = (uint32_t)buf->size;
+ bnd->cksum = 0;
+
+ /*
+ * Check if we've skipped updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a skipped
+ * update that doesn't belong to the block, they're in sorted order.
+ *
+ * This code requires a key be filled in for the next block (or the
+ * last block flag be set, if there's no next block).
+ */
+ for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
+ /* The last block gets all remaining skipped updates. */
+ if (last_block) {
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ continue;
+ }
+
+ /*
+ * Get the skipped update's key and compare it with this block's
+ * key range. If the skipped update list belongs with the block
+ * we're about to write, move it to the per-block memory. Check
+ * only to the first update that doesn't go with the block, they
+ * must be in sorted order.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
+ goto skip_check_complete;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, skip->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &(bnd + 1)->key, &cmp));
+ if (cmp >= 0)
+ goto skip_check_complete;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ }
+
+skip_check_complete:
+ /*
+ * If there are updates that weren't moved to the block, shuffle them to
+ * the beginning of the cached list (we maintain the skipped updates in
+ * sorted order, new skipped updates must be appended to the list).
+ */
+ for (j = 0; i < r->skip_next; ++j, ++i)
+ r->skip[j] = r->skip[i];
+ r->skip_next = j;
+
+ /*
+ * If we had to skip updates in order to build this disk image, we can't
+ * actually write it. Instead, we will re-instantiate the page using the
+ * disk image and the list of updates we skipped.
+ */
+ if (bnd->skip != NULL) {
+ /*
+ * If the buffer is compressed (raw compression was configured),
+ * we have to decompress it so we can instantiate it later. It's
+ * a slow and convoluted path, but it's also a rare one and it's
+ * not worth making it faster. Else, the disk image is ready,
+ * copy it into place for later. It's possible the disk image
+		 * has no items; we have to flag that for verification, it's
+		 * a special case since reading/writing empty pages isn't
+		 * generally allowed.
+ */
+ if (bnd->already_compressed)
+ WT_ERR(__rec_raw_decompress(
+ session, buf->data, buf->size, &bnd->dsk));
+ else {
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &bnd->dsk));
+ WT_ASSERT(session, __wt_verify_dsk_image(session,
+ "[evict split]", buf->data, buf->size, 1) == 0);
+ }
+ goto done;
+ }
+
+ /*
+ * If we wrote this block before, re-use it. Pages get written in the
+	 * same block order every time, so only check the appropriate slot.
+	 * The expensive part of this test is the checksum, so only do that
+	 * work when there has been or will be a reconciliation of this page
+	 * involving split pages.  This test isn't perfect: we're doing a
+	 * checksum if a previous reconciliation of the page split or if we
+	 * will split this time, but that test won't calculate a checksum on
+	 * the first block the first time the page splits.
+ */
+ bnd_slot = (uint32_t)(bnd - r->bnd);
+ if (bnd_slot > 1 ||
+ (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && mod->mod_multi != NULL)) {
+ /*
+ * There are page header fields which need to be cleared to get
+ * consistent checksums: specifically, the write generation and
+ * the memory owned by the block manager. We are reusing the
+ * same buffer space each time, clear it before calculating the
+ * checksum.
+ */
+ dsk->write_gen = 0;
+ memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
+ bnd->cksum = __wt_cksum(buf->data, buf->size);
+
+ if (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) &&
+ mod->mod_multi_entries > bnd_slot) {
+ multi = &mod->mod_multi[bnd_slot];
+ if (multi->size == bnd->size &&
+ multi->cksum == bnd->cksum) {
+ multi->addr.reuse = 1;
+ bnd->addr = multi->addr;
+
+ WT_STAT_FAST_DATA_INCR(session, rec_page_match);
+ goto done;
+ }
+ }
+ }
+
+ WT_ERR(__wt_bt_write(session,
+ buf, addr, &addr_size, 0, bnd->already_compressed));
+ WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
+ bnd->addr.size = (uint8_t)addr_size;
+
+done:
+err: __wt_scr_free(&key);
+ return (ret);
+}
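+
+/*
+ * A minimal illustrative sketch, not taken from this file: the skipped-update
+ * handling in __rec_split_write above moves the leading entries that belong
+ * to the block being written, then compacts the remaining entries to the
+ * front of the cached array while preserving their order.  The same idiom,
+ * reduced to a plain integer array:
+ */
+#if 0
+static uint32_t
+__compact_front_example(int *list, uint32_t first_keep, uint32_t count)
+{
+	uint32_t i, j;
+
+	/* Shift list[first_keep .. count - 1] down to list[0 ..]. */
+	for (i = first_keep, j = 0; i < count; ++i, ++j)
+		list[j] = list[i];
+	return (j);		/* The new number of entries. */
+}
+#endif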
+
+/*
+ * __wt_bulk_init --
+ * Bulk insert initialization.
+ */
+int
+__wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_PAGE_INDEX *pindex;
+ WT_RECONCILE *r;
+ uint64_t recno;
+
+ btree = S2BT(session);
+ /*
+ * Bulk-load is only permitted on newly created files, not any empty
+ * file -- see the checkpoint code for a discussion.
+ */
+ if (!btree->bulk_load_ok)
+ WT_RET_MSG(session, EINVAL,
+ "bulk-load is only possible for newly created trees");
+
+ /* Set a reference to the empty leaf page. */
+ pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ cbulk->ref = pindex->index[0];
+ cbulk->leaf = cbulk->ref->page;
+
+ WT_RET(
+ __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile));
+ r = cbulk->reconcile;
+ r->is_bulk_load = 1;
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ recno = 1;
+ break;
+ case BTREE_ROW:
+ recno = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (__rec_split_init(
+ session, r, cbulk->leaf, recno, btree->maxleafpage));
+}
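+
+/*
+ * An application-level usage sketch, not taken from this file; the table and
+ * function names below are made up for illustration.  Bulk-load is driven by
+ * opening a cursor with the "bulk" configuration on a newly created object,
+ * which is what ends up calling __wt_bulk_init above.
+ */
+#if 0
+static int
+__bulk_load_usage_example(WT_SESSION *wt_session)
+{
+	WT_CURSOR *c;
+
+	WT_RET(wt_session->create(wt_session,
+	    "table:bulk_example", "key_format=S,value_format=S"));
+	WT_RET(wt_session->open_cursor(
+	    wt_session, "table:bulk_example", NULL, "bulk", &c));
+	c->set_key(c, "key");
+	c->set_value(c, "value");
+	WT_RET(c->insert(c));
+	return (c->close(c));
+}
+#endif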
+
+/*
+ * __wt_bulk_wrapup --
+ * Bulk insert cleanup.
+ */
+int
+__wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_PAGE *parent;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ if (cbulk->entry != 0)
+ __rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ break;
+ case BTREE_COL_VAR:
+ if (cbulk->rle != 0)
+ WT_RET(__wt_bulk_insert_var(session, cbulk));
+ break;
+ case BTREE_ROW:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__rec_split_finish(session, r));
+ WT_RET(__rec_write_wrapup(session, r, r->page));
+
+ /* Mark the page's parent dirty. */
+ parent = r->ref->home;
+ WT_RET(__wt_page_modify_init(session, parent));
+ __wt_page_modify_set(session, parent);
+
+ __rec_destroy(session, &cbulk->reconcile);
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_row --
+ * Row-store bulk insert.
+ */
+int
+__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_KV *key, *val;
+ WT_RECONCILE *r;
+ int ovfl_key;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ key = &r->k;
+ val = &r->v;
+ WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
+ cursor->key.data, cursor->key.size, &ovfl_key));
+ WT_RET(__rec_cell_build_val(session, r, /* Build value cell */
+ cursor->value.data, cursor->value.size, (uint64_t)0));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else {
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key written
+ * to the new page, and (unless we're already working
+ * with an overflow key), rebuild the key without prefix
+ * compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+ return (0);
+}
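+
+/*
+ * For illustration, not from the original source: prefix compression stores
+ * only the suffix of a key that differs from the previous key.  After
+ * writing "employee-00017", the key "employee-00018" can be stored as a
+ * prefix count of 13 plus the suffix "8".  The first key on each new page
+ * has no previous key to share a prefix with, which is why the code above
+ * clears r->key_pfx_compress and rebuilds the key in full after a split.
+ */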
+
+/*
+ * __rec_col_fix_bulk_insert_split_check --
+ * Check if a bulk-loaded fixed-length column store page needs to split.
+ */
+static inline int
+__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_RECONCILE *r;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ if (cbulk->entry == cbulk->nrecs) {
+ if (cbulk->entry != 0) {
+ /*
+ * If everything didn't fit, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+ }
+ cbulk->entry = 0;
+ cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ return (0);
+}
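+
+/*
+ * For illustration, not from the original source: fixed-length column-store
+ * pages are packed bit strings, so the split check above counts entries
+ * rather than bytes.  With bitcnt=3 and, say, 8KB of available space,
+ * WT_FIX_BYTES_TO_ENTRIES yields roughly (8192 * 8) / 3 = 21845 records per
+ * page, and __bitstr_size(entries * bitcnt) converts an entry count back
+ * into the number of bytes actually consumed.
+ */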
+
+/*
+ * __wt_bulk_insert_fix --
+ * Fixed-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (cbulk->bitmap) {
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ page_entries =
+ WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+ }
+
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ __bit_setv(r->first_free,
+ cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]);
+ ++cbulk->entry;
+ ++r->recno;
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_var --
+ * Variable-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ /*
+	 * Store the bulk cursor's last buffer, not the current value: we're
+	 * creating a duplicate count, which means we want the previously seen
+	 * value, not the current one.
+ */
+ val = &r->v;
+ WT_RET(__rec_cell_build_val(
+ session, r, cbulk->last.data, cbulk->last.size, cbulk->rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, cbulk->rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += cbulk->rle;
+
+ return (0);
+}
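+
+/*
+ * For illustration, not from the original source: the bulk cursor accumulates
+ * a run-length count for consecutive duplicate values and calls this function
+ * only when the run ends.  Inserting the values A, A, A, B produces a single
+ * cell (A, rle=3) followed later by a cell for B, which is why the code
+ * stores cbulk->last (the previously seen value) rather than the value
+ * currently in the cursor.
+ */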
+
+/*
+ * __rec_vtype --
+ * Return a value cell's address type.
+ */
+static inline u_int
+__rec_vtype(WT_ADDR *addr)
+{
+ if (addr->type == WT_ADDR_INT)
+ return (WT_CELL_ADDR_INT);
+ if (addr->type == WT_ADDR_LEAF)
+ return (WT_CELL_ADDR_LEAF);
+ return (WT_CELL_ADDR_LEAF_NO);
+}
+
+/*
+ * __rec_col_int --
+ * Reconcile a column-store internal page.
+ */
+static int
+__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_DECL_RET;
+ WT_KV *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ int hazard, state;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ val = &r->v;
+ vpack = &_vpack;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_intl_recno, btree->maxintlpage));
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /* Update the starting record number in case we split. */
+ r->recno = ref->key.recno;
+
+ /*
+ * Modified child.
+ * The page may be emptied or internally created during a split.
+ * Deleted/split pages are merged into the parent and discarded.
+ */
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = NULL;
+ child = ref->page;
+ if (state != 0) {
+ /*
+			 * Currently the only non-zero returned state possible
+ * for a column-store page is child-modified (all other
+ * states are part of the fast-truncate support, which
+ * is row-store only).
+ */
+ WT_ASSERT(session, state == WT_CHILD_MODIFIED);
+
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Column-store pages are almost never empty, as
+ * discarding a page would remove a chunk of the
+ * name space. The exceptions are pages created
+ * when the tree is created, and never filled.
+ */
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ WT_ERR(__rec_col_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+ /*
+ * Build the value cell. The child page address is in one of 3
+ * places: if the page was replaced, the page's modify structure
+ * references it and we built the value cell just above in the
+ * switch statement. Else, the WT_REF->addr reference points to
+ * an on-page cell or an off-page WT_ADDR structure: if it's an
+		 * on-page cell, we copy it from the page; otherwise we build
+		 * a new cell.
+ */
+ if (addr == NULL && __wt_off_page(page, ref->addr))
+ addr = ref->addr;
+ if (addr == NULL) {
+ __wt_cell_unpack(ref->addr, vpack);
+ val->buf.data = ref->addr;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+ } else
+ __rec_cell_build_addr(r, addr->addr, addr->size,
+ __rec_vtype(addr), ref->key.recno);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_ERR(__rec_split_raw(session, r));
+ else
+ WT_ERR(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_col_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Update the starting record number in case we split. */
+ r->recno = multi->key.recno;
+
+ /* Build the value cell. */
+ addr = &multi->addr;
+ __rec_cell_build_addr(r,
+ addr->addr, addr->size, __rec_vtype(addr), r->recno);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ }
+ return (0);
+}
+
+/*
+ * __rec_col_fix --
+ * Reconcile a fixed-width, column-store leaf page.
+ */
+static int
+__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ uint64_t recno;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /* Update any changes to the original on-page data items. */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd != NULL)
+ __bit_setv_recno(page, WT_INSERT_RECNO(ins),
+ btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ }
+
+ /* Copy the updated, disk-image bytes into place. */
+ memcpy(r->first_free, page->pg_fix_bitf,
+ __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt));
+
+	/* Calculate how many additional entries fit on the page. */
+ entry = page->pg_fix_entries;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(
+ btree, r->space_avail) - page->pg_fix_entries;
+ r->recno += entry;
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (;;) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ for (recno = WT_INSERT_RECNO(ins);
+ nrecs > 0 && r->recno < recno;
+ --nrecs, ++entry, ++r->recno)
+ __bit_setv(
+ r->first_free, entry, btree->bitcnt, 0);
+
+ if (nrecs > 0) {
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ --nrecs;
+ ++entry;
+ ++r->recno;
+ break;
+ }
+
+ /*
+ * If everything didn't fit, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ }
+
+ /* Update the counters. */
+ __rec_incr(
+ session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+}
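+
+/*
+ * For illustration, not from the original source: the append loop above fills
+ * any gaps in the record-number name space with zero bits.  If the on-page
+ * records end at record 100 and the application appended record 105, records
+ * 101-104 are written as 0 before the appended value is set, keeping the
+ * fixed-length column store dense.
+ */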
+
+/*
+ * __rec_col_fix_slvg --
+ * Reconcile a fixed-width, column-store leaf page created during salvage.
+ */
+static int
+__rec_col_fix_slvg(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ uint64_t page_start, page_take;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ /*
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it during salvage and clean up after it here because it doesn't
+ * cost much and future column-store formats or operations might allow
+ * for fixed-length format ranges to overlap during salvage, and I
+ * don't want to have to retrofit the code later.
+ */
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /* We may not be taking all of the entries on the original page. */
+ page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take;
+ page_start = salvage->skip == 0 ? 0 : salvage->skip;
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+
+ for (; nrecs > 0 && salvage->missing > 0;
+ --nrecs, --salvage->missing, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt, 0);
+
+ for (; nrecs > 0 && page_take > 0;
+ --nrecs, --page_take, ++page_start, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ __bit_getv(page->pg_fix_bitf,
+ (uint32_t)page_start, btree->bitcnt));
+
+ r->recno += entry;
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /*
+ * We can't split during salvage -- if everything didn't fit, it's
+ * all gone wrong.
+ */
+ if (salvage->missing != 0 || page_take != 0)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(page->type));
+
+ /* Write the page. */
+ return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_var_helper --
+ * Create a column-store variable length record cell and write it onto a
+ * page.
+ */
+static int
+__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_SALVAGE_COOKIE *salvage,
+ WT_ITEM *value, int deleted, uint8_t overflow_type, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * Occasionally, salvage needs to discard records from the beginning or
+	 * end of the page, and because the items may be part of an RLE cell,
+	 * do the adjustments here.  It's not a mistake that we don't bother
+	 * telling our caller we've handled all the records from the page we
+	 * care about and could quit processing the page: salvage is a rare
+	 * operation and I don't want to complicate our caller's loop.
+ */
+ if (salvage != NULL) {
+ if (salvage->done)
+ return (0);
+ if (salvage->skip != 0) {
+ if (rle <= salvage->skip) {
+ salvage->skip -= rle;
+ return (0);
+ }
+ rle -= salvage->skip;
+ salvage->skip = 0;
+ }
+ if (salvage->take != 0) {
+ if (rle <= salvage->take)
+ salvage->take -= rle;
+ else {
+ rle = salvage->take;
+ salvage->take = 0;
+ }
+ if (salvage->take == 0)
+ salvage->done = 1;
+ }
+ }
+
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else if (overflow_type) {
+ val->cell_len = __wt_cell_pack_ovfl(
+ &val->cell, overflow_type, rle, value->size);
+ val->buf.data = value->data;
+ val->buf.size = value->size;
+ val->len = val->cell_len + value->size;
+ } else
+ WT_RET(__rec_cell_build_val(
+ session, r, value->data, value->size, rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ if (!deleted && !overflow_type && btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += rle;
+
+ return (0);
+}
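+
+/*
+ * For illustration, not from the original source: the salvage adjustments
+ * above trim a run-length encoded cell against the salvage cookie.  Given a
+ * cell with rle=10, salvage->skip=3 and salvage->take=5, the first 3 records
+ * are dropped, the RLE count is reduced to 5 so only the wanted records are
+ * written, and salvage->done is set so any later cells on the page are
+ * ignored.
+ */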
+
+/*
+ * __rec_col_var --
+ * Reconcile a variable-width column-store leaf page.
+ */
+static int
+__rec_col_var(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_COL *cip;
+ WT_DECL_ITEM(orig);
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_ITEM *last;
+ WT_UPDATE *upd;
+ uint64_t n, nrepeat, repeat_count, rle, src_recno;
+ uint32_t i, size;
+ int deleted, last_deleted, orig_deleted, update_no_copy;
+ const void *data;
+
+ btree = S2BT(session);
+ last = r->last;
+ vpack = &_vpack;
+
+ WT_RET(__wt_scr_alloc(session, 0, &orig));
+ data = NULL;
+ size = 0;
+ upd = NULL;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_var_recno, btree->maxleafpage));
+
+ /*
+ * The salvage code may be calling us to reconcile a page where there
+ * were missing records in the column-store name space. If taking the
+	 * first record from the page, it might be a deleted record, so we
+ * have to give the RLE code a chance to figure that out. Else, if
+ * not taking the first record from the page, write a single element
+ * representing the missing records onto a new page. (Don't pass the
+ * salvage cookie to our helper function in this case, we're handling
+ * one of the salvage cookie fields on our own, and we don't need the
+ * helper function's assistance.)
+ */
+ rle = 0;
+ last_deleted = 0;
+ if (salvage != NULL && salvage->missing != 0) {
+ if (salvage->skip == 0) {
+ rle = salvage->missing;
+ last_deleted = 1;
+
+ /*
+ * Correct the number of records we're going to "take",
+ * pretending the missing records were on the page.
+ */
+ salvage->take += salvage->missing;
+ } else
+ WT_ERR(__rec_col_var_helper(
+ session, r, NULL, NULL, 1, 0, salvage->missing));
+ }
+
+ /*
+ * We track two data items through this loop: the previous (last) item
+ * and the current item: if the last item is the same as the current
+ * item, we increment the RLE count for the last item; if the last item
+ * is different from the current item, we write the last item onto the
+ * page, and replace it with the current item. The r->recno counter
+ * tracks records written to the page, and is incremented by the helper
+ * function immediately after writing records to the page. The record
+ * number of our source record, that is, the current item, is maintained
+ * in src_recno.
+ */
+ src_recno = r->recno + rle;
+
+ /* For each entry in the in-memory page... */
+ WT_COL_FOREACH(page, cip, i) {
+ ovfl_state = OVFL_IGNORE;
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ nrepeat = 1;
+ ins = NULL;
+ orig_deleted = 1;
+ } else {
+ __wt_cell_unpack(cell, vpack);
+ nrepeat = __wt_cell_rle(vpack);
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
+
+ /*
+ * If the original value is "deleted", there's no value
+ * to compare, we're done.
+ */
+ orig_deleted = vpack->type == WT_CELL_DEL ? 1 : 0;
+ if (orig_deleted)
+ goto record_loop;
+
+ /*
+ * Overflow items are tricky: we don't know until we're
+ * finished processing the set of values if we need the
+ * overflow value or not. If we don't use the overflow
+ * item at all, we have to discard it from the backing
+ * file, otherwise we'll leak blocks on the checkpoint.
+ * That's safe because if the backing overflow value is
+ * still needed by any running transaction, we'll cache
+ * a copy in the reconciliation tracking structures.
+ *
+ * Regardless, we avoid copying in overflow records: if
+ * there's a WT_INSERT entry that modifies a reference
+ * counted overflow record, we may have to write copies
+ * of the overflow record, and in that case we'll do the
+ * comparisons, but we don't read overflow items just to
+ * see if they match records on either side.
+ */
+ if (vpack->ovfl) {
+ ovfl_state = OVFL_UNUSED;
+ goto record_loop;
+ }
+
+ /*
+ * If data is Huffman encoded, we have to decode it in
+ * order to compare it with the last item we saw, which
+ * may have been an update string. This guarantees we
+ * find every single pair of objects we can RLE encode,
+ * including applications updating an existing record
+ * where the new value happens (?) to match a Huffman-
+ * encoded value in a previous or next record.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_COL_VAR, vpack, orig));
+ }
+
+record_loop: /*
+ * Generate on-page entries: loop repeat records, looking for
+ * WT_INSERT entries matching the record number. The WT_INSERT
+		 * lists are in sorted order, so check only the next one.
+ */
+ for (n = 0;
+ n < nrepeat; n += repeat_count, src_recno += repeat_count) {
+ upd = NULL;
+ if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
+ WT_ERR(__rec_txn_read(
+ session, r, ins, NULL, vpack, &upd));
+ ins = WT_SKIP_NEXT(ins);
+ }
+ if (upd != NULL) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = 0;
+
+ /*
+ * If doing update save and restore, there's an
+ * update that's not globally visible, and the
+ * underlying value is a removed overflow value,
+ * we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring the on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case: if we remove an underlying
+ * overflow object, checkpoint reconciliation
+ * should never see it again, there should be a
+ * visible update in the way.
+ *
+ * Write a placeholder.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ data = "@";
+ size = 1;
+ } else {
+ update_no_copy = 0; /* Maybe data copy */
+
+ /*
+ * The repeat count is the number of records up
+ * to the next WT_INSERT record, or up to the
+ * end of the entry if we have no more WT_INSERT
+ * records.
+ */
+ if (ins == NULL)
+ repeat_count = nrepeat - n;
+ else
+ repeat_count =
+ WT_INSERT_RECNO(ins) - src_recno;
+
+ deleted = orig_deleted;
+ if (deleted)
+ goto compare;
+
+ /*
+ * If we are handling overflow items, use the
+ * overflow item itself exactly once, after
+ * which we have to copy it into a buffer and
+ * from then on use a complete copy because we
+ * are re-creating a new overflow record each
+ * time.
+ */
+ switch (ovfl_state) {
+ case OVFL_UNUSED:
+ /*
+ * An as-yet-unused overflow item.
+ *
+ * We're going to copy the on-page cell,
+ * write out any record we're tracking.
+ */
+ if (rle != 0) {
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last,
+ last_deleted, 0, rle));
+ rle = 0;
+ }
+
+ last->data = vpack->data;
+ last->size = vpack->size;
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, 0,
+ WT_CELL_VALUE_OVFL, repeat_count));
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ ovfl_state = OVFL_USED;
+ continue;
+ case OVFL_USED:
+ /*
+				 * Original is an overflow item; we used
+				 * the on-page cell once and now we need
+				 * another copy; read it into memory.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_COL_VAR, vpack, orig));
+
+ ovfl_state = OVFL_IGNORE;
+ /* FALLTHROUGH */
+ case OVFL_IGNORE:
+ /*
+ * Original is an overflow item and we
+ * were forced to copy it into memory,
+ * or the original wasn't an overflow
+ * item; use the data copied into orig.
+ */
+ data = orig->data;
+ size = (uint32_t)orig->size;
+ break;
+ }
+ }
+
+compare: /*
+ * If we have a record against which to compare, and
+ * the records compare equal, increment the rle counter
+ * and continue. If the records don't compare equal,
+ * output the last record and swap the last and current
+ * buffers: do NOT update the starting record number,
+ * we've been doing that all along.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ rle += repeat_count;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ /*
+ * We can't simply assign the data values into
+ * the last buffer because they may have come
+ * from a copy built from an encoded/overflow
+ * cell and creating the next record is going
+ * to overwrite that memory. Check, because
+ * encoded/overflow cells aren't that common
+ * and we'd like to avoid the copy. If data
+ * was taken from the current unpack structure
+ * (which points into the page), or was taken
+ * from an update structure, we can just use
+ * the pointers, they're not moving.
+ */
+ if (data == vpack->data || update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
+ }
+ last_deleted = deleted;
+ rle = repeat_count;
+ }
+
+ /*
+ * If we had a reference to an overflow record we never used,
+ * discard the underlying blocks, they're no longer useful.
+ *
+ * One complication: we must cache a copy before discarding the
+ * on-disk version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (ovfl_state == OVFL_UNUSED &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(__wt_ovfl_cache(session, page, upd, vpack));
+ }
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ if (src_recno < n)
+ deleted = 1;
+ else {
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ }
+
+ /*
+ * Handle RLE accounting and comparisons -- see comment
+ * above, this code fragment does the same thing.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ ++rle;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state. We always assign the
+ * data values to the buffer because they can only be
+ * the data from a WT_UPDATE structure.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ last->data = data;
+ last->size = size;
+ }
+ last_deleted = deleted;
+ rle = 1;
+ }
+ }
+
+ /* If we were tracking a record, write it. */
+ if (rle != 0)
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, last_deleted, 0, rle));
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&orig);
+ return (ret);
+}
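+
+/*
+ * For illustration, not from the original source: the last/current tracking
+ * in __rec_col_var turns the in-memory records back into run-length encoded
+ * cells.  Walking the values A, A, B, B, B, <deleted> writes the cell
+ * (A, rle=2) as soon as B is seen, then (B, rle=3) when the deleted record
+ * is seen, and finally a deleted cell with rle=1 when the loop finishes.
+ */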
+
+/*
+ * __rec_row_int --
+ * Reconcile a row-store internal page.
+ */
+static int
+__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_KV *key, *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ size_t size;
+ u_int vtype;
+ int hazard, key_onpage_ovfl, ovfl_key, state;
+ const void *p;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ key = &r->k;
+ kpack = &_kpack;
+ WT_CLEAR(*kpack); /* -Wuninitialized */
+ val = &r->v;
+ vpack = &_vpack;
+ WT_CLEAR(*vpack); /* -Wuninitialized */
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxintlpage));
+
+ /*
+ * Ideally, we'd never store the 0th key on row-store internal pages
+ * because it's never used during tree search and there's no reason
+ * to waste the space. The problem is how we do splits: when we split,
+ * we've potentially picked out several "split points" in the buffer
+ * which is overflowing the maximum page size, and when the overflow
+ * happens, we go back and physically split the buffer, at those split
+ * points, into new pages. It would be both difficult and expensive
+ * to re-process the 0th key at each split point to be an empty key,
+ * so we don't do that. However, we are reconciling an internal page
+ * for whatever reason, and the 0th key is known to be useless. We
+ * truncate the key to a single byte, instead of removing it entirely,
+ * it simplifies various things in other parts of the code (we don't
+ * have to special case transforming the page from its disk image to
+ * its in-memory version, for example).
+ */
+ r->cell_zero = 1;
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * There are different paths if the key is an overflow item vs.
+ * a straight-forward on-page value. If an overflow item, we
+ * would have instantiated it, and we can use that fact to set
+ * things up.
+ *
+ * Note the cell reference and unpacked key cell are available
+ * only in the case of an instantiated, off-page key.
+ */
+ ikey = __wt_ref_key_instantiated(ref);
+ if (ikey == NULL || ikey->cell_offset == 0) {
+ cell = NULL;
+ key_onpage_ovfl = 0;
+ } else {
+ cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ __wt_cell_unpack(cell, kpack);
+ key_onpage_ovfl =
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ }
+
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = ref->addr;
+ child = ref->page;
+ vtype = 0;
+
+ /* Deleted child we don't have to write. */
+ if (state == WT_CHILD_IGNORE) {
+ /*
+ * Overflow keys referencing discarded pages are no
+ * longer useful, schedule them for discard. Don't
+ * worry about instantiation, internal page keys are
+ * always instantiated. Don't worry about reuse,
+ * reusing this key in this reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ }
+
+ /* Deleted child requiring a proxy cell. */
+ if (state == WT_CHILD_PROXY)
+ vtype = WT_CELL_ADDR_DEL;
+
+ /*
+ * Modified child. Empty pages are merged into the parent and
+ * discarded.
+ */
+ if (state == WT_CHILD_MODIFIED)
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Overflow keys referencing empty pages are no
+ * longer useful, schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ /*
+ * Overflow keys referencing split pages are no
+ * longer useful (the split page's key is the
+ * interesting key); schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+
+ WT_ERR(__rec_row_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ /*
+ * If the page is replaced, the page's modify
+ * structure has the page's address.
+ */
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Build the value cell, the child page's address. Addr points
+ * to an on-page cell or an off-page WT_ADDR structure. The
+ * cell type has been set in the case of page deletion requiring
+ * a proxy cell, otherwise use the information from the addr or
+ * original cell.
+ */
+ if (__wt_off_page(page, addr)) {
+ p = addr->addr;
+ size = addr->size;
+ if (vtype == 0)
+ vtype = __rec_vtype(addr);
+ } else {
+ __wt_cell_unpack(ref->addr, vpack);
+ p = vpack->data;
+ size = vpack->size;
+ if (vtype == 0)
+ vtype = vpack->raw;
+ }
+ __rec_cell_build_addr(r, p, size, vtype, 0);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /*
+ * Build key cell.
+ * Truncate any 0th key, internal pages don't need 0th keys.
+ */
+ if (key_onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+ } else {
+ __wt_ref_key(page, ref, &p, &size);
+ WT_ERR(__rec_cell_build_int_key(
+ session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
+ }
+ r->cell_zero = 0;
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+			 * In one path above, we copied the key cell from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session,
+ r->cur, WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+ }
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_row_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *key, *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+ int ovfl_key;
+
+ mod = page->modify;
+
+ key = &r->k;
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Build the key and value cells. */
+ WT_RET(__rec_cell_build_int_key(session, r,
+ WT_IKEY_DATA(multi->key.ikey),
+ r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
+ r->cell_zero = 0;
+
+ addr = &multi->addr;
+ __rec_cell_build_addr(
+ r, addr->addr, addr->size, __rec_vtype(addr), 0);
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+ return (0);
+}
+
+/*
+ * __rec_row_leaf --
+ * Reconcile a row-store leaf page.
+ */
+static int
+__rec_row_leaf(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell, *val_cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_ITEM(tmpkey);
+ WT_DECL_ITEM(tmpval);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_INSERT *ins;
+ WT_KV *key, *val;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ size_t size;
+ uint64_t slvg_skip;
+ uint32_t i;
+ int dictionary, onpage_ovfl, ovfl_key;
+ const void *p;
+ void *copy;
+
+ btree = S2BT(session);
+ slvg_skip = salvage == NULL ? 0 : salvage->skip;
+
+ key = &r->k;
+ val = &r->v;
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxleafpage));
+
+ /*
+ * Write any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
+ WT_RET(__rec_row_leaf_insert(session, r, ins));
+
+ /*
+ * Temporary buffers in which to instantiate any uninstantiated keys
+ * or value items we need.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &tmpkey));
+ WT_RET(__wt_scr_alloc(session, 0, &tmpval));
+
+ /* For each entry in the page... */
+ WT_ROW_FOREACH(page, rip, i) {
+ /*
+ * The salvage code, on some rare occasions, wants to reconcile
+ * a page but skip some leading records on the page. Because
+ * the row-store leaf reconciliation function copies keys from
+ * the original disk page, this is non-trivial -- just changing
+ * the in-memory pointers isn't sufficient, we have to change
+ * the WT_CELL structures on the disk page, too. It's ugly, but
+ * we pass in a value that tells us how many records to skip in
+ * this case.
+ */
+ if (slvg_skip != 0) {
+ --slvg_skip;
+ continue;
+ }
+
+ /*
+ * Figure out the key: set any cell reference (and unpack it),
+ * set any instantiated key reference.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (cell == NULL)
+ kpack = NULL;
+ else {
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ }
+
+ /* Unpack the on-page value cell, and look for an update. */
+ if ((val_cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+ vpack = NULL;
+ else {
+ vpack = &_vpack;
+ __wt_cell_unpack(val_cell, vpack);
+ }
+ WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd));
+
+ /* Build value cell. */
+ dictionary = 0;
+ if (upd == NULL) {
+ /*
+ * When the page was read into memory, there may not
+ * have been a value item.
+ *
+ * If there was a value item, check if it's a dictionary
+ * cell (a copy of another item on the page). If it's a
+ * copy, we have to create a new value item as the old
+ * item might have been discarded from the page.
+ */
+ if (vpack == NULL) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else if (vpack->raw == WT_CELL_VALUE_COPY) {
+ /* If the item is Huffman encoded, decode it. */
+ if (btree->huffman_value == NULL) {
+ p = vpack->data;
+ size = vpack->size;
+ } else {
+ WT_ERR(__wt_huffman_decode(session,
+ btree->huffman_value,
+ vpack->data, vpack->size,
+ tmpval));
+ p = tmpval->data;
+ size = tmpval->size;
+ }
+ WT_ERR(__rec_cell_build_val(
+ session, r, p, size, (uint64_t)0));
+ dictionary = 1;
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ /*
+ * If doing update save and restore in service
+ * of eviction, there's an update that's not
+ * globally visible, and the underlying value
+ * is a removed overflow value, we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring any on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ /*
+ * If the key is also a removed overflow item,
+ * don't write anything at all.
+ *
+ * We don't have to write anything because the
+ * code re-instantiating the page gets the key
+ * to match the saved list of updates from the
+ * original page. By not putting the key on
+ * the page, we'll move the key/value set from
+ * a row-store leaf page slot to an insert list,
+ * but that shouldn't matter.
+ *
+ * The reason we bother with the test is because
+ * overflows are expensive to write. It's hard
+ * to imagine a real workload where this test is
+ * worth the effort, but it's a simple test.
+ */
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
+ goto leaf_insert;
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ WT_ERR(__rec_cell_build_val(
+ session, r, "@", 1, (uint64_t)0));
+ } else {
+ val->buf.data = val_cell;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+
+ /* Track if page has overflow items. */
+ if (vpack->ovfl)
+ r->ovfl_items = 1;
+ }
+ } else {
+ /*
+ * If the original value was an overflow and we've not
+ * already done so, discard it. One complication: we
+ * must cache a copy before discarding the on-disk
+ * version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (vpack != NULL &&
+ vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(
+ __wt_ovfl_cache(session, page, rip, vpack));
+
+ /* If this key/value pair was deleted, we're done. */
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ /*
+ * Overflow keys referencing discarded values
+ * are no longer useful, discard the backing
+ * blocks. Don't worry about reuse, reusing
+ * keys from a row-store page reconciliation
+ * seems unlikely enough to ignore.
+ */
+ if (kpack != NULL && kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM) {
+ /*
+ * Keys are part of the name-space, we
+ * can't remove them from the in-memory
+ * tree; if an overflow key was deleted
+ * without being instantiated (for
+					 * example, cursor-based truncation),
+					 * do it now.
+ */
+ if (ikey == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session,
+ page, rip, tmpkey, 1));
+
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ }
+
+ /*
+ * We aren't actually creating the key so we
+ * can't use bytes from this key to provide
+ * prefix information for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Proceed with appended key/value pairs. */
+ goto leaf_insert;
+ }
+
+ /*
+ * If no value, nothing needs to be copied. Otherwise,
+ * build the value's WT_CELL chunk from the most recent
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size,
+ (uint64_t)0));
+ dictionary = 1;
+ }
+ }
+
+ /*
+ * Build key cell.
+ *
+ * If the key is an overflow key that hasn't been removed, use
+ * the original backing blocks.
+ */
+ onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ if (onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+
+ /*
+ * We aren't creating a key so we can't use this key as
+ * a prefix for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+ } else {
+ /*
+ * Get the key from the page or an instantiated key, or
+ * inline building the key from a previous key (it's a
+ * fast path for simple, prefix-compressed keys), or by
+			 * building the key from scratch.
+ */
+ if (__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &tmpkey->data, &tmpkey->size))
+ goto build;
+
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ if (btree->huffman_key == NULL &&
+ kpack->type == WT_CELL_KEY &&
+ tmpkey->size >= kpack->prefix) {
+ /*
+ * The previous clause checked for a prefix of
+ * zero, which means the temporary buffer must
+ * have a non-zero size, and it references a
+ * valid key.
+ */
+ WT_ASSERT(session, tmpkey->size != 0);
+
+ /*
+				 * Grow the buffer as necessary, ensuring the
+				 * data has been copied into local buffer space,
+ * then append the suffix to the prefix already
+ * in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy
+ * data we don't need, truncate the item's data
+ * length to the prefix bytes.
+ */
+ tmpkey->size = kpack->prefix;
+ WT_ERR(__wt_buf_grow(session,
+ tmpkey, tmpkey->size + kpack->size));
+ memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
+ kpack->data, kpack->size);
+ tmpkey->size += kpack->size;
+ } else
+ WT_ERR(__wt_row_leaf_key_copy(
+ session, page, rip, tmpkey));
+build:
+ WT_ERR(__rec_cell_build_leaf_key(session, r,
+ tmpkey->data, tmpkey->size, &ovfl_key));
+ }
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+			 * In one path above, we copied the key cell from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, kpack, r->cur));
+ onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+
+ /*
+			 * Turn off prefix compression until a full key is
+			 * written to the new page, and (unless we're already
+			 * working with an overflow key), rebuild the key
+			 * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_ERR(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (dictionary && btree->dictionary)
+ WT_ERR(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ }
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&tmpkey);
+ __wt_scr_free(&tmpval);
+ return (ret);
+}
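+
+/*
+ * For illustration, not from the original source: the prefix-compression fast
+ * path above rebuilds a key from the previous one.  If the previous key in
+ * tmpkey is "employee-00017" and the on-page cell stores prefix=13 with the
+ * suffix "8", the buffer is truncated to 13 bytes and the suffix is appended,
+ * producing "employee-00018" without calling the general key-build code.
+ */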
+
+/*
+ * __rec_row_leaf_insert --
+ * Walk an insert chain, writing K/V pairs.
+ */
+static int
+__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
+{
+ WT_BTREE *btree;
+ WT_KV *key, *val;
+ WT_UPDATE *upd;
+ int ovfl_key;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+ val = &r->v;
+
+ for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
+ /* Look for an update. */
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ if (upd->size == 0) /* Build value cell. */
+ val->len = 0;
+ else
+ WT_RET(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size, (uint64_t)0));
+
+ /* Build key cell. */
+ WT_RET(__rec_cell_build_leaf_key(session, r,
+ WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_RET(__rec_split_raw(session, r));
+ continue;
+ }
+ WT_RET(__rec_split(session, r));
+
+ /*
+			 * Turn off prefix compression until a full key is
+			 * written to the new page, and (unless we're already
+			 * working with an overflow key), rebuild the key
+			 * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_split_discard --
+ * Discard the pages resulting from a previous split.
+ */
+static int
+__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+ WT_MULTI *multi;
+ uint32_t i;
+
+ bm = S2BT(session)->bm;
+ mod = page->modify;
+
+ /*
+ * A page that split is being reconciled for the second, or subsequent
+ * time; discard underlying block space used in the last reconciliation
+ * that is not being reused for this reconciliation.
+ */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __wt_free(session, multi->key.ikey);
+ break;
+ }
+ if (multi->skip == NULL) {
+ if (multi->addr.reuse)
+ multi->addr.addr = NULL;
+ else {
+ WT_RET(bm->free(bm, session,
+ multi->addr.addr, multi->addr.size));
+ __wt_free(session, multi->addr.addr);
+ }
+ } else {
+ __wt_free(session, multi->skip);
+ __wt_free(session, multi->skip_dsk);
+ }
+ }
+ __wt_free(session, mod->mod_multi);
+ mod->mod_multi_entries = 0;
+
+ /*
+ * This routine would be trivial, and only walk a single page freeing
+ * any blocks written to support the split, except for root splits.
+ * In the case of root splits, we have to cope with multiple pages in
+ * a linked list, and we also have to discard overflow items written
+ * for the page.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ if (mod->mod_root_split == NULL)
+ break;
+ WT_RET(__rec_split_discard(session, mod->mod_root_split));
+ WT_RET(__wt_ovfl_track_wrapup(session, mod->mod_root_split));
+ __wt_page_out(session, &mod->mod_root_split);
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rec_write_wrapup --
+ * Finish the reconciliation.
+ */
+static int
+__rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ mod = page->modify;
+ ref = r->ref;
+
+ /*
+ * This page may have previously been reconciled, and that information
+ * is now about to be replaced. Make sure it's discarded at some point,
+ * and clear the underlying modification information, we're creating a
+ * new reality.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case 0: /*
+ * The page has never been reconciled before, free the original
+ * address blocks (if any). The "if any" is for empty trees
+ * created when a new tree is opened or previously deleted pages
+ * instantiated in memory.
+ *
+		 * The exception is root pages, which are never tracked or
+		 * free'd; they are checkpoints and must be explicitly dropped.
+ */
+ if (__wt_ref_is_root(ref))
+ break;
+ if (ref->addr != NULL) {
+ /*
+ * Free the page and clear the address (so we don't free
+ * it twice).
+ */
+ WT_RET(__wt_ref_info(
+ session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(
+ session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
+ break;
+ case WT_PM_REC_EMPTY: /* Page deleted */
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /*
+ * Discard the multiple replacement blocks.
+ */
+ WT_RET(__rec_split_discard(session, page));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ /*
+ * Discard the replacement leaf page's blocks.
+ *
+		 * The exception is root pages, which are never tracked or
+		 * free'd; they are checkpoints and must be explicitly dropped.
+ */
+ if (!__wt_ref_is_root(ref))
+ WT_RET(bm->free(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size));
+
+ /* Discard the replacement page's address. */
+ __wt_free(session, mod->mod_replace.addr);
+ mod->mod_replace.size = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_CLR(mod, WT_PM_REC_MASK);
+
+ /*
+ * Wrap up overflow tracking. If we are about to create a checkpoint,
+ * the system must be entirely consistent at that point (the underlying
+ * block manager is presumably going to do some action to resolve the
+ * list of allocated/free/whatever blocks that are associated with the
+ * checkpoint).
+ */
+ WT_RET(__wt_ovfl_track_wrapup(session, page));
+
+ switch (r->bnd_next) {
+ case 0: /* Page delete */
+ WT_RET(__wt_verbose(
+ session, WT_VERB_RECONCILE, "page %p empty", page));
+ WT_STAT_FAST_DATA_INCR(session, rec_page_delete);
+
+ /* If this is the root page, we need to create a sync point. */
+ ref = r->ref;
+ if (__wt_ref_is_root(ref))
+ WT_RET(
+ bm->checkpoint(bm, session, NULL, btree->ckpt, 0));
+
+ /*
+ * If the page was empty, we want to discard it from the tree
+ * by discarding the parent's key when evicting the parent.
+ * Mark the page as deleted, then return success, leaving the
+ * page in memory. If the page is subsequently modified, that
+ * is OK, we'll just reconcile it again.
+ */
+ F_SET(mod, WT_PM_REC_EMPTY);
+ break;
+ case 1: /* 1-for-1 page swap */
+ /*
+ * Because WiredTiger's pages grow without splitting, we're
+ * replacing a single page with another single page most of
+ * the time.
+ */
+ bnd = &r->bnd[0];
+
+ /*
+ * If we're saving/restoring changes for this page, there's
+ * nothing to write. Allocate, then initialize the array of
+ * replacement blocks.
+ */
+ if (bnd->skip != NULL) {
+ WT_RET(__wt_calloc_def(
+ session, r->bnd_next, &mod->mod_multi));
+ multi = mod->mod_multi;
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ mod->mod_multi_entries = 1;
+
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If this is a root page, then we don't have an address and we
+ * have to create a sync point. The address was cleared when
+ * we were about to write the buffer so we know what to do here.
+ */
+ if (bnd->addr.addr == NULL)
+ WT_RET(__wt_bt_write(session,
+ &r->dsk, NULL, NULL, 1, bnd->already_compressed));
+ else {
+ mod->mod_replace = bnd->addr;
+ bnd->addr.addr = NULL;
+ }
+
+ F_SET(mod, WT_PM_REC_REPLACE);
+ break;
+ default: /* Page split */
+ WT_RET(__wt_verbose(session, WT_VERB_RECONCILE,
+ "page %p reconciled into %" PRIu32 " pages",
+ page, r->bnd_next));
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_STAT_FAST_DATA_INCR(
+ session, rec_multiblock_internal);
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ WT_STAT_FAST_DATA_INCR(session, rec_multiblock_leaf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Display the actual split keys. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) {
+ WT_DECL_ITEM(tkey);
+ WT_DECL_RET;
+ uint32_t i;
+
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &tkey));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__wt_buf_set_printable(
+ session, tkey,
+ bnd->key.data, bnd->key.size));
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting key "
+ "%.*s",
+ (int)tkey->size,
+ (const char *)tkey->data));
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting recno %" PRIu64,
+ bnd->recno));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+err: __wt_scr_free(&tkey);
+ WT_RET(ret);
+ }
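+ /*
+ * Track the maximum number of replacement blocks created and
+ * report it in the rec_multiblock_max statistic.
+ */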
+ if (r->bnd_next > r->bnd_next_max) {
+ r->bnd_next_max = r->bnd_next;
+ WT_STAT_FAST_DATA_SET(
+ session, rec_multiblock_max, r->bnd_next_max);
+ }
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__rec_split_row(session, r, page));
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_RET(__rec_split_col(session, r, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If updates were skipped, the tree isn't clean. The checkpoint call
+ * cleared the tree's modified value before calling the eviction thread,
+ * so we must explicitly reset the tree's modified flag. We insert a
+ * barrier after the change for clarity (the requirement is the value
+ * be set before a subsequent checkpoint reads it, and because the
+ * current checkpoint is waiting on this reconciliation to complete,
+ * there's no risk of that happening).
+ *
+ * Otherwise, if no updates were skipped, we have a new maximum
+ * transaction written for the page (used to decide if a clean page can
+ * be evicted). The page is only possibly clean: if the write
+ * generation is unchanged since reconciliation started, clear it and
+ * update the cache dirty statistics; if the write generation changed,
+ * the page has been modified since we started reconciliation and
+ * cannot be discarded.
+ */
+ if (r->leave_dirty) {
+ mod->first_dirty_txn = r->skipped_txn;
+
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+ } else {
+ mod->rec_max_txn = r->max_txn;
+
+ if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_write_wrapup_err --
+ * Finish the reconciliation on error.
+ */
+static int
+__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ bm = S2BT(session)->bm;
+ mod = page->modify;
+
+ /*
+ * Clear the address-reused flag from the multiblock reconciliation
+ * information (otherwise we might think the backing block is being
+ * reused on a subsequent reconciliation where we want to free it).
+ */
+ if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i)
+ multi->addr.reuse = 0;
+
+ /*
+ * On error, discard blocks we've written, they're unreferenced by the
+ * tree. This is not a question of correctness, we're avoiding block
+ * leaks.
+ *
+ * Don't discard backing blocks marked for reuse, they remain part of
+ * a previous reconciliation.
+ */
+ WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ if (bnd->addr.addr != NULL) {
+ if (bnd->addr.reuse)
+ bnd->addr.addr = NULL;
+ else {
+ WT_TRET(bm->free(bm, session,
+ bnd->addr.addr, bnd->addr.size));
+ __wt_free(session, bnd->addr.addr);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __rec_split_row --
+ * Split a row-store page into a set of replacement blocks.
+ */
+static int
+__rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BOUNDARY *bnd;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ uint32_t i;
+ size_t size;
+ void *p;
+
+ mod = page->modify;
+
+ /* We never set the first page's key, grab it from the original page. */
+ ref = r->ref;
+ if (__wt_ref_is_root(ref))
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
+ else {
+ __wt_ref_key(ref->home, ref, &p, &size);
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size));
+ }
+
+ /* Allocate, then initialize the array of replacement blocks. */
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
+
+ for (multi = mod->mod_multi,
+ bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
+ WT_RET(__wt_row_ikey(session, 0,
+ bnd->key.data, bnd->key.size, &multi->key.ikey));
+
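+ /*
+ * If the block was written, save its address; otherwise, save
+ * the skipped-update list and the disk image so the changes
+ * can be restored later.
+ */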
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ bnd->addr.addr = NULL;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
+ }
+ mod->mod_multi_entries = r->bnd_next;
+
+ return (0);
+}
+
+/*
+ * __rec_split_col --
+ * Split a column-store page into a set of replacement blocks.
+ */
+static int
+__rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BOUNDARY *bnd;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /* Allocate, then initialize the array of replacement blocks. */
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
+
+ for (multi = mod->mod_multi,
+ bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
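+ /* Column-store replacement blocks are keyed by starting recno. */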
+ multi->key.recno = bnd->recno;
+
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ bnd->addr.addr = NULL;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
+ }
+ mod->mod_multi_entries = r->bnd_next;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_int_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store internal page.
+ */
+static int
+__rec_cell_build_int_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ /* Copy the bytes into the "current" and key buffers. */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+ WT_RET(__wt_buf_set(session, &key->buf, data, size));
+
+ /* Create an overflow object if the data won't fit. */
+ if (size > btree->maxintlitem) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+
+ key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_leaf_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store leaf page.
+ */
+static int
+__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+ size_t pfx_max;
+ uint8_t pfx;
+ const uint8_t *a, *b;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ pfx = 0;
+ if (data == NULL)
+ /*
+ * When data is NULL, our caller has a prefix compressed key
+ * they can't use (probably because they just crossed a split
+ * point). Use the full key saved when last called, instead.
+ */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, r->cur->data, r->cur->size));
+ else {
+ /*
+ * Save a copy of the key for later reference: we use the full
+ * key for prefix-compression comparisons, and if we are, for
+ * any reason, unable to use the compressed key we generate.
+ */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+
+ /*
+ * Do prefix compression on the key. We know by definition the
+ * previous key sorts before the current key, which means the
+ * keys must differ and we just need to compare up to the
+ * shorter of the two keys.
+ */
+ if (r->key_pfx_compress) {
+ /*
+ * We can't compress out more than 256 bytes, limit the
+ * comparison to that.
+ */
+ pfx_max = UINT8_MAX;
+ if (size < pfx_max)
+ pfx_max = size;
+ if (r->last->size < pfx_max)
+ pfx_max = r->last->size;
+ for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
+ if (*a++ != *b++)
+ break;
+
+ /*
+ * Prefix compression may cost us CPU and memory when
+ * the page is re-loaded, don't do it unless there's
+ * reasonable gain.
+ */
+ if (pfx < btree->prefix_compression_min)
+ pfx = 0;
+ else
+ WT_STAT_FAST_DATA_INCRV(
+ session, rec_prefix_compression, pfx);
+ }
+
+ /* Copy the non-prefix bytes into the key buffer. */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, (uint8_t *)data + pfx, size - pfx));
+ }
+
+ /* Optionally compress the key using the Huffman engine. */
+ if (btree->huffman_key != NULL)
+ WT_RET(__wt_huffman_encode(session, btree->huffman_key,
+ key->buf.data, (uint32_t)key->buf.size, &key->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (key->buf.size > btree->maxleafitem) {
+ /*
+ * Overflow objects aren't prefix compressed -- rebuild any
+ * object that was prefix compressed.
+ */
+ if (pfx == 0) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_leaf);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+ return (
+ __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
+ }
+
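+ /*
+ * Pack the (possibly prefix-compressed) key into a cell; the key's
+ * on-page size is the cell descriptor plus the remaining key bytes.
+ */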
+ key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_addr --
+ * Process an address reference and return a cell structure to be stored
+ * on the page.
+ */
+static void
+__rec_cell_build_addr(WT_RECONCILE *r,
+ const void *addr, size_t size, u_int cell_type, uint64_t recno)
+{
+ WT_KV *val;
+
+ val = &r->v;
+
+ /*
+ * We don't check the address size because we can't store an address on
+ * an overflow page: if the address won't fit, the overflow page's
+ * address won't fit either. This possibility must be handled by Btree
+ * configuration: we have to disallow internal page sizes that are too
+ * small with respect to the largest address cookie the underlying block
+ * manager might return.
+ */
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = addr;
+ val->buf.size = size;
+ val->cell_len =
+ __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+}
+
+/*
+ * __rec_cell_build_val --
+ * Process a data item and return a WT_CELL structure and byte string to
+ * be stored on the page.
+ */
+static int
+__rec_cell_build_val(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = data;
+ val->buf.size = size;
+
+ /* Handle zero-length cells quickly. */
+ if (size != 0) {
+ /* Optionally compress the data using the Huffman engine. */
+ if (btree->huffman_value != NULL)
+ WT_RET(__wt_huffman_encode(
+ session, btree->huffman_value,
+ val->buf.data, (uint32_t)val->buf.size, &val->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (val->buf.size > btree->maxleafitem) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_value);
+
+ return (__rec_cell_build_ovfl(
+ session, r, val, WT_CELL_VALUE_OVFL, rle));
+ }
+ }
+ val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_ovfl --
+ * Store overflow items in the file, returning the address cookie.
+ */
+static int
+__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ size_t size;
+ uint8_t *addr, buf[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ page = r->page;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ /*
+ * See if this overflow record has already been written and reuse it if
+ * possible; otherwise, write a new overflow record.
+ */
+ if (!__wt_ovfl_reuse_search(session, page,
+ &addr, &size, kv->buf.data, kv->buf.size)) {
+ /* Allocate a buffer big enough to write the overflow record. */
+ size = kv->buf.size;
+ WT_RET(bm->write_size(bm, session, &size));
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+
+ /* Initialize the buffer: disk header and overflow record. */
+ dsk = tmp->mem;
+ memset(dsk, 0, WT_PAGE_HEADER_SIZE);
+ dsk->type = WT_PAGE_OVFL;
+ dsk->u.datalen = (uint32_t)kv->buf.size;
+ memcpy(WT_PAGE_HEADER_BYTE(btree, dsk),
+ kv->buf.data, kv->buf.size);
+ dsk->mem_size = tmp->size =
+ WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;
+
+ /* Write the buffer. */
+ addr = buf;
+ WT_ERR(__wt_bt_write(session, tmp, addr, &size, 0, 0));
+
+ /*
+ * Track the overflow record (unless it's a bulk load, which
+ * by definition won't ever reuse a record).
+ */
+ if (!r->is_bulk_load)
+ WT_ERR(__wt_ovfl_reuse_add(session, page,
+ addr, size, kv->buf.data, kv->buf.size));
+ }
+
+ /* Set the caller's K/V to reference the overflow record's address. */
+ WT_ERR(__wt_buf_set(session, &kv->buf, addr, size));
+
+ /* Build the cell and return. */
+ kv->cell_len = __wt_cell_pack_ovfl(&kv->cell, type, rle, kv->buf.size);
+ kv->len = kv->cell_len + kv->buf.size;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * The dictionary --
+ * The rest of this file is support for dictionaries.
+ *
+ * It's difficult to write generic skiplist functions without turning a single
+ * memory allocation into two, or requiring a function call instead of a simple
+ * comparison. Fortunately, skiplists are relatively simple things and we can
+ * include them in-place. If you need generic skip-list functions to modify,
+ * this set wouldn't be a bad place to start.
+ *
+ * __rec_dictionary_skip_search --
+ * Search a dictionary skiplist.
+ */
+static WT_DICTIONARY *
+__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
+{
+ WT_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ if ((*e)->hash == hash) /* Exact match */
+ return (*e);
+ if ((*e)->hash > hash) { /* Drop down a level */
+ --i;
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __rec_dictionary_skip_search_stack --
+ * Search a dictionary skiplist, returning an insert/remove stack.
+ */
+static void
+__rec_dictionary_skip_search_stack(
+ WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
+{
+ WT_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
+ if (*e == NULL || (*e)->hash > hash)
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+}
+
+/*
+ * __rec_dictionary_skip_insert --
+ * Insert an entry into the dictionary skip-list.
+ */
+static void
+__rec_dictionary_skip_insert(
+ WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
+{
+ WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Insert the new entry into the skiplist. */
+ __rec_dictionary_skip_search_stack(head, stack, hash);
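+ /* Link the new entry into each level, up to its chosen depth. */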
+ for (i = 0; i < e->depth; ++i) {
+ e->next[i] = *stack[i];
+ *stack[i] = e;
+ }
+}
+
+/*
+ * __rec_dictionary_init --
+ * Allocate and initialize the dictionary.
+ */
+static int
+__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
+{
+ u_int depth, i;
+
+ /* Free any previous dictionary. */
+ __rec_dictionary_free(session, r);
+
+ r->dictionary_slots = slots;
+ WT_RET(__wt_calloc(session,
+ r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
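+ /*
+ * Each entry gets its own skiplist depth; the per-level next
+ * pointers are allocated at the end of the structure.
+ */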
+ for (i = 0; i < r->dictionary_slots; ++i) {
+ depth = __wt_skip_choose_depth(session);
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *),
+ &r->dictionary[i]));
+ r->dictionary[i]->depth = depth;
+ }
+ return (0);
+}
+
+/*
+ * __rec_dictionary_free --
+ * Free the dictionary.
+ */
+static void
+__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ u_int i;
+
+ if (r->dictionary == NULL)
+ return;
+
+ /*
+ * We don't correct dictionary_slots when we fail during allocation,
+ * but that's OK, the value is either NULL or a memory reference to
+ * be free'd.
+ */
+ for (i = 0; i < r->dictionary_slots; ++i)
+ __wt_free(session, r->dictionary[i]);
+ __wt_free(session, r->dictionary);
+}
+
+/*
+ * __rec_dictionary_reset --
+ * Reset the dictionary when reconciliation restarts and when crossing a
+ * page boundary (a potential split).
+ */
+static void
+__rec_dictionary_reset(WT_RECONCILE *r)
+{
+ if (r->dictionary_slots) {
+ r->dictionary_next = 0;
+ memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
+ }
+}
+
+/*
+ * __rec_dictionary_lookup --
+ * Check the dictionary for a matching value on this page.
+ */
+static int
+__rec_dictionary_lookup(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp)
+{
+ WT_DICTIONARY *dp, *next;
+ uint64_t hash;
+ int match;
+
+ *dpp = NULL;
+
+ /* Search the dictionary, and return any match we find. */
+ hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
+ for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
+ dp != NULL && dp->hash == hash; dp = dp->next[0]) {
+ WT_RET(__wt_cell_pack_data_match(
+ dp->cell, &val->cell, val->buf.data, &match));
+ if (match) {
+ WT_STAT_FAST_DATA_INCR(session, rec_dictionary);
+ *dpp = dp;
+ return (0);
+ }
+ }
+
+ /*
+ * We're not doing value replacement in the dictionary. We stop adding
+ * new entries if we run out of empty dictionary slots (but continue to
+ * use the existing entries). I can't think of any reason a leaf page
+ * value is more likely to be seen because it was seen more recently
+ * than some other value: if we find working sets where that's not the
+ * case, it shouldn't be too difficult to maintain a pointer to the
+ * next dictionary slot to re-use.
+ */
+ if (r->dictionary_next >= r->dictionary_slots)
+ return (0);
+
+ /*
+ * Set the hash value, we'll add this entry into the dictionary when we
+ * write it into the page's disk image buffer (because that's when we
+ * know where on the page it will be written).
+ */
+ next = r->dictionary[r->dictionary_next++];
+ next->cell = NULL; /* Not necessary, just cautious. */
+ next->hash = hash;
+ __rec_dictionary_skip_insert(r->dictionary_head, next, hash);
+ *dpp = next;
+ return (0);
+}