Diffstat (limited to 'src/reconcile/rec_write.c')
-rw-r--r-- | src/reconcile/rec_write.c | 707
1 files changed, 444 insertions, 263 deletions
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 6f95b84d292..1c266496ec8 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -25,12 +25,25 @@ typedef struct { WT_PAGE *page; uint32_t flags; /* Caller's configuration */ - WT_ITEM disk_image; /* Temporary disk-image buffer */ /* - * Temporary buffer used to write out a disk image when managing two - * chunks worth of data in memory + * Reconciliation can end up requiring two temporary disk image buffers + * if a page split is involved. These two disk images are pointed to by + * current and the previous image pointers. During initialization the + * first image is allocated and pointed to by the current image pointer. + * If and when a split is involved the second image gets allocated and + * is pointed to by the current image pointer. The previous image + * pointer is made to refer the first image at this point. Two images + * are kept in memory to redistribute data among them in case the last + * split chunk ends up being smaller than the minimum required. As + * reconciliation generates more split chunks, the image referred to by + * the previous image pointer is written to the disk, the current and + * the previous image pointers are swapped, making space for another + * split chunk to be reconciled in the buffer that was just written out + * to the disk. */ - WT_ITEM *interim_buf; + WT_ITEM disk_image[2]; /* Temporary disk-image buffers */ + WT_ITEM *cur_img_ptr; + WT_ITEM *prev_img_ptr; /* * Track start/stop write generation to decide if all changes to the @@ -48,9 +61,9 @@ typedef struct { /* Track the page's maximum transaction ID. */ uint64_t max_txn; - /* Track if all updates were skipped. */ - uint64_t update_cnt; - uint64_t update_skip_cnt; + uint64_t update_mem_all; /* Total update memory size */ + uint64_t update_mem_saved; /* Saved update memory size */ + uint64_t update_mem_uncommitted;/* Uncommitted update memory size */ /* * When we can't mark the page clean (for example, checkpoint found some @@ -146,17 +159,6 @@ typedef struct { * that references all of our split pages. */ struct __rec_boundary { - /* - * Offset is the byte offset in the initial split buffer of the - * first byte of the split chunk, recorded before we decide to - * split the page; the difference between chunk[1]'s offset and - * chunk[0]'s offset is chunk[0]'s length. - * - * Once we split a page, we stop filling in offset values, we're - * writing the split chunks as we find them. 
- */ - size_t offset; /* Split's first byte */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -338,7 +340,8 @@ static int __rec_split_write(WT_SESSION_IMPL *, WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, bool); static int __rec_update_las( WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *); -static int __rec_write_check_complete(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_write_check_complete( + WT_SESSION_IMPL *, WT_RECONCILE *, bool *); static int __rec_write_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); @@ -351,6 +354,7 @@ static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); static int __rec_dictionary_lookup( WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **); static void __rec_dictionary_reset(WT_RECONCILE *); +static void __rec_verbose_lookaside_write(WT_SESSION_IMPL *); /* * __wt_reconcile -- @@ -386,7 +390,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * In-memory splits: reconciliation of an internal page cannot handle * a child page splitting during the reconciliation. */ - __wt_writelock(session, &page->page_lock); + WT_PAGE_LOCK(session, page); oldest_id = __wt_txn_oldest_id(session); if (LF_ISSET(WT_EVICTING)) @@ -405,7 +409,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Initialize the reconciliation structure for each new run. */ if ((ret = __rec_write_init( session, ref, flags, salvage, &session->reconcile)) != 0) { - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); return (ret); } r = session->reconcile; @@ -437,7 +441,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Checks for a successful reconciliation. */ if (ret == 0) - ret = __rec_write_check_complete(session, r); + ret = __rec_write_check_complete(session, r, lookaside_retryp); /* Wrap up the page reconciliation. */ if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0) @@ -446,15 +450,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(__rec_write_wrapup_err(session, r, page)); /* Release the reconciliation lock. */ - __wt_writeunlock(session, &page->page_lock); - - /* - * If our caller can configure lookaside table reconciliation, flag if - * that's worth trying. The lookaside table doesn't help if we skipped - * updates, it can only help with older readers preventing eviction. - */ - if (lookaside_retryp != NULL && r->update_cnt == r->update_skip_cnt) - *lookaside_retryp = true; + WT_PAGE_UNLOCK(session, page); /* Update statistics. 
*/ WT_STAT_CONN_INCR(session, rec_pages); @@ -526,10 +522,8 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, static inline bool __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_CONNECTION_IMPL *conn; WT_BTREE *btree; - conn = S2C(session); btree = S2BT(session); /* @@ -550,7 +544,8 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) return (false); if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen && - r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen && + r->orig_txn_checkpoint_gen == + __wt_gen(session, WT_GEN_CHECKPOINT) && r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen) return (false); return (true); @@ -558,13 +553,21 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * __rec_write_check_complete -- - * Check that reconciliation should complete + * Check that reconciliation should complete. */ static int -__rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_write_check_complete( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *lookaside_retryp) { - WT_BOUNDARY *bnd; - size_t i; + /* + * Tests in this function are lookaside tests and tests to decide if + * rewriting a page in memory is worth doing. In-memory configurations + * can't use a lookaside table, and we ignore page rewrite desirability + * checks for in-memory eviction because a small cache can force us to + * rewrite every possible page. + */ + if (F_ISSET(r, WT_EVICT_IN_MEMORY)) + return (0); /* * If we have used the lookaside table, check for a lookaside table and @@ -574,19 +577,62 @@ __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); /* - * If we are doing update/restore based eviction, confirm part of the - * page is being discarded, or at least 10% of the updates won't have - * to be re-instantiated. Otherwise, it isn't progress, don't bother. + * Eviction can configure lookaside table reconciliation, consider if + * it's worth giving up this reconciliation attempt and falling back to + * using the lookaside table. We continue with evict/restore if + * switching to the lookaside doesn't make sense for any reason: we + * won't retry an evict/restore reconciliation until/unless the + * transactional system moves forward, so at worst it's a single wasted + * effort. + * + * First, check if the lookaside table is a possible alternative. */ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { - for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) - if (bnd->supd == NULL) - break; - if (i == r->bnd_entries && - r->update_cnt / 10 >= r->update_skip_cnt) - return (EBUSY); - } - return (0); + if (lookaside_retryp == NULL) + return (0); + + /* + * We only suggest lookaside if currently in an evict/restore attempt + * and some updates were saved. Our caller sets the evict/restore flag + * based on various conditions (like if this is a leaf page), which is + * why we're testing that flag instead of a set of other conditions. + * If no updates were saved, eviction will succeed without needing to + * restore anything. + */ + if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE) || r->bnd->supd == NULL) + return (0); + + /* + * Check if this reconciliation attempt is making progress. If there's + * any sign of progress, don't fall back to the lookaside table. + * + * Check if the current reconciliation split, in which case we'll + * likely get to write at least one of the blocks. If that page is + * empty, that's also progress. 
+ */ + if (r->bnd_next != 1) + return (0); + + /* + * Check if the current reconciliation applied some updates, in which + * case evict/restore should gain us some space. + */ + if (r->update_mem_saved != r->update_mem_all) + return (0); + + /* + * Check if lookaside eviction is possible. If any of the updates we + * saw were uncommitted, the lookaside table cannot be used: it only + * helps with older readers preventing eviction. + */ + if (r->update_mem_uncommitted != 0) + return (0); + + /* + * The current evict/restore approach shows no signs of being useful, + * lookaside is possible, suggest the lookaside table. + */ + *lookaside_retryp = true; + return (EBUSY); } /* @@ -810,12 +856,10 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep) { WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_PAGE *page; WT_RECONCILE *r; btree = S2BT(session); - conn = S2C(session); page = ref->page; if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { @@ -829,7 +873,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->last = &r->_last; /* Disk buffers need to be aligned for writing. */ - F_SET(&r->disk_image, WT_ITEM_ALIGNED); + F_SET(&r->disk_image[0], WT_ITEM_ALIGNED); + F_SET(&r->disk_image[1], WT_ITEM_ALIGNED); } /* Reconciliation is not re-entrant, make sure that doesn't happen. */ @@ -845,7 +890,7 @@ __rec_write_init(WT_SESSION_IMPL *session, * These are all ordered reads, but we only need one. */ r->orig_btree_checkpoint_gen = btree->checkpoint_gen; - r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen; + r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); /* @@ -891,7 +936,7 @@ __rec_write_init(WT_SESSION_IMPL *session, r->max_txn = WT_TXN_NONE; /* Track if all updates were skipped. */ - r->update_cnt = r->update_skip_cnt = 0; + r->update_mem_all = r->update_mem_saved = r->update_mem_uncommitted = 0; /* Track if the page can be marked clean. */ r->leave_dirty = false; @@ -974,8 +1019,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) return; *(WT_RECONCILE **)reconcilep = NULL; - __wt_buf_free(session, &r->disk_image); - __wt_scr_free(session, &r->interim_buf); + __wt_buf_free(session, &r->disk_image[0]); + __wt_buf_free(session, &r->disk_image[1]); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1115,7 +1160,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_DECL_ITEM(tmp); WT_PAGE *page; WT_UPDATE *append, *upd, *upd_list; - size_t notused; + size_t notused, update_mem; uint64_t max_txn, min_txn, txnid; bool append_origv, skipped; @@ -1136,36 +1181,62 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } else upd_list = ins->upd; - ++r->update_cnt; - for (skipped = false, - max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, - upd = upd_list; upd != NULL; upd = upd->next) { - if ((txnid = upd->txnid) == WT_TXN_ABORTED) - continue; + skipped = false; + update_mem = 0; + max_txn = WT_TXN_NONE; + min_txn = UINT64_MAX; - /* Track the largest/smallest transaction IDs on the list. */ - if (WT_TXNID_LT(max_txn, txnid)) - max_txn = txnid; - if (WT_TXNID_LT(txnid, min_txn)) - min_txn = txnid; + if (F_ISSET(r, WT_EVICTING)) { + /* Discard obsolete updates. */ + if ((upd = __wt_update_obsolete_check( + session, page, upd_list->next)) != NULL) + __wt_update_obsolete_free(session, page, upd); + + for (upd = upd_list; upd != NULL; upd = upd->next) { + /* Track the total memory in the update chain. 
*/ + update_mem += WT_UPDATE_MEMSIZE(upd); + + if ((txnid = upd->txnid) == WT_TXN_ABORTED) + continue; - /* - * Find the first update we can use. - */ - if (F_ISSET(r, WT_EVICTING)) { /* + * Track the largest/smallest transaction IDs on the + * list. + */ + if (WT_TXNID_LT(max_txn, txnid)) + max_txn = txnid; + if (WT_TXNID_LT(txnid, min_txn)) + min_txn = txnid; + + /* + * Find the first update we can use. + * * Eviction can write any committed update. * * When reconciling for eviction, track whether any * uncommitted updates are found. + * + * When reconciling for eviction, track the memory held + * by the update chain. */ if (__wt_txn_committed(session, txnid)) { if (*updp == NULL) *updp = upd; } else skipped = true; - } else { + } + } else + for (upd = upd_list; upd != NULL; upd = upd->next) { + if ((txnid = upd->txnid) == WT_TXN_ABORTED) + continue; + + /* Track the largest transaction ID on the list. */ + if (WT_TXNID_LT(max_txn, txnid)) + max_txn = txnid; + /* + * Find the first update we can use. + * * Checkpoint can only write updates visible as of its * snapshot. * @@ -1180,7 +1251,12 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, skipped = true; } } - } + + /* Reconciliation should never see a reserved update. */ + WT_ASSERT(session, + *updp == NULL || (*updp)->type != WT_UPDATE_RESERVED); + + r->update_mem_all += update_mem; /* * If all of the updates were aborted, quit. This test is not strictly @@ -1227,12 +1303,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, txnid != S2C(session)->txn_global.checkpoint_txnid || WT_SESSION_IS_CHECKPOINT(session)); #endif - - /* - * Track how many update chains we saw vs. how many update - * chains had an entry we skipped. - */ - ++r->update_skip_cnt; return (0); } @@ -1276,6 +1346,23 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) return (EBUSY); + /* + * Track the memory required by the update chain. + * + * A page with no uncommitted (skipped) updates, that can't be evicted + * because some updates aren't yet globally visible, can be evicted by + * writing previous versions of the updates to the lookaside file. That + * test is just checking if the skipped updates memory is zero. + * + * If that's not possible (there are skipped updates), we can rewrite + * the pages in-memory, but we don't want to unless there's memory to + * recover. That test is comparing the memory we'd recover to the memory + * we'd have to re-instantiate as part of the rewrite. + */ + r->update_mem_saved += update_mem; + if (skipped) + r->update_mem_uncommitted += update_mem; + append_origv = false; if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { /* @@ -1353,14 +1440,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * place a deleted record at the end of the update list. 
*/ if (vpack == NULL || vpack->type == WT_CELL_DEL) - WT_RET(__wt_update_alloc( - session, NULL, &append, ¬used)); + WT_RET(__wt_update_alloc(session, + NULL, &append, ¬used, WT_UPDATE_DELETED)); else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); if ((ret = __wt_page_cell_data_ref( session, page, vpack, tmp)) == 0) - ret = __wt_update_alloc( - session, tmp, &append, ¬used); + ret = __wt_update_alloc(session, + tmp, &append, ¬used, WT_UPDATE_STANDARD); __wt_scr_free(session, &tmp); WT_RET(ret); } @@ -1721,7 +1808,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) */ WT_ASSERT(session, r->space_avail >= size); WT_ASSERT(session, WT_BLOCK_FITS( - r->first_free, size, r->disk_image.mem, r->disk_image.memsize)); + r->first_free, size, r->cur_img_ptr->mem, r->cur_img_ptr->memsize)); r->entries += v; r->space_avail -= size; @@ -1808,7 +1895,7 @@ __rec_dict_replace( * copy cell instead. */ if (dp->offset == 0) - dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem); + dp->offset = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); else { /* * The offset is the byte offset from this cell to the previous, @@ -1816,7 +1903,7 @@ __rec_dict_replace( * page. */ offset = (uint64_t)WT_PTRDIFF(r->first_free, - (uint8_t *)r->disk_image.mem + dp->offset); + (uint8_t *)r->cur_img_ptr->mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy(&val->cell, rle, offset); val->buf.data = NULL; @@ -1952,7 +2039,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { - bnd->offset = 0; bnd->max_bnd_recno = WT_RECNO_OOB; bnd->max_bnd_entries = 0; @@ -2105,8 +2191,8 @@ __rec_split_init(WT_SESSION_IMPL *session, r->page_size = r->page_size_orig = max; if (r->raw_compression) r->max_raw_page_size = r->page_size = - (uint32_t)WT_MIN(r->page_size * 10, - WT_MAX(r->page_size, btree->maxmempage / 2)); + (uint32_t)WT_MIN((uint64_t)r->page_size * 10, + WT_MAX((uint64_t)r->page_size, btree->maxmempage / 2)); /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2165,15 +2251,14 @@ __rec_split_init(WT_SESSION_IMPL *session, * Ensure the disk image buffer is large enough for the max object, as * corrected by the underlying block manager. * - * The buffer that we build disk image in, needs to hold two chunks - * worth of data. Since we want to support split_size more than the page - * size (to allow for adjustments based on the compression), this buffer - * should be greater of twice of split_size and page_size. + * Since we want to support split_size more than the page size (to allow + * for adjustments based on the compression), this buffer should be + * greater of split_size and page_size. */ corrected_page_size = r->page_size; - disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size); + WT_RET(__wt_buf_init(session, &r->disk_image[0], disk_img_buf_size)); /* * Clear the disk page header to ensure all of it is initialized, even @@ -2183,15 +2268,17 @@ __rec_split_init(WT_SESSION_IMPL *session, * fixed-length column-store sets bits in bytes, where the bytes are * assumed to initially be 0. */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? 
+ memset(r->disk_image[0].mem, 0, page->type == WT_PAGE_COL_FIX ? disk_img_buf_size : WT_PAGE_HEADER_SIZE); /* * Set the page type (the type doesn't change, and setting it later * would require additional code in a few different places). */ - dsk = r->disk_image.mem; + dsk = r->disk_image[0].mem; dsk->type = page->type; + r->cur_img_ptr = &r->disk_image[0]; + r->prev_img_ptr = NULL; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -2200,7 +2287,6 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); r->bnd[0].max_bnd_recno = recno; - r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); /* Initialize the entry counter. */ r->entries = 0; @@ -2406,21 +2492,18 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, inuse, len; + size_t corrected_page_size, inuse; btree = S2BT(session); bm = btree->bm; - len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - inuse = (len - r->bnd[r->bnd_next].offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); corrected_page_size = inuse + add_len; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - /* Need to account for buffer carrying two chunks worth of data */ - WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + WT_RET(__wt_buf_grow(session, r->cur_img_ptr, corrected_page_size)); - r->first_free = (uint8_t *)r->disk_image.mem + len; + r->first_free = (uint8_t *)r->cur_img_ptr->mem + inuse; WT_ASSERT(session, corrected_page_size >= inuse); r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); @@ -2429,89 +2512,55 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) } /* - * __rec_split_write_prev_and_shift_cur -- - * Write the previous split chunk to the disk as a page. Shift the contents - * of the current chunk to the start of the buffer, making space for a new - * chunk to be written. - * If the caller asks for a chunk resizing, the boundary between the two - * chunks is readjusted to the minimum split size boundary details stored - * in the previous chunk, letting the current chunk grow at the cost of the - * previous chunk. + * __rec_split_write_prev_and_swap_buf -- + * If there is a previous split chunk held in the memory, write it to the + * disk as a page. If there isn't one, this is the first time we are + * splitting and need to initialize a second buffer. Also, swap the + * previous and the current buffer pointers. */ static int -__rec_split_write_prev_and_shift_cur( - WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +__rec_split_write_prev_and_swap_buf(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BM *bm; - WT_BOUNDARY *bnd_cur, *bnd_prev; - WT_BTREE *btree; - WT_PAGE_HEADER *dsk, *dsk_tmp; - size_t cur_len, len; - uint8_t *dsk_start; - - WT_ASSERT(session, r->bnd_next != 0); - - btree = S2BT(session); - bm = btree->bm; - bnd_cur = &r->bnd[r->bnd_next]; - bnd_prev = bnd_cur - 1; - dsk = r->disk_image.mem; - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; - - /* - * Resize chunks if the current is smaller than the minimum, and there - * are details on the minimum split size boundary available in the - * previous boundary details. - * - * There is a possibility that we do not have a minimum boundary set, in - * such a case we skip chunk resizing. 
Such a condition is possible for - * instance when we are building the image in the buffer and the first - * K/V pair is large enough that it surpasses both the minimum split - * size and the split size the application has set. In such a case we - * split the chunk without saving any minimum boundary. - */ - if (resize_chunks && - cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { - bnd_cur->offset = bnd_prev->min_bnd_offset; - bnd_cur->max_bnd_entries += - bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; - bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; - bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; - - WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, - bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); - - /* Update current chunk's length */ - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + WT_BOUNDARY *bnd_prev; + WT_ITEM *tmp_img_ptr; + WT_PAGE_HEADER *dsk; + size_t disk_img_size; + + WT_ASSERT(session, (r->prev_img_ptr == NULL && r->bnd_next == 0) || + (r->prev_img_ptr != NULL && r->bnd_next != 0)); + + /* Write previous chunk, if there is one */ + if (r->prev_img_ptr != NULL) { + bnd_prev = &r->bnd[r->bnd_next - 1]; + dsk = r->prev_img_ptr->mem; + dsk->recno = bnd_prev->max_bnd_recno; + dsk->u.entries = bnd_prev->max_bnd_entries; + dsk->mem_size = (uint32_t)bnd_prev->size; + r->prev_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, bnd_prev, r->prev_img_ptr, false)); + } else { + /* + * If we do not have a previous buffer, we should initialize the + * second buffer before proceeding. We will create the second + * buffer of the same size as the current buffer. + */ + disk_img_size = r->cur_img_ptr->memsize; + WT_RET(__wt_buf_init(session, + &r->disk_image[1], disk_img_size)); + r->prev_img_ptr = &r->disk_image[1]; + dsk = r->prev_img_ptr->mem; + memset(dsk, 0, + r->page->type == WT_PAGE_COL_FIX ? + disk_img_size : WT_PAGE_HEADER_SIZE); + dsk->type = r->page->type; } - /* - * Create an interim buffer if not already done to prepare the previous - * chunk's disk image. 
- */ - len = bnd_cur->offset; - WT_RET(bm->write_size(bm, session, &len)); - if (r->interim_buf == NULL) - WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); - else - WT_RET(__wt_buf_init(session, r->interim_buf, len)); - - dsk_tmp = r->interim_buf->mem; - memcpy(dsk_tmp, dsk, bnd_cur->offset); - dsk_tmp->recno = bnd_prev->max_bnd_recno; - dsk_tmp->u.entries = bnd_prev->max_bnd_entries; - dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); - r->interim_buf->size = dsk_tmp->mem_size; - WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); - - /* Shift the current chunk to the start of the buffer */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); - - /* Fix boundary offset */ - bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* Fix where free points */ - r->first_free = dsk_start + cur_len; + /* swap previous and current buffers */ + tmp_img_ptr = r->prev_img_ptr; + r->prev_img_ptr = r->cur_img_ptr; + r->cur_img_ptr = tmp_img_ptr; + return (0); } @@ -2529,7 +2578,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) size_t inuse; btree = S2BT(session); - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; /* Fixed length col store can call with next_len 0 */ WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); @@ -2543,9 +2592,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - last = &r->bnd[r->bnd_next]; - inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, dsk); /* * We can get here if the first key/value pair won't fit. @@ -2558,8 +2605,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* All page boundaries reset the dictionary. */ __rec_dictionary_reset(r); - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ + last = &r->bnd[r->bnd_next]; last->max_bnd_entries = r->entries; + last->size = (uint32_t)inuse; /* * In case of bulk load, write out chunks as we get them. Otherwise we @@ -2571,19 +2620,22 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) dsk->recno = last->max_bnd_recno; dsk->u.entries = last->max_bnd_entries; dsk->mem_size = (uint32_t)inuse; - r->disk_image.size = dsk->mem_size; - WT_RET(__rec_split_write( - session, r, last, &r->disk_image, false)); - /* Fix where free points */ - r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - } else if (r->bnd_next != 0) - WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); + r->cur_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, last, r->cur_img_ptr, false)); + } else { + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + /* current image we are writing to has changed */ + dsk = r->cur_img_ptr->mem; + } + + /* Fix where free points */ + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Prepare the next boundary */ WT_RET(__rec_split_bnd_grow(session, r)); r->bnd_next++; next = &r->bnd[r->bnd_next]; - next->offset = WT_PTRDIFF(r->first_free, dsk); /* Set the key for the next chunk. 
*/ next->max_bnd_recno = r->recno; if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) @@ -2642,9 +2694,8 @@ __rec_split_crossing_bnd( !WT_CROSSING_SPLIT_BND(r, next_len)) { btree = S2BT(session); bnd = &r->bnd[r->bnd_next]; - dsk = r->disk_image.mem; - min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - - bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + dsk = r->cur_img_ptr->mem; + min_bnd_offset = WT_PTRDIFF(r->first_free, dsk); if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) /* * This is possible if the first record doesn't fit in @@ -2705,7 +2756,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, unpack = &_unpack; compressor = btree->compressor; dst = &r->raw_destination; - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next]; @@ -3021,7 +3072,7 @@ no_slots: r->first_free = dsk_start + len; r->space_avail += r->raw_offsets[result_slots]; WT_ASSERT(session, r->first_free + r->space_avail <= - (uint8_t *)r->disk_image.mem + r->disk_image.memsize); + (uint8_t *)r->cur_img_ptr->mem + r->cur_img_ptr->memsize); /* * Set the key for the next block (before writing the block, a @@ -3060,13 +3111,13 @@ no_slots: dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; - r->disk_image.size = dsk->mem_size; + r->cur_img_ptr->size = dsk->mem_size; r->entries = 0; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - write_ref = &r->disk_image; + write_ref = r->cur_img_ptr; last->already_compressed = false; } else { /* @@ -3094,7 +3145,7 @@ no_slots: last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( - session, &r->disk_image, dst->mem, dst->size)); + session, r->cur_img_ptr, dst->mem, dst->size)); } else WT_RET( __rec_split_write(session, r, last, write_ref, last_block)); @@ -3128,15 +3179,120 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) } /* + * __rec_split_finish_process_prev -- + * If the two split chunks together fit in a single page, merge them into + * one. If they do not fit in a single page but the last is smaller than + * the minimum desired, move some data from the penultimate chunk to the + * last chunk and write out the previous/penultimate. Finally, update the + * pointer to the current image buffer. After this function exits, we will + * have one (last) buffer in memory, pointed to by the current image + * pointer. + */ +static int +__rec_split_finish_process_prev( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *chunks_merged) +{ + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t len_to_move; + uint32_t combined_size; + uint8_t *cur_dsk_start; + + WT_ASSERT(session, r->prev_img_ptr != NULL); + + btree = S2BT(session); + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + *chunks_merged = false; + /* + * The sizes referred to in the boundary structure include the header, + * so when calculating the combined size, make sure not to include the + * header twice. + */ + combined_size = bnd_prev->size + + (bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + + if (combined_size <= r->page_size) { + /* + * We have two boundaries, but the data in the buffers can fit a + * single page. Merge the boundaries and create a single chunk. 
+ */ + dsk = r->cur_img_ptr->mem; + memcpy((uint8_t *)r->prev_img_ptr->mem + bnd_prev->size, + WT_PAGE_HEADER_BYTE(btree, dsk), + bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + bnd_prev->size = combined_size; + bnd_prev->max_bnd_entries += bnd_cur->max_bnd_entries; + r->bnd_next--; + *chunks_merged = true; + } else { + if (bnd_cur->size < r->min_split_size && + bnd_prev->min_bnd_offset != 0 ) { + /* + * The last chunk, pointed to by the current image + * pointer, has less than the minimum data. Let's move + * any data more than the minimum from the previous + * image into the current. + */ + len_to_move = bnd_prev->size - bnd_prev->min_bnd_offset; + /* Grow current buffer if it is not large enough */ + if (r->space_avail < len_to_move) + WT_RET(__rec_split_grow(session, + r, len_to_move)); + cur_dsk_start = WT_PAGE_HEADER_BYTE(btree, + r->cur_img_ptr->mem); + + /* + * Shift the contents of the current buffer to make + * space for the data that will be prepended into the + * current buffer + */ + memmove(cur_dsk_start + len_to_move, + cur_dsk_start, bnd_cur->size - + WT_PAGE_HEADER_BYTE_SIZE(btree)); + /* + * copy any data more than the minimum, from the + * previous buffer to the start of the current. + */ + memcpy(cur_dsk_start, (uint8_t *)r->prev_img_ptr->mem + + bnd_prev->min_bnd_offset, len_to_move); + + /* Update boundary information */ + bnd_cur->size += (uint32_t)len_to_move; + bnd_prev->size -= (uint32_t)len_to_move; + bnd_cur->max_bnd_entries += bnd_prev->max_bnd_entries - + bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + WT_RET(__wt_buf_set(session, + &bnd_cur->max_bnd_key, bnd_prev->min_bnd_key.data, + bnd_prev->min_bnd_key.size)); + } + + /* Write out the previous image */ + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + } + + /* + * At this point, there is only one disk image in the memory, pointed to + * by the previous image pointer. Update the current image pointer to + * this image. + */ + r->cur_img_ptr = r->prev_img_ptr; + return (0); +} + +/* * __rec_split_finish_std -- * Finish processing a page, standard version. */ static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BOUNDARY *bnd_cur; WT_PAGE_HEADER *dsk; - bool grow_bnd; + bool chunks_merged; /* * We may arrive here with no entries to write if the page was entirely @@ -3163,50 +3319,22 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - dsk = r->disk_image.mem; - - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ bnd_cur = &r->bnd[r->bnd_next]; bnd_cur->max_bnd_entries = r->entries; + bnd_cur->size = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); - grow_bnd = true; - /* - * We can reach here even with raw_compression when the last split chunk - * is too small to be sent for raw compression. - */ - if (!r->is_bulk_load && !r->raw_compression) { - if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && - r->bnd_next != 0) { - /* - * We hold two boundaries worth of data in the buffer, - * and this data doesn't fit in a single page. If the - * last chunk is too small, readjust the boundary to a - * pre-computed minimum. 
- * Write out the penultimate chunk to the disk as a page - */ - WT_RET(__rec_split_write_prev_and_shift_cur( - session, r, true)); - } else - if (r->bnd_next != 0) { - /* - * We have two boundaries, but the data in the - * buffer can fit a single page. Merge the - * boundaries to create a single chunk. - */ - bnd_prev = bnd_cur - 1; - bnd_prev->max_bnd_entries += - bnd_cur->max_bnd_entries; - r->bnd_next--; - grow_bnd = false; - } - } + chunks_merged = false; + if (r->prev_img_ptr != NULL) + WT_RET(__rec_split_finish_process_prev(session, + r, &chunks_merged)); /* * We already have space for an extra boundary if we merged two * boundaries above, in that case we do not need to grow the boundary * structure. */ - if (grow_bnd) + if (!chunks_merged) WT_RET(__rec_split_bnd_grow(session, r)); bnd_cur = &r->bnd[r->bnd_next]; r->bnd_next++; @@ -3215,14 +3343,15 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Current boundary now has all the remaining data/last page now. * Let's write it to the disk */ + dsk = r->cur_img_ptr->mem; dsk->recno = bnd_cur->max_bnd_recno; dsk->u.entries = bnd_cur->max_bnd_entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); - r->disk_image.size = dsk->mem_size; + dsk->mem_size = bnd_cur->size; + r->cur_img_ptr->size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ return (__rec_is_checkpoint(session, r, bnd_cur) ? - 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); + 0 : __rec_split_write(session, r, bnd_cur, r->cur_img_ptr, true)); } /* @@ -3244,7 +3373,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (r->raw_compression && r->entries != 0) { while (r->entries != 0) { data_size = - WT_PTRDIFF(r->first_free, r->disk_image.mem); + WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); if (data_size <= btree->allocsize) break; WT_RET(__rec_split_raw_worker(session, r, 0, true)); @@ -3523,8 +3652,7 @@ __rec_update_las(WT_SESSION_IMPL *session, WT_PAGE *page; WT_SAVE_UPD *list; WT_UPDATE *upd; - uint64_t las_counter; - int64_t insert_cnt; + uint64_t insert_cnt, las_counter; uint32_t i, session_flags, slot; uint8_t *p; @@ -3613,20 +3741,24 @@ __rec_update_las(WT_SESSION_IMPL *session, /* * Walk the list of updates, storing each key/value pair into - * the lookaside table. + * the lookaside table. Skipped reserved items, they're never + * restored, obviously. 
*/ do { + if (upd->type == WT_UPDATE_RESERVED) + continue; + cursor->set_key(cursor, btree_id, &las_addr, ++las_counter, list->onpage_txn, key); - if (WT_UPDATE_DELETED_ISSET(upd)) + if (upd->type == WT_UPDATE_DELETED) las_value.size = 0; else { las_value.data = WT_UPDATE_DATA(upd); las_value.size = upd->size; } cursor->set_value( - cursor, upd->txnid, upd->size, &las_value); + cursor, upd->txnid, upd->type, &las_value); WT_ERR(cursor->insert(cursor)); ++insert_cnt; @@ -3635,9 +3767,11 @@ __rec_update_las(WT_SESSION_IMPL *session, err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - if (insert_cnt > 0) - (void)__wt_atomic_addi64( + if (insert_cnt > 0) { + (void)__wt_atomic_add64( &S2C(session)->las_record_cnt, insert_cnt); + __rec_verbose_lookaside_write(session); + } __wt_scr_free(session, &key); return (ret); @@ -4389,8 +4523,7 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_RET(__rec_split_raw(session, r, val->len)); } else if (WT_CHECK_CROSSING_BND(r, val->len)) - WT_RET(__rec_split_crossing_bnd( - session, r, val->len)); + WT_RET(__rec_split_crossing_bnd(session, r, val->len)); /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -4553,7 +4686,7 @@ record_loop: /* update_no_copy = true; /* No data copy */ repeat_count = 1; /* Single record */ - deleted = WT_UPDATE_DELETED_ISSET(upd); + deleted = upd->type == WT_UPDATE_DELETED; if (!deleted) { data = WT_UPDATE_DATA(upd); size = upd->size; @@ -4788,7 +4921,7 @@ compare: /* } } else { deleted = upd == NULL || - WT_UPDATE_DELETED_ISSET(upd); + upd->type == WT_UPDATE_DELETED; if (!deleted) { data = WT_UPDATE_DATA(upd); size = upd->size; @@ -5333,7 +5466,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, __wt_ovfl_cache(session, page, rip, vpack)); /* If this key/value pair was deleted, we're done. */ - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { /* * Overflow keys referencing discarded values * are no longer useful, discard the backing @@ -5543,7 +5676,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { /* Look for an update. */ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd)) + if (upd == NULL || upd->type == WT_UPDATE_DELETED) continue; if (upd->size == 0) /* Build value cell. */ @@ -5833,7 +5966,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * write the buffer so we know what to do here. */ if (bnd->addr.addr == NULL) - WT_RET(__wt_bt_write(session, &r->disk_image, + WT_RET(__wt_bt_write(session, r->cur_img_ptr, NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); else { @@ -6497,7 +6630,7 @@ __rec_dictionary_lookup( for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); dp != NULL && dp->hash == hash; dp = dp->next[0]) { WT_RET(__wt_cell_pack_data_match( - (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset), + (WT_CELL *)((uint8_t *)r->cur_img_ptr->mem + dp->offset), &val->cell, val->buf.data, &match)); if (match) { WT_STAT_DATA_INCR(session, rec_dictionary); @@ -6530,3 +6663,51 @@ __rec_dictionary_lookup( *dpp = next; return (0); } + +/* + * __rec_verbose_lookaside_write -- + * Create a verbose message to display once per checkpoint with details + * about the cache state when performing a lookaside table write. 
+ */ +static void +__rec_verbose_lookaside_write(WT_SESSION_IMPL *session) +{ +#ifdef HAVE_VERBOSE + WT_CONNECTION_IMPL *conn; + uint64_t ckpt_gen_current, ckpt_gen_last; + uint32_t pct_dirty, pct_full; + + if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) return; + + conn = S2C(session); + ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); + ckpt_gen_last = conn->las_verb_gen_write; + + /* + * This message is throttled to one per checkpoint. To do this we + * track the generation of the last checkpoint for which the message + * was printed and check against the current checkpoint generation. + */ + if (ckpt_gen_current > ckpt_gen_last) { + /* + * Attempt to atomically replace the last checkpoint generation + * for which this message was printed. If the atomic swap fails + * we have raced and the winning thread will print the message. + */ + if (__wt_atomic_casv64(&conn->las_verb_gen_write, + ckpt_gen_last, ckpt_gen_current)) { + (void)__wt_eviction_clean_needed(session, &pct_full); + (void)__wt_eviction_dirty_needed(session, &pct_dirty); + + __wt_verbose(session, WT_VERB_LOOKASIDE, + "Page reconciliation triggered lookaside write. " + "Entries now in lookaside file: %" PRIu64 ", " + "cache dirty: %" PRIu32 "%% , " + "cache use: %" PRIu32 "%%", + conn->las_record_cnt, pct_dirty, pct_full); + } + } +#else + WT_UNUSED(session); +#endif +}
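The double-buffering described in the patch's new header comment is easiest to follow end to end. Below is a minimal standalone sketch of the same pattern, not WiredTiger code, using invented names (struct writer, flush_chunk(), chunk_boundary()): records are appended to the image the current pointer references; at each split boundary the previously held image is written out (or the second image is brought into service the first time) and the two pointers are swapped, so the just-finished chunk stays in memory until the final chunk can be merged with or rebalanced against it.

/*
 * Standalone sketch (not WiredTiger code) of the two-image scheme the patch
 * introduces: build chunks in "cur", hold the finished chunk in "prev", and
 * write a chunk only once the next one is complete.
 */
#include <stdio.h>
#include <string.h>

#define IMG_SIZE 16                     /* tiny "page" size for the demo */

struct writer {
        char img[2][IMG_SIZE];          /* two temporary disk images */
        char *cur;                      /* image currently being filled */
        char *prev;                     /* finished image held in memory */
        size_t used;                    /* bytes used in the current image */
};

/* Stand-in for __rec_split_write(): just print the finished chunk. */
static void
flush_chunk(const char *img, size_t len)
{
        printf("write chunk: %.*s\n", (int)len, img);
}

/* Mirrors the role of __rec_split_write_prev_and_swap_buf(), on plain buffers. */
static void
chunk_boundary(struct writer *w)
{
        char *tmp;

        if (w->prev != NULL)
                flush_chunk(w->prev, IMG_SIZE); /* write the held chunk */
        else
                w->prev = w->img[1];    /* first split: enlist image 2 */

        /*
         * Swap the pointers: the finished chunk stays in memory as
         * "previous", the buffer just written (or newly enlisted) is
         * refilled as "current".
         */
        tmp = w->prev;
        w->prev = w->cur;
        w->cur = tmp;
        w->used = 0;
}

static void
add_record(struct writer *w, char c)
{
        if (w->used == IMG_SIZE)
                chunk_boundary(w);
        w->cur[w->used++] = c;
}

int
main(void)
{
        struct writer w;
        int i;

        memset(&w, 0, sizeof(w));
        w.cur = w.img[0];

        for (i = 0; i < 40; ++i)
                add_record(&w, (char)('a' + i % 26));

        /* Finish: the real code may merge or rebalance these two instead. */
        if (w.prev != NULL)
                flush_chunk(w.prev, IMG_SIZE);
        flush_chunk(w.cur, w.used);
        return (0);
}

In the patch itself the same roles are played by r->cur_img_ptr, r->prev_img_ptr and __rec_split_write_prev_and_swap_buf().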
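__rec_split_finish_process_prev() is the consumer of that held chunk when reconciliation finishes. The sketch below shows its two outcomes on plain byte buffers, under assumed names (struct chunk, finish_chunks()) and ignoring the page header, buffer growth and the key/recno bookkeeping the real function also handles: merge the chunks when they fit a single page, otherwise shift everything past the previous chunk's minimum split boundary into the undersized last chunk.

/* Sketch only: plain byte buffers stand in for the two disk images. */
#include <stddef.h>
#include <string.h>

struct chunk {
        unsigned char data[64];
        size_t size;                    /* bytes used */
        size_t min_off;                 /* minimum split boundary, 0 if unset */
};

/* Returns 1 if cur was merged into prev, 0 if both chunks remain. */
static int
finish_chunks(struct chunk *prev, struct chunk *cur,
    size_t page_size, size_t min_split_size)
{
        size_t move;

        if (prev->size + cur->size <= page_size) {
                /* Everything fits in one page: merge into a single chunk. */
                memcpy(prev->data + prev->size, cur->data, cur->size);
                prev->size += cur->size;
                return (1);
        }
        if (cur->size < min_split_size && prev->min_off != 0) {
                /*
                 * The last chunk is too small: move whatever lies past the
                 * previous chunk's minimum boundary into it.  (The real
                 * function grows the destination buffer first if needed.)
                 */
                move = prev->size - prev->min_off;
                memmove(cur->data + move, cur->data, cur->size);
                memcpy(cur->data, prev->data + prev->min_off, move);
                cur->size += move;
                prev->size = prev->min_off;
        }
        return (0);
}

int
main(void)
{
        struct chunk prev = { .size = 40, .min_off = 30 };
        struct chunk cur = { .size = 5 };

        memset(prev.data, 'p', prev.size);
        memset(cur.data, 'c', cur.size);

        /* 40 + 5 exceeds a 32-byte page and 5 < 20, so rebalance. */
        return (finish_chunks(&prev, &cur, 32, 20) == 0 &&
            prev.size == 30 && cur.size == 15 ? 0 : 1);
}

In the diff, the same decision also carries the entry counts, the starting recno and the split key from the previous boundary to the last one.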
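The rewritten __rec_write_check_complete() replaces the old ratio test (at least 10% of updates skipped) with a sequence of progress checks before suggesting a lookaside retry. Here is a compilable sketch of just that decision, using an invented rec_summary struct in place of the corresponding WT_RECONCILE fields and omitting the in-memory-eviction and lookaside/checkpoint tests the real function performs first.

/*
 * Sketch: rec_summary is an invented stand-in for the WT_RECONCILE fields the
 * patch consults (bnd_next, update_mem_all/saved/uncommitted, the
 * evict/restore flag and the saved-update list).
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct rec_summary {
        bool evict_update_restore;      /* evict/restore attempt */
        bool saved_updates;             /* some updates were saved */
        uint32_t chunks;                /* split chunks generated */
        uint64_t mem_all;               /* update memory seen */
        uint64_t mem_saved;             /* update memory saved */
        uint64_t mem_uncommitted;       /* uncommitted update memory */
};

static int
check_complete(const struct rec_summary *r, bool *lookaside_retryp)
{
        /* The caller may not support a lookaside retry at all. */
        if (lookaside_retryp == NULL)
                return (0);
        /* Only consider it for evict/restore attempts that saved updates. */
        if (!r->evict_update_restore || !r->saved_updates)
                return (0);
        /* A split, or an empty page, is progress. */
        if (r->chunks != 1)
                return (0);
        /* Writing some of the update memory is progress. */
        if (r->mem_saved != r->mem_all)
                return (0);
        /* Uncommitted updates rule the lookaside table out. */
        if (r->mem_uncommitted != 0)
                return (0);
        /* No progress and lookaside is possible: suggest the retry. */
        *lookaside_retryp = true;
        return (EBUSY);
}

int
main(void)
{
        struct rec_summary r = { true, true, 1, 100, 100, 0 };
        bool retry = false;

        return (check_complete(&r, &retry) == EBUSY && retry ? 0 : 1);
}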
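Finally, __rec_verbose_lookaside_write() throttles its message to one per checkpoint by publishing the checkpoint generation with a compare-and-swap. The same idea in standalone form, with C11 atomics standing in for __wt_atomic_casv64() and an invented report_once_per_gen() in place of the verbose plumbing:

/* Sketch: C11 atomics stand in for __wt_atomic_casv64(). */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t last_reported_gen;      /* like las_verb_gen_write */

static void
report_once_per_gen(uint64_t current_gen)
{
        uint64_t last = atomic_load(&last_reported_gen);

        if (current_gen <= last)
                return;
        /* Only the thread that wins the swap prints for this generation. */
        if (atomic_compare_exchange_strong(
            &last_reported_gen, &last, current_gen))
                printf("lookaside write, checkpoint generation %" PRIu64 "\n",
                    current_gen);
}

int
main(void)
{
        report_once_per_gen(1);
        report_once_per_gen(1);         /* suppressed: gen 1 already seen */
        report_once_per_gen(2);
        return (0);
}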