WT-2439 Improve page layout: keep pages more than half full (#3277)

* Changes `split_pct` to have a minimum of 50%.
author: Sulabh Mahajan <sulabh.mahajan@mongodb.com> 2017-03-29 15:38:35 +1100
committer: Michael Cahill <michael.cahill@mongodb.com> 2017-03-29 15:38:35 +1100
commit: 1c41c7735b3529521b7bd34180f80584caee7f59 (patch)
tree: 7ff9a0eb97bd4667fd2286dae77be06a75426d17
parent: d5a10d2e97853e7db6bb4c2635b97febf13607c5 (diff)
download: mongo-1c41c7735b3529521b7bd34180f80584caee7f59.tar.gz
7 files changed, 532 insertions, 463 deletions
diff --git a/dist/api_data.py b/dist/api_data.py
index 1d669fa7fe0..22600dd5e29 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -295,12 +295,12 @@ file_config = format_meta + file_runtime_config + [
     Config('split_deepen_per_child', '0', r'''
         entries allocated per child when deepening the tree''',
         type='int', undoc=True),
-    Config('split_pct', '75', r'''
+    Config('split_pct', '90', r'''
         the Btree page split size as a percentage of the maximum Btree
         page size, that is, when a Btree page is split, it will be
         split into smaller pages, where each page is the specified
         percentage of the maximum Btree page size''',
-        min='25', max='100'),
+        min='50', max='100'),
 ]
 
 # File metadata, including both configurable and non-configurable (internal)
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index f2bffee06da..98c246fb897 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -780,9 +780,16 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
 	 * Get the split percentage (reconciliation splits pages into smaller
 	 * than the maximum page size chunks so we don't split every time a
 	 * new entry is added). Determine how large newly split pages will be.
+	 * Set to the minimum, if the read value is less than that.
 	 */
 	WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
-	btree->split_pct = (int)cval.val;
+	if (cval.val < WT_BTREE_MIN_SPLIT_PCT) {
+		btree->split_pct = WT_BTREE_MIN_SPLIT_PCT;
+		WT_RET(__wt_msg(session,
+		    "Re-setting split_pct for %s to the minimum allowed of "
+		    "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT));
+	} else
+		btree->split_pct = (int)cval.val;
 	intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
 	leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
 
diff --git a/src/config/config_def.c b/src/config/config_def.c
index b11a8d63fdb..f152fbacad4 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -294,7 +294,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
 	{ "source", "string", NULL, NULL, NULL, 0 },
 	{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
 	{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
-	{ "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
+	{ "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
 	{ "type", "string", NULL, NULL, NULL, 0 },
 	{ "value_format", "format",
 	    __wt_struct_confchk, NULL,
@@ -466,7 +466,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = {
 	{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
 	{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
 	{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
-	{ "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
+	{ "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
 	{ "value_format", "format",
 	    __wt_struct_confchk, NULL,
 	    NULL, 0 },
@@ -530,7 +530,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
 	{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
 	{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
 	{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
-	{ "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
+	{ "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
 	{ "value_format", "format",
 	    __wt_struct_confchk, NULL,
 	    NULL, 0 },
@@ -614,7 +614,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
 	{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
 	{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
 	{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
-	{ "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
+	{ "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
 	{ "value_format", "format",
 	    __wt_struct_confchk, NULL,
 	    NULL, 0 },
@@ -1119,7 +1119,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
 	  "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB,"
 	  "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
 	  "prefix_compression_min=4,source=,split_deepen_min_child=0,"
-	  "split_deepen_per_child=0,split_pct=75,type=file,value_format=u",
+	  "split_deepen_per_child=0,split_pct=90,type=file,value_format=u",
 	  confchk_WT_SESSION_create, 42
 	},
 	{ "WT_SESSION.drop",
@@ -1213,7 +1213,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
 	  "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB,"
 	  "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
 	  "prefix_compression_min=4,split_deepen_min_child=0,"
-	  "split_deepen_per_child=0,split_pct=75,value_format=u",
+	  "split_deepen_per_child=0,split_pct=90,value_format=u",
 	  confchk_file_config, 35
 	},
 	{ "file.meta",
@@ -1228,7 +1228,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
 	  "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0,"
 	  "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0,"
 	  "os_cache_max=0,prefix_compression=false,prefix_compression_min=4"
-	  ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75,"
+	  ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
 	  "value_format=u,version=(major=0,minor=0)",
 	  confchk_file_meta, 39
 	},
@@ -1253,7 +1253,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
 	  "merge_min=0),memory_page_max=5MB,old_chunks=,"
 	  "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
 	  "prefix_compression_min=4,split_deepen_min_child=0,"
-	  "split_deepen_per_child=0,split_pct=75,value_format=u",
+	  "split_deepen_per_child=0,split_pct=90,value_format=u",
 	  confchk_lsm_meta, 39
 	},
 	{ "table.meta",
diff --git a/src/include/btree.h b/src/include/btree.h
index 88312f408cc..28fe1b94b23 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -58,6 +58,12 @@
 #define	WT_BTREE_DELETE_THRESHOLD	1000
 
 /*
+ * Minimum size of the chunks (in percentage of the page size) a page gets split
+ * into during reconciliation.
+ */
+#define	WT_BTREE_MIN_SPLIT_PCT		50
+
+/*
  * WT_BTREE --
  *	A btree handle.
  */
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 558e93d3de0..707159ef6ae 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1242,8 +1242,8 @@ struct __wt_session {
 	 * @config{split_pct, the Btree page split size as a percentage of the
 	 * maximum Btree page size\, that is\, when a Btree page is split\, it
 	 * will be split into smaller pages\, where each page is the specified
-	 * percentage of the maximum Btree page size., an integer between 25 and
-	 * 100; default \c 75.}
+	 * percentage of the maximum Btree page size., an integer between 50 and
+	 * 100; default \c 90.}
 	 * @config{type, set the type of data source used to store a column
 	 * group\, index or simple table.  By default\, a \c "file:" URI is
 	 * derived from the object name.  The \c type configuration can be used
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 23f654caa70..e18d44f96ff 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -26,6 +26,11 @@ typedef struct {
 	uint32_t flags;			/* Caller's configuration */
 
 	WT_ITEM	 disk_image;		/* Temporary disk-image buffer */
+	/*
+	 * Temporary buffer used to write out a disk image when managing two
+	 * chunks worth of data in memory
+	 */
+	WT_ITEM *interim_buf;
 
 	/*
 	 * Track start/stop write generation to decide if all changes to the
@@ -127,6 +132,7 @@ typedef struct {
 	 * repeatedly split a packed page.
 	 */
 	uint32_t split_size;		/* Split page size */
+	uint32_t min_split_size;	/* Minimum split page size */
 
 	/*
 	 * The problem with splits is we've done a lot of work by the time we
@@ -151,16 +157,6 @@ typedef struct {
 		 */
 		size_t offset;		/* Split's first byte */
 
-		/*
-		 * The recno and entries fields are the starting record number
-		 * of the split chunk (for column-store splits), and the number
-		 * of entries in the split chunk.  These fields are used both
-		 * to write the split chunk, and to create a new internal page
-		 * to reference the split pages.
-		 */
-		uint64_t recno;		/* Split's starting record */
-		uint32_t entries;	/* Split's entries */
-
 		WT_ADDR addr;		/* Split's written location */
 		uint32_t size;		/* Split's size */
 		uint32_t checksum;	/* Split's checksum */
@@ -182,11 +178,36 @@ typedef struct {
 		size_t	     supd_allocated;
 
 		/*
+		 * While reconciling pages, at any given time, we maintain two
+		 * split chunks in the memory to be written out as pages. As we
+		 * get to the last two chunks, if the last one turns out to be
+		 * smaller than the minimum split size, we go back into the
+		 * penultimate chunk and split at this minimum split size
+		 * boundary. This moves some data from the penultimate chunk to
+		 * the last chunk, hence increasing the size of the last page
+		 * written without decreasing the penultimate page size beyond
+		 * the minimum split size. For this reason, we maintain both a
+		 * maximum split percentage boundary and a minimum split
+		 * percentage boundary.
+		 *
+		 * The recno and entries fields are the starting record number
+		 * of the split chunk (for column-store splits), and the number
+		 * of entries in the split chunk.  These fields are used both to
+		 * write the split chunk, and to create a new internal page to
+		 * reference the split pages.
+		 *
 		 * The key for a row-store page; no column-store key is needed
 		 * because the page's recno, stored in the recno field, is the
 		 * column-store key.
 		 */
-		WT_ITEM key;		/* Promoted row-store key */
+		uint32_t max_bnd_entries;
+		uint64_t max_bnd_recno;
+		WT_ITEM  max_bnd_key;
+
+		size_t   min_bnd_offset;
+		uint32_t min_bnd_entries;
+		uint64_t min_bnd_recno;
+		WT_ITEM  min_bnd_key;
 	} *bnd;				/* Saved boundaries */
 	uint32_t bnd_next;		/* Next boundary slot */
 	uint32_t bnd_next_max;		/* Maximum boundary slots used */
@@ -194,28 +215,6 @@ typedef struct {
 	size_t   bnd_allocated;		/* Bytes allocated */
 
 	/*
-	 * We track the total number of page entries copied into split chunks
-	 * so we can easily figure out how many entries in the current split
-	 * chunk.
-	 */
-	uint32_t total_entries;		/* Total entries in splits */
-
-	/*
-	 * And there's state information as to where in this process we are:
-	 * (1) tracking split boundaries because we can still fit more split
-	 * chunks into the maximum page size, (2) tracking the maximum page
-	 * size boundary because we can't fit any more split chunks into the
-	 * maximum page size, (3) not performing boundary checks because it's
-	 * either not useful with the current page size configuration, or
-	 * because we've already been forced to split.
-	 */
-	enum {	SPLIT_BOUNDARY=0,	/* Next: a split page boundary */
-		SPLIT_MAX=1,		/* Next: the maximum page boundary */
-		SPLIT_TRACKING_OFF=2,	/* No boundary checks */
-		SPLIT_TRACKING_RAW=3 }	/* Underlying compression decides */
-	bnd_state;
-
-	/*
 	 * We track current information about the current record number, the
 	 * number of entries copied into the temporary buffer, where we are
 	 * in the temporary buffer, and how much memory remains.  Those items
@@ -293,6 +292,14 @@ typedef struct {
 	uint32_t tested_ref_state;	/* Debugging information */
 } WT_RECONCILE;
 
+#define	WT_CROSSING_MIN_BND(r, next_len)				\
+	((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 &&			\
+	    ((r)->space_avail - (next_len)) <				\
+	    ((r)->split_size - (r)->min_split_size))
+#define	WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail)
+#define	WT_CHECK_CROSSING_BND(r, next_len)				\
+	(WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len))
+
 static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool);
 static void __rec_cell_build_addr(WT_SESSION_IMPL *,
 		WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
@@ -314,6 +321,7 @@ static int  __rec_col_var(WT_SESSION_IMPL *,
 static int  __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
 		WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t);
 static int  __rec_destroy_session(WT_SESSION_IMPL *);
+static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t);
 static int  __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
 static int  __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
 static int  __rec_row_leaf(WT_SESSION_IMPL *,
@@ -323,7 +331,6 @@ static int  __rec_row_leaf_insert(
 static int  __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
 static int  __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
 static int  __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
-static int  __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
 static int  __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
 static int  __rec_split_row_promote(
 		WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
@@ -968,6 +975,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
 	*(WT_RECONCILE **)reconcilep = NULL;
 
 	__wt_buf_free(session, &r->disk_image);
+	__wt_scr_free(session, &r->interim_buf);
 
 	__wt_free(session, r->raw_entries);
 	__wt_free(session, r->raw_offsets);
@@ -1032,7 +1040,8 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
 			__wt_free(session, bnd->addr.addr);
 			__wt_free(session, bnd->disk_image);
 			__wt_free(session, bnd->supd);
-			__wt_buf_free(session, &bnd->key);
+			__wt_buf_free(session, &bnd->max_bnd_key);
+			__wt_buf_free(session, &bnd->min_bnd_key);
 		}
 		__wt_free(session, r->bnd);
 		r->bnd_next = 0;
@@ -1927,8 +1936,8 @@ static void
 __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
 {
 	bnd->offset = 0;
-	bnd->recno = WT_RECNO_OOB;
-	bnd->entries = 0;
+	bnd->max_bnd_recno = WT_RECNO_OOB;
+	bnd->max_bnd_entries = 0;
 
 	__wt_free(session, bnd->addr.addr);
 	WT_CLEAR(bnd->addr);
@@ -1943,6 +1952,10 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
 
 	bnd->already_compressed = false;
 
+	bnd->min_bnd_offset = 0;
+	bnd->min_bnd_entries = 0;
+	bnd->min_bnd_recno = WT_RECNO_OOB;
+
 	/*
 	 * Don't touch the key, we re-use that memory in each new
 	 * reconciliation.
@@ -1974,40 +1987,64 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
 }
 
 /*
- * __wt_split_page_size --
- *	Split page size calculation: we don't want to repeatedly split every
- * time a new entry is added, so we split to a smaller-than-maximum page size.
+ * __rec_split_page_size_from_pct --
+ *	Given a split percentage, calculate split page size in bytes.
  */
-uint32_t
-__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
-{
+static uint32_t
+__rec_split_page_size_from_pct(
+    int split_pct, uint32_t maxpagesize, uint32_t allocsize) {
 	uintmax_t a;
 	uint32_t split_size;
 
 	/*
 	 * Ideally, the split page size is some percentage of the maximum page
-	 * size rounded to an allocation unit (round to an allocation unit so
-	 * we don't waste space when we write).
+	 * size rounded to an allocation unit (round to an allocation unit so we
+	 * don't waste space when we write).
 	 */
 	a = maxpagesize;			/* Don't overflow. */
 	split_size = (uint32_t)WT_ALIGN_NEAREST(
-	    (a * (u_int)btree->split_pct) / 100, btree->allocsize);
+	    (a * (u_int)split_pct) / 100, allocsize);
 
 	/*
-	 * Respect the configured split percentage if the calculated split
-	 * size is either zero or a full page. The user has either configured
-	 * an allocation size that matches the page size, or a split
-	 * percentage that is close to zero or one hundred. Rounding is going
-	 * to provide a worse outcome than having a split point that doesn't
-	 * fall on an allocation size boundary in those cases.
+	 * Respect the configured split percentage if the calculated split size
+	 * is either zero or a full page. The user has either configured an
+	 * allocation size that matches the page size, or a split percentage
+	 * that is close to zero or one hundred. Rounding is going to provide a
+	 * worse outcome than having a split point that doesn't fall on an
+	 * allocation size boundary in those cases.
 	 */
 	if (split_size == 0 || split_size == maxpagesize)
-		split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
+		split_size = (uint32_t)((a * (u_int)split_pct) / 100);
 
 	return (split_size);
 }
 
 /*
+ * __wt_split_page_size --
+ *	Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+	return (__rec_split_page_size_from_pct(
+	    btree->split_pct, maxpagesize, btree->allocsize));
+}
+
+/*
+ * __rec_min_split_page_size --
+ *	Minimum split size boundary calculation: To track a boundary at the
+ *	minimum split size that we could have split at instead of splitting at
+ *	the split page size.
+ */
+static uint32_t
+__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+	return (__rec_split_page_size_from_pct(
+	    WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize));
+}
+
+/*
  * __rec_split_init --
  *	Initialization for the reconciliation split functions.
  */
@@ -2018,7 +2055,7 @@ __rec_split_init(WT_SESSION_IMPL *session,
 	WT_BM *bm;
 	WT_BTREE *btree;
 	WT_PAGE_HEADER *dsk;
-	size_t corrected_page_size;
+	size_t corrected_page_size, disk_img_buf_size;
 
 	btree = S2BT(session);
 	bm = btree->bm;
@@ -2053,33 +2090,6 @@ __rec_split_init(WT_SESSION_IMPL *session,
 		r->max_raw_page_size = r->page_size =
 		    (uint32_t)WT_MIN(r->page_size * 10,
 		    WT_MAX(r->page_size, btree->maxmempage / 2));
-
-	/*
-	 * Ensure the disk image buffer is large enough for the max object, as
-	 * corrected by the underlying block manager.
-	 */
-	corrected_page_size = r->page_size;
-	WT_RET(bm->write_size(bm, session, &corrected_page_size));
-	WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size));
-
-	/*
-	 * Clear the disk page header to ensure all of it is initialized, even
-	 * the unused fields.
-	 *
-	 * In the case of fixed-length column-store, clear the entire buffer:
-	 * fixed-length column-store sets bits in bytes, where the bytes are
-	 * assumed to initially be 0.
-	 */
-	memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
-	    corrected_page_size : WT_PAGE_HEADER_SIZE);
-
-	/*
-	 * Set the page type (the type doesn't change, and setting it later
-	 * would require additional code in a few different places).
-	 */
-	dsk = r->disk_image.mem;
-	dsk->type = page->type;
-
 	/*
 	 * If we have to split, we want to choose a smaller page size for the
 	 * split pages, because otherwise we could end up splitting one large
@@ -2099,22 +2109,28 @@ __rec_split_init(WT_SESSION_IMPL *session,
 	 * creating overflow items and compacted data, for example, as those
 	 * items have already been written to disk).  So, the loop calls the
 	 * helper functions when approaching a split boundary, and we save the
-	 * information at that point.  That allows us to go back and split the
-	 * page at the boundary points if we eventually overflow the maximum
-	 * page size.
+	 * information at that point. We also save the boundary information at
+	 * the minimum split size. We maintain two chunks (each boundary
+	 * represents a chunk that gets written as a page) in the memory,
+	 * writing out the older one to the disk as a page when we need to make
+	 * space for a new chunk. On reaching the last chunk, if it turns out to
+	 * be smaller than the minimum split size, we go back into the
+	 * penultimate chunk and split at this minimum split size boundary. This
+	 * moves some data from the penultimate chunk to the last chunk, hence
+	 * increasing the size of the last page written without decreasing the
+	 * penultimate page size beyond the minimum split size.
 	 *
 	 * Finally, all this doesn't matter for fixed-size column-store pages,
 	 * raw compression, and salvage.  Fixed-size column store pages can
 	 * split under (very) rare circumstances, but they're allocated at a
 	 * fixed page size, never anything smaller.  In raw compression, the
-	 * underlying compression routine decides when we split, so it's not
-	 * our problem.  In salvage, as noted above, we can't split at all.
+	 * underlying compression routine decides when we split, so it's not our
+	 * problem.  In salvage, as noted above, we can't split at all.
 	 */
 	if (r->raw_compression || r->salvage != NULL) {
 		r->split_size = 0;
 		r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
-	}
-	else if (page->type == WT_PAGE_COL_FIX) {
+	} else if (page->type == WT_PAGE_COL_FIX) {
 		r->split_size = r->page_size;
 		r->space_avail =
 		    r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
@@ -2122,32 +2138,53 @@ __rec_split_init(WT_SESSION_IMPL *session,
 		r->split_size = __wt_split_page_size(btree, r->page_size);
 		r->space_avail =
 		    r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+		r->min_split_size =
+		    __rec_min_split_page_size(btree, r->page_size);
 	}
+
+	/*
+	 * Ensure the disk image buffer is large enough for the max object, as
+	 * corrected by the underlying block manager.
+	 *
+	 * The buffer that we build disk image in, needs to hold two chunks
+	 * worth of data. Since we want to support split_size more than the page
+	 * size (to allow for adjustments based on the compression), this buffer
+	 * should be greater of twice of split_size and page_size.
+	 */
+	corrected_page_size = r->page_size;
+	disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size);
+	WT_RET(bm->write_size(bm, session, &corrected_page_size));
+	WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size));
+
+	/*
+	 * Clear the disk page header to ensure all of it is initialized, even
+	 * the unused fields.
+	 *
+	 * In the case of fixed-length column-store, clear the entire buffer:
+	 * fixed-length column-store sets bits in bytes, where the bytes are
+	 * assumed to initially be 0.
+	 */
+	memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
+	    disk_img_buf_size : WT_PAGE_HEADER_SIZE);
+
+	/*
+	 * Set the page type (the type doesn't change, and setting it later
+	 * would require additional code in a few different places).
+	 */
+	dsk = r->disk_image.mem;
+	dsk->type = page->type;
+
 	r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
 
 	/* Initialize the first boundary. */
 	r->bnd_next = 0;
 	WT_RET(__rec_split_bnd_grow(session, r));
 	__rec_split_bnd_init(session, &r->bnd[0]);
-	r->bnd[0].recno = recno;
+	r->bnd[0].max_bnd_recno = recno;
 	r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
 
-	/*
-	 * If the maximum page size is the same as the split page size, either
-	 * because of the object type or application configuration, there isn't
-	 * any need to maintain split boundaries within a larger page.
-	 *
-	 * No configuration for salvage here, because salvage can't split.
-	 */
-	if (r->raw_compression)
-		r->bnd_state = SPLIT_TRACKING_RAW;
-	else if (max == r->split_size)
-		r->bnd_state = SPLIT_TRACKING_OFF;
-	else
-		r->bnd_state = SPLIT_BOUNDARY;
-
-	/* Initialize the entry counters. */
-	r->entries = r->total_entries = 0;
+	/* Initialize the entry counter. */
+	r->entries = 0;
 
 	/* Initialize the starting record number. */
 	r->recno = recno;
@@ -2350,19 +2387,112 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
 {
 	WT_BM *bm;
 	WT_BTREE *btree;
-	size_t corrected_page_size, len;
+	size_t corrected_page_size, inuse, len;
 
 	btree = S2BT(session);
 	bm = btree->bm;
 
 	len = WT_PTRDIFF(r->first_free, r->disk_image.mem);
-	corrected_page_size = len + add_len;
+	inuse = (len - r->bnd[r->bnd_next].offset) +
+	    WT_PAGE_HEADER_BYTE_SIZE(btree);
+	corrected_page_size = inuse + add_len;
+
 	WT_RET(bm->write_size(bm, session, &corrected_page_size));
-	WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size));
+	/* Need to account for buffer carrying two chunks worth of data */
+	WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size));
+
 	r->first_free = (uint8_t *)r->disk_image.mem + len;
-	WT_ASSERT(session, corrected_page_size >= len);
-	r->space_avail = corrected_page_size - len;
+	WT_ASSERT(session, corrected_page_size >= inuse);
+	r->space_avail = corrected_page_size - inuse;
 	WT_ASSERT(session, r->space_avail >= add_len);
+
+	return (0);
+}
+
+/*
+ * __rec_split_write_prev_and_shift_cur --
+ *	Write the previous split chunk to the disk as a page. Shift the contents
+ *	of the current chunk to the start of the buffer, making space for a new
+ *	chunk to be written.
+ *	If the caller asks for a chunk resizing, the boundary between the two
+ *	chunks is readjusted to the minimum split size boundary details stored
+ *	in the previous chunk, letting the current chunk grow at the cost of the
+ *	previous chunk.
+ */
+static int
+__rec_split_write_prev_and_shift_cur(
+    WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks)
+{
+	WT_BM *bm;
+	WT_BOUNDARY *bnd_cur, *bnd_prev;
+	WT_BTREE *btree;
+	WT_PAGE_HEADER *dsk, *dsk_tmp;
+	size_t cur_len, len;
+	uint8_t *dsk_start;
+
+	WT_ASSERT(session, r->bnd_next != 0);
+
+	btree = S2BT(session);
+	bm = btree->bm;
+	bnd_cur = &r->bnd[r->bnd_next];
+	bnd_prev = bnd_cur - 1;
+	dsk = r->disk_image.mem;
+	cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
+
+	/*
+	 * Resize chunks if the current is smaller than the minimum, and there
+	 * are details on the minimum split size boundary available in the
+	 * previous boundary details.
+	 *
+	 * There is a possibility that we do not have a minimum boundary set, in
+	 * such a case we skip chunk resizing. Such a condition is possible for
+	 * instance when we are building the image in the buffer and the first
+	 * K/V pair is large enough that it surpasses both the minimum split
+	 * size and the split size the application has set. In such a case we
+	 * split the chunk without saving any minimum boundary.
+	 */
+	if (resize_chunks &&
+	    cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) {
+		bnd_cur->offset = bnd_prev->min_bnd_offset;
+		bnd_cur->max_bnd_entries +=
+		    bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries;
+		bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries;
+		bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno;
+
+		WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key,
+		    bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size));
+
+		/* Update current chunk's length */
+		cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
+	}
+
+	/*
+	 * Create an interim buffer if not already done to prepare the previous
+	 * chunk's disk image.
+	 */
+	len = bnd_cur->offset;
+	WT_RET(bm->write_size(bm, session, &len));
+	if (r->interim_buf == NULL)
+		WT_RET(__wt_scr_alloc(session, len, &r->interim_buf));
+	else
+		WT_RET(__wt_buf_init(session, r->interim_buf, len));
+
+	dsk_tmp = r->interim_buf->mem;
+	memcpy(dsk_tmp, dsk, bnd_cur->offset);
+	dsk_tmp->recno = bnd_prev->max_bnd_recno;
+	dsk_tmp->u.entries = bnd_prev->max_bnd_entries;
+	dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset);
+	r->interim_buf->size = dsk_tmp->mem_size;
+	WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false));
+
+	/* Shift the current chunk to the start of the buffer */
+	dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+	(void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len);
+
+	/* Fix boundary offset */
+	bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
+	/* Fix where free points */
+	r->first_free = dsk_start + cur_len;
 	return (0);
 }
 
@@ -2382,6 +2512,9 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
 	btree = S2BT(session);
 	dsk = r->disk_image.mem;
 
+	/* Fixed length col store can call with next_len 0 */
+	WT_ASSERT(session, next_len == 0 || r->space_avail < next_len);
+
 	/*
 	 * We should never split during salvage, and we're about to drop core
 	 * because there's no parent page.
@@ -2391,147 +2524,58 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
 		    "%s page too large, attempted split during salvage",
 		    __wt_page_type_string(r->page->type));
 
-	/* Hitting a page boundary resets the dictionary, in all cases. */
-	__rec_dictionary_reset(r);
-
-	inuse = WT_PTRDIFF(r->first_free, dsk);
-	switch (r->bnd_state) {
-	case SPLIT_BOUNDARY:
-		/*
-		 * We can get here if the first key/value pair won't fit.
-		 * Additionally, grow the buffer to contain the current item if
-		 * we haven't already consumed a reasonable portion of a split
-		 * chunk.
-		 */
-		if (inuse < r->split_size / 2)
-			break;
-
-		/*
-		 * About to cross a split boundary but not yet forced to split
-		 * into multiple pages. If we have to split, this is one of the
-		 * split points, save information about where we are when the
-		 * split would have happened.
-		 */
-		WT_RET(__rec_split_bnd_grow(session, r));
-		last = &r->bnd[r->bnd_next++];
-		next = last + 1;
-
-		/* Set the number of entries for the just finished chunk. */
-		last->entries = r->entries - r->total_entries;
-		r->total_entries = r->entries;
-
-		/* Set the key for the next chunk. */
-		next->recno = r->recno;
-		if (dsk->type == WT_PAGE_ROW_INT ||
-		    dsk->type == WT_PAGE_ROW_LEAF)
-			WT_RET(__rec_split_row_promote(
-			    session, r, &next->key, dsk->type));
-
-		/*
-		 * Set the starting buffer offset and clear the entries (the
-		 * latter not required, but cleaner).
-		 */
-		next->offset = WT_PTRDIFF(r->first_free, dsk);
-		next->entries = 0;
-
-		/* Set the space available to another split-size chunk. */
-		r->space_avail =
-		    r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
-
-		/*
-		 * Adjust the space available to handle two cases:
-		 *  - We don't have enough room for another full split-size
-		 *    chunk on the page.
-		 *  - We chose to fill past a page boundary because of a
-		 *    large item.
-		 */
-		if (inuse + r->space_avail > r->page_size) {
-			r->space_avail =
-			    r->page_size > inuse ? (r->page_size - inuse) : 0;
-
-			/* There are no further boundary points. */
-			r->bnd_state = SPLIT_MAX;
-		}
-
-		/*
-		 * Return if the next object fits into this page, else we have
-		 * to split the page.
-		 */
-		if (r->space_avail >= next_len)
-			return (0);
-
-		/* FALLTHROUGH */
-	case SPLIT_MAX:
-		/*
-		 * We're going to have to split and create multiple pages.
-		 *
-		 * Cycle through the saved split-point information, writing the
-		 * split chunks we have tracked.  The underlying fixup function
-		 * sets the space available and other information, and copied
-		 * any unwritten chunk of data to the beginning of the buffer.
-		 */
-		WT_RET(__rec_split_fixup(session, r));
-
-		/* We're done saving split chunks. */
-		r->bnd_state = SPLIT_TRACKING_OFF;
-		break;
-	case SPLIT_TRACKING_OFF:
-		/*
-		 * We can get here if the first key/value pair won't fit.
-		 * Additionally, grow the buffer to contain the current item if
-		 * we haven't already consumed a reasonable portion of a split
-		 * chunk.
-		 */
-		if (inuse < r->split_size / 2)
-			break;
+	last = &r->bnd[r->bnd_next];
+	inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) +
+	    WT_PAGE_HEADER_BYTE_SIZE(btree);
 
-		/*
-		 * The key/value pairs didn't fit into a single page, but either
-		 * we've already noticed that and are now processing the rest of
-		 * the pairs at split size boundaries, or the split size was the
-		 * same as the page size, and we never bothered with split point
-		 * information at all.
-		 */
-		WT_RET(__rec_split_bnd_grow(session, r));
-		last = &r->bnd[r->bnd_next++];
-		next = last + 1;
+	/*
+	 * We can get here if the first key/value pair won't fit.
+	 * Additionally, grow the buffer to contain the current item if we
+	 * haven't already consumed a reasonable portion of a split chunk.
+	 */
+	if (inuse < r->split_size / 2)
+		goto done;
 
-		/*
-		 * Set the key for the next chunk (before writing the block, a
-		 * key range is needed in that code).
-		 */
-		next->recno = r->recno;
-		if (dsk->type == WT_PAGE_ROW_INT ||
-		    dsk->type == WT_PAGE_ROW_LEAF)
-			WT_RET(__rec_split_row_promote(
-			    session, r, &next->key, dsk->type));
+	/* Hitting a page boundary resets the dictionary, in all cases. */
+	__rec_dictionary_reset(r);
 
-		/* Clear the entries (not required, but cleaner). */
-		next->entries = 0;
+	/* Set the number of entries for the just finished chunk. */
+	last->max_bnd_entries = r->entries;
 
-		/* Finalize the header information and write the page. */
-		dsk->recno = last->recno;
-		dsk->u.entries = r->entries;
-		dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
+	/*
+	 * In case of bulk load, write out chunks as we get them.
+	 * In other cases, we keep two chunks in memory at a given time. So, if
+	 * there is a previous chunk, write it out, making space in the buffer
+	 * for the next chunk to be written.
+	 */
+	if (r->is_bulk_load) {
+		dsk->recno = last->max_bnd_recno;
+		dsk->u.entries = last->max_bnd_entries;
+		dsk->mem_size = (uint32_t)inuse;
 		r->disk_image.size = dsk->mem_size;
-		WT_RET(
-		    __rec_split_write(session, r, last, &r->disk_image, false));
-
-		/*
-		 * Set the caller's entry count and buffer information for the
-		 * next chunk.  We only get here if we're not splitting or have
-		 * already split, so it's split-size chunks from here on out.
-		 */
-		r->entries = 0;
+		WT_RET(__rec_split_write(
+		    session, r, last, &r->disk_image, false));
+		/* Fix where free points */
 		r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
-		r->space_avail =
-		    r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
-		break;
-	case SPLIT_TRACKING_RAW:
-		return (__wt_illegal_value(session, NULL));
-	}
+	} else if (r->bnd_next != 0)
+		WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false));
 
-	/*
+	/* Prepare the next boundary */
+	WT_RET(__rec_split_bnd_grow(session, r));
+	r->bnd_next++;
+	next = &r->bnd[r->bnd_next];
+	next->offset = WT_PTRDIFF(r->first_free, dsk);
+	/* Set the key for the next chunk. */
+	next->max_bnd_recno = r->recno;
+	if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF)
+		WT_RET(__rec_split_row_promote(
+		    session, r, &next->max_bnd_key, dsk->type));
+
+	r->entries = 0;
+	/* Set the space available to another split-size chunk. */
+	r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+done:  	/*
 	 * Overflow values can be larger than the maximum page size but still be
 	 * "on-page". If the next key/value pair is larger than space available
 	 * after a split has happened (in other words, larger than the maximum
@@ -2549,6 +2593,66 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
 }
 
 /*
+ * __rec_split_crossing_bnd --
+ * 	Save the details for the minimum split size boundary or call for a
+ * 	split.
+ */
+static inline int
+__rec_split_crossing_bnd(
+    WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
+{
+	WT_BOUNDARY *bnd;
+	WT_BTREE *btree;
+	WT_PAGE_HEADER *dsk;
+	size_t min_bnd_offset;
+
+	WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len));
+
+	/*
+	 * If crossing the minimum split size boundary, store the boundary
+	 * details at the current location in the buffer. If we are crossing the
+	 * split boundary at the same time, possible when the next record is
+	 * large enough, just split at this point.
+	 */
+	if (WT_CROSSING_MIN_BND(r, next_len) &&
+	    !WT_CROSSING_SPLIT_BND(r, next_len)) {
+		btree = S2BT(session);
+		bnd = &r->bnd[r->bnd_next];
+		dsk = r->disk_image.mem;
+		min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) -
+		    bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree);
+		if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree))
+			/*
+			 * This is possible if the first record doesn't fit in
+			 * the minimum split size, we write this record without
+			 * setting up any boundary here. We will get the
+			 * opportunity to setup a boundary before writing out
+			 * the next record.
+			 */
+			return (0);
+
+		WT_ASSERT(session, bnd->min_bnd_offset == 0);
+
+		/*
+		 * Hitting a page boundary resets the dictionary, in all cases.
+		 */
+		__rec_dictionary_reset(r);
+
+		bnd->min_bnd_offset = min_bnd_offset;
+		bnd->min_bnd_entries = r->entries;
+		bnd->min_bnd_recno = r->recno;
+		if (dsk->type == WT_PAGE_ROW_INT ||
+		    dsk->type == WT_PAGE_ROW_LEAF)
+			WT_RET(__rec_split_row_promote(
+			    session, r, &bnd->min_bnd_key, dsk->type));
+		return (0);
+	}
+
+	/* We are crossing a split boundary */
+	return (__rec_split(session, r, next_len));
+}
+
+/*
  * __rec_split_raw_worker --
  *	Handle the raw compression page reconciliation bookkeeping.
  */
@@ -2626,7 +2730,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
 	 */
 	recno = WT_RECNO_OOB;
 	if (dsk->type == WT_PAGE_COL_VAR)
-		recno = last->recno;
+		recno = last->max_bnd_recno;
 
 	entry = max_image_slot = slots = 0;
 	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
@@ -2853,7 +2957,7 @@ no_slots:
 		 */
 		dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
 		dsk_dst = dst->mem;
-		dsk_dst->recno = last->recno;
+		dsk_dst->recno = last->max_bnd_recno;
 		dsk_dst->mem_size =
 		    r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
 		dsk_dst->u.entries = r->raw_entries[result_slots - 1];
@@ -2873,7 +2977,7 @@ no_slots:
 			WT_RET(__wt_strndup(session, dsk,
 			    dsk_dst->mem_size, &last->disk_image));
 			disk_image = last->disk_image;
-			disk_image->recno = last->recno;
+			disk_image->recno = last->max_bnd_recno;
 			disk_image->mem_size = dsk_dst->mem_size;
 			disk_image->u.entries = dsk_dst->u.entries;
 		}
@@ -2903,14 +3007,14 @@ no_slots:
 		 */
 		switch (dsk->type) {
 		case WT_PAGE_COL_INT:
-			next->recno = r->raw_recnos[result_slots];
+			next->max_bnd_recno = r->raw_recnos[result_slots];
 			break;
 		case WT_PAGE_COL_VAR:
-			next->recno = r->raw_recnos[result_slots - 1];
+			next->max_bnd_recno = r->raw_recnos[result_slots - 1];
 			break;
 		case WT_PAGE_ROW_INT:
 		case WT_PAGE_ROW_LEAF:
-			next->recno = WT_RECNO_OOB;
+			next->max_bnd_recno = WT_RECNO_OOB;
 			if (!last_block) {
 				/*
 				 * Confirm there was uncompressed data remaining
@@ -2919,7 +3023,7 @@ no_slots:
 				 */
 				WT_ASSERT(session, len > 0);
 				WT_RET(__rec_split_row_promote_cell(
-				    session, dsk, &next->key));
+				    session, dsk, &next->max_bnd_key));
 			}
 			break;
 		}
@@ -2931,7 +3035,7 @@ no_slots:
 		 */
 		WT_STAT_DATA_INCR(session, compress_raw_fail);
 
-		dsk->recno = last->recno;
+		dsk->recno = last->max_bnd_recno;
 		dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
 		dsk->u.entries = r->entries;
 		r->disk_image.size = dsk->mem_size;
@@ -3008,35 +3112,9 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
 static int
 __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
 {
-	WT_BOUNDARY *bnd;
+	WT_BOUNDARY *bnd_cur, *bnd_prev;
 	WT_PAGE_HEADER *dsk;
-
-	/* Adjust the boundary information based on our split status. */
-	switch (r->bnd_state) {
-	case SPLIT_BOUNDARY:
-	case SPLIT_MAX:
-		/*
-		 * We never split, the reconciled page fit into a maximum page
-		 * size.  Change the first boundary slot to represent the full
-		 * page (the first boundary slot is largely correct, just update
-		 * the number of entries).
-		 */
-		r->bnd_next = 0;
-		break;
-	case SPLIT_TRACKING_OFF:
-		/*
-		 * If we have already split, or aren't tracking boundaries, put
-		 * the remaining data in the next boundary slot.
-		 */
-		WT_RET(__rec_split_bnd_grow(session, r));
-		break;
-	case SPLIT_TRACKING_RAW:
-		/*
-		 * We were configured for raw compression, and either we never
-		 * wrote anything, or there's a remaindered block of data.
-		 */
-		break;
-	}
+	bool grow_bnd;
 
 	/*
 	 * We may arrive here with no entries to write if the page was entirely
@@ -3063,20 +3141,66 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
 			return (EBUSY);
 	}
 
-	/* Set the boundary reference and increment the count. */
-	bnd = &r->bnd[r->bnd_next++];
-	bnd->entries = r->entries;
-
-	/* Finalize the header information. */
 	dsk = r->disk_image.mem;
-	dsk->recno = bnd->recno;
-	dsk->u.entries = r->entries;
+
+	/* Set the number of entries for the just finished chunk. */
+	bnd_cur = &r->bnd[r->bnd_next];
+	bnd_cur->max_bnd_entries = r->entries;
+
+	grow_bnd = true;
+	/*
+	 * We can reach here even with raw_compression when the last split chunk
+	 * is too small to be sent for raw compression.
+	 */
+	if (!r->is_bulk_load && !r->raw_compression) {
+		if (WT_PTRDIFF(r->first_free, dsk) > r->page_size &&
+		    r->bnd_next != 0) {
+			/*
+			 * We hold two boundaries worth of data in the buffer,
+			 * and this data doesn't fit in a single page.  If the
+			 * last chunk is too small, readjust the boundary to a
+			 * pre-computed minimum.
+			 * Write out the penultimate chunk to the disk as a page
+			 */
+			WT_RET(__rec_split_write_prev_and_shift_cur(
+			    session, r, true));
+		} else
+			if (r->bnd_next != 0) {
+				/*
+				 * We have two boundaries, but the data in the
+				 * buffer can fit a single page. Merge the
+				 * boundaries to create a single chunk.
+				 */
+				bnd_prev = bnd_cur - 1;
+				bnd_prev->max_bnd_entries +=
+				    bnd_cur->max_bnd_entries;
+				r->bnd_next--;
+				grow_bnd = false;
+			}
+	}
+
+	/*
+	 * We already have space for an extra boundary if we merged two
+	 * boundaries above, in that case we do not need to grow the boundary
+	 * structure.
+	 */
+	if (grow_bnd)
+		WT_RET(__rec_split_bnd_grow(session, r));
+	bnd_cur = &r->bnd[r->bnd_next];
+	r->bnd_next++;
+
+	/*
+	 * Current boundary now has all the remaining data/last page now.
+	 * Let's write it to the disk
+	 */
+	dsk->recno = bnd_cur->max_bnd_recno;
+	dsk->u.entries = bnd_cur->max_bnd_entries;
 	dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
 	r->disk_image.size = dsk->mem_size;
 
 	/* If this is a checkpoint, we're done, otherwise write the page. */
-	return (__rec_is_checkpoint(session, r, bnd) ?
-	    0 : __rec_split_write(session, r, bnd, &r->disk_image, true));
+	return (__rec_is_checkpoint(session, r, bnd_cur) ?
+	    0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true));
 }
 
 /*
@@ -3110,98 +3234,6 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
 }
 
 /*
- * __rec_split_fixup --
- *	Fix up after crossing the maximum page boundary.
- */
-static int
-__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
-{
-	WT_BOUNDARY *bnd;
-	WT_BTREE *btree;
-	WT_DECL_ITEM(tmp);
-	WT_DECL_RET;
-	WT_PAGE_HEADER *dsk;
-	size_t i, len;
-	uint8_t *dsk_start, *p;
-
-	/*
-	 * When we overflow physical limits of the page, we walk the list of
-	 * split chunks we've created and write those pages out, then update
-	 * the caller's information.
-	 */
-	btree = S2BT(session);
-
-	/*
-	 * The data isn't laid out on a page boundary or nul padded; copy it to
-	 * a clean, aligned, padded buffer before writing it.
-	 *
-	 * Allocate a scratch buffer to hold the new disk image. Copy the disk
-	 * page's header and block-manager space into the scratch buffer, most
-	 * of the header information remains unchanged between the pages.
-	 */
-	WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp));
-	dsk = tmp->mem;
-	memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree));
-
-	/*
-	 * For each split chunk we've created, update the disk image and copy
-	 * it into place.
-	 */
-	dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
-	for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
-		/* Copy the page contents to the temporary buffer. */
-		len = (bnd + 1)->offset - bnd->offset;
-		memcpy(dsk_start,
-		    (uint8_t *)r->disk_image.mem + bnd->offset, len);
-
-		/* Finalize the header information and write the page. */
-		dsk->recno = bnd->recno;
-		dsk->u.entries = bnd->entries;
-		tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
-		dsk->mem_size = WT_STORE_SIZE(tmp->size);
-		WT_ERR(__rec_split_write(session, r, bnd, tmp, false));
-	}
-
-	/*
-	 * There is probably a remnant in the working buffer that didn't get
-	 * written, copy it down to the beginning of the working buffer.
-	 *
-	 * Confirm the remnant is no larger than a split-sized chunk, including
-	 * header. We know that's the maximum sized remnant because we only have
-	 * remnants if split switches from accumulating to a split boundary to
-	 * accumulating to the end of the page (the other path here is when we
-	 * hit a split boundary, there was room for another split chunk in the
-	 * page, and the next item still wouldn't fit, in which case there is no
-	 * remnant). So: we were accumulating to the end of the page and created
-	 * a remnant. We know the remnant cannot be as large as a split-sized
-	 * chunk, including header, because if there was room for that large a
-	 * remnant, we wouldn't have switched from accumulating to a page end.
-	 */
-	p = (uint8_t *)r->disk_image.mem + bnd->offset;
-	len = WT_PTRDIFF(r->first_free, p);
-	if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
-		WT_PANIC_ERR(session, EINVAL,
-		    "Reconciliation remnant too large for the split buffer");
-	dsk = r->disk_image.mem;
-	dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
-	(void)memmove(dsk_start, p, len);
-
-	/*
-	 * Fix up our caller's information, including updating the starting
-	 * record number.
-	 */
-	r->entries -= r->total_entries;
-	r->first_free = dsk_start + len;
-	WT_ASSERT(session,
-	    r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len));
-	r->space_avail =
-	    r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
-
-err:	__wt_scr_free(session, &tmp);
-	return (ret);
-}
-
-/*
  * __rec_split_write --
  *	Write a disk block out for the split helper functions.
  */
@@ -3238,8 +3270,6 @@ __rec_split_write(WT_SESSION_IMPL *session,
 			F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
 	}
 
-	bnd->entries = r->entries;
-
 	/* Initialize the address (set the page type for the parent). */
 	switch (dsk->type) {
 	case WT_PAGE_COL_FIX:
@@ -3285,7 +3315,8 @@ __rec_split_write(WT_SESSION_IMPL *session,
 		switch (page->type) {
 		case WT_PAGE_COL_FIX:
 		case WT_PAGE_COL_VAR:
-			if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno)
+			if (WT_INSERT_RECNO(supd->ins) >=
+			    (bnd + 1)->max_bnd_recno)
 				goto supd_check_complete;
 			break;
 		case WT_PAGE_ROW_LEAF:
@@ -3296,8 +3327,8 @@ __rec_split_write(WT_SESSION_IMPL *session,
 				key->data = WT_INSERT_KEY(supd->ins);
 				key->size = WT_INSERT_KEY_SIZE(supd->ins);
 			}
-			WT_ERR(__wt_compare(session,
-			    btree->collator, key, &(bnd + 1)->key, &cmp));
+			WT_ERR(__wt_compare(session, btree->collator,
+			    key, &(bnd + 1)->max_bnd_key, &cmp));
 			if (cmp >= 0)
 				goto supd_check_complete;
 			break;
@@ -3387,14 +3418,14 @@ supd_check_complete:
 
 #ifdef HAVE_VERBOSE
 	/* Output a verbose message if we create a page without many entries */
-	if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6)
+	if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) &&
+	    bnd->max_bnd_entries < 6)
 		__wt_verbose(session, WT_VERB_SPLIT,
 		    "Reconciliation creating a page with %" PRIu32
 		    " entries, memory footprint %" WT_SIZET_FMT
-		    ", page count %" PRIu32 ", %s, split state: %d",
-		    r->entries, r->page->memory_footprint, r->bnd_next,
-		    F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint",
-		    r->bnd_state);
+		    ", page count %" PRIu32 ", %s", bnd->max_bnd_entries,
+		    r->page->memory_footprint, r->bnd_next,
+		    F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint");
 #endif
 
 	WT_ERR(__wt_bt_write(session, buf, addr, &addr_size,
@@ -3680,11 +3711,12 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
 	    cursor->value.data, cursor->value.size, (uint64_t)0));
 
 	/* Boundary: split or write the page. */
-	if (key->len + val->len > r->space_avail) {
-		if (r->raw_compression)
-			WT_RET(
-			    __rec_split_raw(session, r, key->len + val->len));
-		else {
+	if (r->raw_compression) {
+		if (key->len + val->len > r->space_avail)
+			WT_RET(__rec_split_raw(
+			    session, r, key->len + val->len));
+	} else
+		if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
 			/*
 			 * Turn off prefix compression until a full key written
 			 * to the new page, and (unless already working with an
@@ -3696,10 +3728,9 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
 					WT_RET(__rec_cell_build_leaf_key(
 					    session, r, NULL, 0, &ovfl_key));
 			}
-
-			WT_RET(__rec_split(session, r, key->len + val->len));
+			WT_RET(__rec_split_crossing_bnd(
+			    session, r, key->len + val->len));
 		}
-	}
 
 	/* Copy the key/value pair onto the page. */
 	__rec_copy_incr(session, r, key);
@@ -3740,6 +3771,10 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
 			 * split.
 			 *
 			 * Boundary: split or write the page.
+			 *
+			 * No need to have a minimum split size boundary, all
+			 * pages are filled 100% except the last, allowing it to
+			 * grow in the future.
 			 */
 			__rec_incr(session, r, cbulk->entry,
 			    __bitstr_size(
@@ -3844,10 +3879,12 @@ __wt_bulk_insert_var(
 		    r, cbulk->last.data, cbulk->last.size, cbulk->rle));
 
 	/* Boundary: split or write the page. */
-	if (val->len > r->space_avail)
-		WT_RET(r->raw_compression ?
-		    __rec_split_raw(session, r, val->len) :
-		    __rec_split(session, r, val->len));
+	if (r->raw_compression) {
+		if (val->len > r->space_avail)
+			WT_RET(__rec_split_raw(session, r, val->len));
+	} else
+		if (WT_CROSSING_SPLIT_BND(r, val->len))
+			WT_RET(__rec_split_crossing_bnd(session, r, val->len));
 
 	/* Copy the value onto the page. */
 	if (btree->dictionary)
@@ -3983,10 +4020,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
 		WT_CHILD_RELEASE_ERR(session, hazard, ref);
 
 		/* Boundary: split or write the page. */
-		if (val->len > r->space_avail)
-			WT_ERR(r->raw_compression ?
-			    __rec_split_raw(session, r, val->len) :
-			    __rec_split(session, r, val->len));
+		if (r->raw_compression) {
+			if (val->len > r->space_avail)
+				WT_ERR(__rec_split_raw(session, r, val->len));
+		} else
+			if (WT_CHECK_CROSSING_BND(r, val->len))
+				WT_ERR(__rec_split_crossing_bnd(
+				    session, r, val->len));
 
 		/* Copy the value onto the page. */
 		__rec_copy_incr(session, r, val);
@@ -4028,10 +4068,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 		    addr->addr, addr->size, __rec_vtype(addr), r->recno);
 
 		/* Boundary: split or write the page. */
-		if (val->len > r->space_avail)
-			WT_RET(r->raw_compression ?
-			    __rec_split_raw(session, r, val->len) :
-			    __rec_split(session, r, val->len));
+		if (r->raw_compression) {
+			if (val->len > r->space_avail)
+				WT_RET(__rec_split_raw(session, r, val->len));
+		} else
+			if (WT_CHECK_CROSSING_BND(r, val->len))
+				WT_RET(__rec_split_crossing_bnd(
+				    session, r, val->len));
 
 		/* Copy the value onto the page. */
 		__rec_copy_incr(session, r, val);
@@ -4139,6 +4182,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
 			 * split.
 			 *
 			 * Boundary: split or write the page.
+			 *
+			 * No need to have a minimum split size boundary, all
+			 * pages are filled 100% except the last, allowing it to
+			 * grow in the future.
 			 */
 			__rec_incr(session, r, entry,
 			    __bitstr_size((size_t)entry * btree->bitcnt));
@@ -4295,10 +4342,13 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
 		    session, r, value->data, value->size, rle));
 
 	/* Boundary: split or write the page. */
-	if (val->len > r->space_avail)
-		WT_RET(r->raw_compression ?
-		    __rec_split_raw(session, r, val->len) :
-		    __rec_split(session, r, val->len));
+	if (r->raw_compression) {
+		if (val->len > r->space_avail)
+			WT_RET(__rec_split_raw(session, r, val->len));
+	} else
+		if (WT_CHECK_CROSSING_BND(r, val->len))
+			WT_RET(__rec_split_crossing_bnd(
+			    session, r, val->len));
 
 	/* Copy the value onto the page. */
 	if (!deleted && !overflow_type && btree->dictionary)
@@ -4961,11 +5011,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 		r->cell_zero = false;
 
 		/* Boundary: split or write the page. */
-		if (key->len + val->len > r->space_avail) {
-			if (r->raw_compression)
+		if (r->raw_compression) {
+			if (key->len + val->len > r->space_avail)
 				WT_ERR(__rec_split_raw(
 				    session, r, key->len + val->len));
-			else {
+		} else
+			if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
 				/*
 				 * In one path above, we copied address blocks
 				 * from the page rather than building the actual
@@ -4977,10 +5028,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 					    WT_IKEY_DATA(ikey), ikey->size));
 					key_onpage_ovfl = false;
 				}
-				WT_ERR(__rec_split(
+
+				WT_ERR(__rec_split_crossing_bnd(
 				    session, r, key->len + val->len));
 			}
-		}
 
 		/* Copy the key and value onto the page. */
 		__rec_copy_incr(session, r, key);
@@ -5030,10 +5081,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 		    addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
 
 		/* Boundary: split or write the page. */
-		if (key->len + val->len > r->space_avail)
-			WT_RET(r->raw_compression ?
-			    __rec_split_raw(session, r, key->len + val->len) :
-			    __rec_split(session, r, key->len + val->len));
+		if (r->raw_compression) {
+			if (key->len + val->len > r->space_avail)
+				WT_RET(__rec_split_raw(
+				    session, r, key->len + val->len));
+		} else
+			if (WT_CHECK_CROSSING_BND(r, key->len + val->len))
+				WT_RET(__rec_split_crossing_bnd(
+				    session, r, key->len + val->len));
 
 		/* Copy the key and value onto the page. */
 		__rec_copy_incr(session, r, key);
@@ -5362,16 +5417,17 @@ build:
 		}
 
 		/* Boundary: split or write the page. */
-		if (key->len + val->len > r->space_avail) {
-			if (r->raw_compression)
+		if (r->raw_compression) {
+			if (key->len + val->len > r->space_avail)
 				WT_ERR(__rec_split_raw(
 				    session, r, key->len + val->len));
-			else {
+		} else
+			if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
 				/*
-				 * In one path above, we copied address blocks
-				 * from the page rather than building the actual
-				 * key.  In that case, we have to build the key
-				 * now because we are about to promote it.
+				 * If we copied address blocks from the page
+				 * rather than building the actual key, we have
+				 * to build the key now because we are about to
+				 * promote it.
 				 */
 				if (key_onpage_ovfl) {
 					WT_ERR(__wt_dsk_cell_data_ref(session,
@@ -5390,14 +5446,13 @@ build:
 					if (!ovfl_key)
 						WT_ERR(
 						    __rec_cell_build_leaf_key(
-						    session,
-						    r, NULL, 0, &ovfl_key));
+						    session, r, NULL, 0,
+						    &ovfl_key));
 				}
 
-				WT_ERR(__rec_split(
+				WT_ERR(__rec_split_crossing_bnd(
 				    session, r, key->len + val->len));
 			}
-		}
 
 		/* Copy the key/value pair onto the page. */
 		__rec_copy_incr(session, r, key);
@@ -5460,11 +5515,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
 		    WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
 
 		/* Boundary: split or write the page. */
-		if (key->len + val->len > r->space_avail) {
-			if (r->raw_compression)
+		if (r->raw_compression) {
+			if (key->len + val->len > r->space_avail)
 				WT_RET(__rec_split_raw(
 				    session, r, key->len + val->len));
-			else {
+		} else
+			if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
 				/*
 				 * Turn off prefix compression until a full key
 				 * written to the new page, and (unless already
@@ -5476,14 +5532,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
 					if (!ovfl_key)
 						WT_RET(
 						    __rec_cell_build_leaf_key(
-						    session,
-						    r, NULL, 0, &ovfl_key));
+						    session, r, NULL, 0,
+						    &ovfl_key));
 				}
 
-				WT_RET(__rec_split(
+				WT_RET(__rec_split_crossing_bnd(
 				    session, r, key->len + val->len));
 			}
-		}
 
 		/* Copy the key/value pair onto the page. */
 		__rec_copy_incr(session, r, key);
@@ -5595,13 +5650,14 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r)
 			__wt_verbose(session, WT_VERB_SPLIT,
 			    "starting key %s",
 			    __wt_buf_set_printable(
-			    session, bnd->key.data, bnd->key.size, tkey));
+			    session, bnd->max_bnd_key.data,
+			    bnd->max_bnd_key.size, tkey));
 			break;
 		case WT_PAGE_COL_FIX:
 		case WT_PAGE_COL_INT:
 		case WT_PAGE_COL_VAR:
 			__wt_verbose(session, WT_VERB_SPLIT,
-			    "starting recno %" PRIu64, bnd->recno);
+			    "starting recno %" PRIu64, bnd->max_bnd_recno);
 			break;
 		WT_ILLEGAL_VALUE_ERR(session);
 		}
@@ -5863,10 +5919,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 	/* We never set the first page's key, grab it from the original page. */
 	ref = r->ref;
 	if (__wt_ref_is_root(ref))
-		WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
+		WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1));
 	else {
 		__wt_ref_key(ref->home, ref, &p, &size);
-		WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size));
+		WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size));
 	}
 
 	/* Allocate, then initialize the array of replacement blocks. */
@@ -5874,8 +5930,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 
 	for (multi = mod->mod_multi,
 	    bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
-		WT_RET(__wt_row_ikey_alloc(session, 0,
-		    bnd->key.data, bnd->key.size, &multi->key.ikey));
+		WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data,
+		    bnd->max_bnd_key.size, &multi->key.ikey));
 
 		/*
 		 * Copy any disk image.  Don't take saved updates without a
@@ -5922,7 +5978,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
 
 	for (multi = mod->mod_multi,
 	    bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
-		multi->key.recno = bnd->recno;
+		multi->key.recno = bnd->max_bnd_recno;
 
 		/*
 		 * Copy any disk image.  Don't take saved updates without a
diff --git a/test/format/config.h b/test/format/config.h
index e3e1e73a786..b5feb7a5321 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -284,7 +284,7 @@ static CONFIG c[] = {
 
 	{ "split_pct",
 	  "page split size as a percentage of the maximum page size",
-	  0x0, 40, 85, 85, &g.c_split_pct, NULL },
+	  0x0, 50, 100, 100, &g.c_split_pct, NULL },
 
 	{ "statistics",
 	  "maintain statistics",				/* 20% */
author	Sulabh Mahajan <sulabh.mahajan@mongodb.com>	2017-03-29 15:38:35 +1100
committer	Michael Cahill <michael.cahill@mongodb.com>	2017-03-29 15:38:35 +1100
commit	1c41c7735b3529521b7bd34180f80584caee7f59 (patch)
tree	7ff9a0eb97bd4667fd2286dae77be06a75426d17
parent	d5a10d2e97853e7db6bb4c2635b97febf13607c5 (diff)
download	mongo-1c41c7735b3529521b7bd34180f80584caee7f59.tar.gz