author    Michael Cahill <michael.cahill@mongodb.com>  2017-03-29 15:55:04 +1100
committer Michael Cahill <michael.cahill@mongodb.com>  2017-03-29 15:55:04 +1100
commit    ebff498af45a3e64fab05cd6360b117c010634b9 (patch)
tree      e75de7ac2cd26cb56d19b06400bf0ac197bf68f0
parent    1c41c7735b3529521b7bd34180f80584caee7f59 (diff)
download  mongo-ebff498af45a3e64fab05cd6360b117c010634b9.tar.gz
Revert "WT-2439 Improve page layout: keep pages more than half full (#3277)"
This reverts commit 1c41c7735b3529521b7bd34180f80584caee7f59.
-rw-r--r--  dist/api_data.py           |   4
-rw-r--r--  src/btree/bt_handle.c      |   9
-rw-r--r--  src/config/config_def.c    |  16
-rw-r--r--  src/include/btree.h        |   6
-rw-r--r--  src/include/wiredtiger.in  |   4
-rw-r--r--  src/reconcile/rec_write.c  | 954
-rw-r--r--  test/format/config.h       |   2
7 files changed, 463 insertions, 532 deletions
diff --git a/dist/api_data.py b/dist/api_data.py
index 22600dd5e29..1d669fa7fe0 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -295,12 +295,12 @@ file_config = format_meta + file_runtime_config + [
Config('split_deepen_per_child', '0', r'''
entries allocated per child when deepening the tree''',
type='int', undoc=True),
- Config('split_pct', '90', r'''
+ Config('split_pct', '75', r'''
the Btree page split size as a percentage of the maximum Btree
page size, that is, when a Btree page is split, it will be
split into smaller pages, where each page is the specified
percentage of the maximum Btree page size''',
- min='50', max='100'),
+ min='25', max='100'),
]
# File metadata, including both configurable and non-configurable (internal)
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 98c246fb897..f2bffee06da 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -780,16 +780,9 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
* Get the split percentage (reconciliation splits pages into smaller
* than the maximum page size chunks so we don't split every time a
* new entry is added). Determine how large newly split pages will be.
- * Set to the minimum, if the read value is less than that.
*/
WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
- if (cval.val < WT_BTREE_MIN_SPLIT_PCT) {
- btree->split_pct = WT_BTREE_MIN_SPLIT_PCT;
- WT_RET(__wt_msg(session,
- "Re-setting split_pct for %s to the minimum allowed of "
- "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT));
- } else
- btree->split_pct = (int)cval.val;
+ btree->split_pct = (int)cval.val;
intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
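
For reference, a minimal standalone sketch of the split-size arithmetic used by the calls above (the real routine is __wt_split_page_size, whose restored body appears later in this diff in rec_write.c). The helper name, the nearest-multiple rounding, and the example numbers are illustrative assumptions, not part of the change:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch only: split size is split_pct percent of the maximum page size,
 * rounded to the nearest allocation unit. */
static uint32_t
split_page_size_sketch(int split_pct, uint32_t maxpagesize, uint32_t allocsize)
{
	uintmax_t a = maxpagesize;	/* Widen to avoid overflow. */
	uint32_t split_size;

	/* Percentage of the maximum page, rounded to the allocation unit. */
	split_size = (uint32_t)
	    (((a * (unsigned)split_pct / 100) + allocsize / 2) /
	    allocsize * allocsize);

	/* If rounding degenerates to 0 or a full page, keep the raw percentage. */
	if (split_size == 0 || split_size == maxpagesize)
		split_size = (uint32_t)(a * (unsigned)split_pct / 100);
	return (split_size);
}

int
main(void)
{
	/* The restored default of 75% on a 32KB page with 4KB allocation: 24KB. */
	printf("%" PRIu32 "\n", split_page_size_sketch(75, 32 * 1024, 4096));
	return (0);
}
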
diff --git a/src/config/config_def.c b/src/config/config_def.c
index f152fbacad4..b11a8d63fdb 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -294,7 +294,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
{ "source", "string", NULL, NULL, NULL, 0 },
{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
- { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
+ { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
{ "type", "string", NULL, NULL, NULL, 0 },
{ "value_format", "format",
__wt_struct_confchk, NULL,
@@ -466,7 +466,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = {
{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
- { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
+ { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
{ "value_format", "format",
__wt_struct_confchk, NULL,
NULL, 0 },
@@ -530,7 +530,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
- { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
+ { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
{ "value_format", "format",
__wt_struct_confchk, NULL,
NULL, 0 },
@@ -614,7 +614,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
{ "prefix_compression_min", "int", NULL, "min=0", NULL, 0 },
{ "split_deepen_min_child", "int", NULL, NULL, NULL, 0 },
{ "split_deepen_per_child", "int", NULL, NULL, NULL, 0 },
- { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 },
+ { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 },
{ "value_format", "format",
__wt_struct_confchk, NULL,
NULL, 0 },
@@ -1119,7 +1119,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB,"
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,source=,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,type=file,value_format=u",
+ "split_deepen_per_child=0,split_pct=75,type=file,value_format=u",
confchk_WT_SESSION_create, 42
},
{ "WT_SESSION.drop",
@@ -1213,7 +1213,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"leaf_value_max=0,log=(enabled=true),memory_page_max=5MB,"
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,value_format=u",
+ "split_deepen_per_child=0,split_pct=75,value_format=u",
confchk_file_config, 35
},
{ "file.meta",
@@ -1228,7 +1228,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0,"
"log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0,"
"os_cache_max=0,prefix_compression=false,prefix_compression_min=4"
- ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
+ ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75,"
"value_format=u,version=(major=0,minor=0)",
confchk_file_meta, 39
},
@@ -1253,7 +1253,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"merge_min=0),memory_page_max=5MB,old_chunks=,"
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,value_format=u",
+ "split_deepen_per_child=0,split_pct=75,value_format=u",
confchk_lsm_meta, 39
},
{ "table.meta",
diff --git a/src/include/btree.h b/src/include/btree.h
index 28fe1b94b23..88312f408cc 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -58,12 +58,6 @@
#define WT_BTREE_DELETE_THRESHOLD 1000
/*
- * Minimum size of the chunks (in percentage of the page size) a page gets split
- * into during reconciliation.
- */
-#define WT_BTREE_MIN_SPLIT_PCT 50
-
-/*
* WT_BTREE --
* A btree handle.
*/
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 707159ef6ae..558e93d3de0 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1242,8 +1242,8 @@ struct __wt_session {
* @config{split_pct, the Btree page split size as a percentage of the
* maximum Btree page size\, that is\, when a Btree page is split\, it
* will be split into smaller pages\, where each page is the specified
- * percentage of the maximum Btree page size., an integer between 50 and
- * 100; default \c 90.}
+ * percentage of the maximum Btree page size., an integer between 25 and
+ * 100; default \c 75.}
* @config{type, set the type of data source used to store a column
* group\, index or simple table. By default\, a \c "file:" URI is
* derived from the object name. The \c type configuration can be used
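
To show the documented knob in context, here is a minimal usage sketch assuming the public WiredTiger C API and a pre-existing "WT_HOME" directory; the table name and the other configuration values are illustrative only:

#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;

	/* Open (or create) a database and a session. */
	if (wiredtiger_open("WT_HOME", NULL, "create", &conn) != 0)
		return (1);
	if (conn->open_session(conn, NULL, NULL, &session) != 0)
		return (1);

	/* Create a table whose pages split at 75% of leaf_page_max. */
	if (session->create(session, "table:example",
	    "key_format=S,value_format=S,leaf_page_max=32KB,split_pct=75") != 0)
		return (1);

	return (conn->close(conn, NULL));
}
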
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index e18d44f96ff..23f654caa70 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -26,11 +26,6 @@ typedef struct {
uint32_t flags; /* Caller's configuration */
WT_ITEM disk_image; /* Temporary disk-image buffer */
- /*
- * Temporary buffer used to write out a disk image when managing two
- * chunks worth of data in memory
- */
- WT_ITEM *interim_buf;
/*
* Track start/stop write generation to decide if all changes to the
@@ -132,7 +127,6 @@ typedef struct {
* repeatedly split a packed page.
*/
uint32_t split_size; /* Split page size */
- uint32_t min_split_size; /* Minimum split page size */
/*
* The problem with splits is we've done a lot of work by the time we
@@ -157,6 +151,16 @@ typedef struct {
*/
size_t offset; /* Split's first byte */
+ /*
+ * The recno and entries fields are the starting record number
+ * of the split chunk (for column-store splits), and the number
+ * of entries in the split chunk. These fields are used both
+ * to write the split chunk, and to create a new internal page
+ * to reference the split pages.
+ */
+ uint64_t recno; /* Split's starting record */
+ uint32_t entries; /* Split's entries */
+
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
uint32_t checksum; /* Split's checksum */
@@ -178,36 +182,11 @@ typedef struct {
size_t supd_allocated;
/*
- * While reconciling pages, at any given time, we maintain two
- * split chunks in the memory to be written out as pages. As we
- * get to the last two chunks, if the last one turns out to be
- * smaller than the minimum split size, we go back into the
- * penultimate chunk and split at this minimum split size
- * boundary. This moves some data from the penultimate chunk to
- * the last chunk, hence increasing the size of the last page
- * written without decreasing the penultimate page size beyond
- * the minimum split size. For this reason, we maintain both a
- * maximum split percentage boundary and a minimum split
- * percentage boundary.
- *
- * The recno and entries fields are the starting record number
- * of the split chunk (for column-store splits), and the number
- * of entries in the split chunk. These fields are used both to
- * write the split chunk, and to create a new internal page to
- * reference the split pages.
- *
* The key for a row-store page; no column-store key is needed
* because the page's recno, stored in the recno field, is the
* column-store key.
*/
- uint32_t max_bnd_entries;
- uint64_t max_bnd_recno;
- WT_ITEM max_bnd_key;
-
- size_t min_bnd_offset;
- uint32_t min_bnd_entries;
- uint64_t min_bnd_recno;
- WT_ITEM min_bnd_key;
+ WT_ITEM key; /* Promoted row-store key */
} *bnd; /* Saved boundaries */
uint32_t bnd_next; /* Next boundary slot */
uint32_t bnd_next_max; /* Maximum boundary slots used */
@@ -215,6 +194,28 @@ typedef struct {
size_t bnd_allocated; /* Bytes allocated */
/*
+ * We track the total number of page entries copied into split chunks
+ * so we can easily figure out how many entries in the current split
+ * chunk.
+ */
+ uint32_t total_entries; /* Total entries in splits */
+
+ /*
+ * And there's state information as to where in this process we are:
+ * (1) tracking split boundaries because we can still fit more split
+ * chunks into the maximum page size, (2) tracking the maximum page
+ * size boundary because we can't fit any more split chunks into the
+ * maximum page size, (3) not performing boundary checks because it's
+ * either not useful with the current page size configuration, or
+ * because we've already been forced to split.
+ */
+ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
+ SPLIT_MAX=1, /* Next: the maximum page boundary */
+ SPLIT_TRACKING_OFF=2, /* No boundary checks */
+ SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
+ bnd_state;
+
+ /*
* We track current information about the current record number, the
* number of entries copied into the temporary buffer, where we are
* in the temporary buffer, and how much memory remains. Those items
@@ -292,14 +293,6 @@ typedef struct {
uint32_t tested_ref_state; /* Debugging information */
} WT_RECONCILE;
-#define WT_CROSSING_MIN_BND(r, next_len) \
- ((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 && \
- ((r)->space_avail - (next_len)) < \
- ((r)->split_size - (r)->min_split_size))
-#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail)
-#define WT_CHECK_CROSSING_BND(r, next_len) \
- (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len))
-
static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool);
static void __rec_cell_build_addr(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
@@ -321,7 +314,6 @@ static int __rec_col_var(WT_SESSION_IMPL *,
static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t);
static int __rec_destroy_session(WT_SESSION_IMPL *);
-static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t);
static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_row_leaf(WT_SESSION_IMPL *,
@@ -331,6 +323,7 @@ static int __rec_row_leaf_insert(
static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
+static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_row_promote(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
@@ -975,7 +968,6 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
*(WT_RECONCILE **)reconcilep = NULL;
__wt_buf_free(session, &r->disk_image);
- __wt_scr_free(session, &r->interim_buf);
__wt_free(session, r->raw_entries);
__wt_free(session, r->raw_offsets);
@@ -1040,8 +1032,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->disk_image);
__wt_free(session, bnd->supd);
- __wt_buf_free(session, &bnd->max_bnd_key);
- __wt_buf_free(session, &bnd->min_bnd_key);
+ __wt_buf_free(session, &bnd->key);
}
__wt_free(session, r->bnd);
r->bnd_next = 0;
@@ -1936,8 +1927,8 @@ static void
__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
{
bnd->offset = 0;
- bnd->max_bnd_recno = WT_RECNO_OOB;
- bnd->max_bnd_entries = 0;
+ bnd->recno = WT_RECNO_OOB;
+ bnd->entries = 0;
__wt_free(session, bnd->addr.addr);
WT_CLEAR(bnd->addr);
@@ -1952,10 +1943,6 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
bnd->already_compressed = false;
- bnd->min_bnd_offset = 0;
- bnd->min_bnd_entries = 0;
- bnd->min_bnd_recno = WT_RECNO_OOB;
-
/*
* Don't touch the key, we re-use that memory in each new
* reconciliation.
@@ -1987,64 +1974,40 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * __rec_split_page_size_from_pct --
- * Given a split percentage, calculate split page size in bytes.
+ * __wt_split_page_size --
+ * Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
*/
-static uint32_t
-__rec_split_page_size_from_pct(
- int split_pct, uint32_t maxpagesize, uint32_t allocsize) {
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
uintmax_t a;
uint32_t split_size;
/*
* Ideally, the split page size is some percentage of the maximum page
- * size rounded to an allocation unit (round to an allocation unit so we
- * don't waste space when we write).
+ * size rounded to an allocation unit (round to an allocation unit so
+ * we don't waste space when we write).
*/
a = maxpagesize; /* Don't overflow. */
split_size = (uint32_t)WT_ALIGN_NEAREST(
- (a * (u_int)split_pct) / 100, allocsize);
+ (a * (u_int)btree->split_pct) / 100, btree->allocsize);
/*
- * Respect the configured split percentage if the calculated split size
- * is either zero or a full page. The user has either configured an
- * allocation size that matches the page size, or a split percentage
- * that is close to zero or one hundred. Rounding is going to provide a
- * worse outcome than having a split point that doesn't fall on an
- * allocation size boundary in those cases.
+ * Respect the configured split percentage if the calculated split
+ * size is either zero or a full page. The user has either configured
+ * an allocation size that matches the page size, or a split
+ * percentage that is close to zero or one hundred. Rounding is going
+ * to provide a worse outcome than having a split point that doesn't
+ * fall on an allocation size boundary in those cases.
*/
if (split_size == 0 || split_size == maxpagesize)
- split_size = (uint32_t)((a * (u_int)split_pct) / 100);
+ split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
return (split_size);
}
/*
- * __wt_split_page_size --
- * Split page size calculation: we don't want to repeatedly split every
- * time a new entry is added, so we split to a smaller-than-maximum page size.
- */
-uint32_t
-__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
-{
- return (__rec_split_page_size_from_pct(
- btree->split_pct, maxpagesize, btree->allocsize));
-}
-
-/*
- * __rec_min_split_page_size --
- * Minimum split size boundary calculation: To track a boundary at the
- * minimum split size that we could have split at instead of splitting at
- * the split page size.
- */
-static uint32_t
-__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
-{
- return (__rec_split_page_size_from_pct(
- WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize));
-}
-
-/*
* __rec_split_init --
* Initialization for the reconciliation split functions.
*/
@@ -2055,7 +2018,7 @@ __rec_split_init(WT_SESSION_IMPL *session,
WT_BM *bm;
WT_BTREE *btree;
WT_PAGE_HEADER *dsk;
- size_t corrected_page_size, disk_img_buf_size;
+ size_t corrected_page_size;
btree = S2BT(session);
bm = btree->bm;
@@ -2090,6 +2053,33 @@ __rec_split_init(WT_SESSION_IMPL *session,
r->max_raw_page_size = r->page_size =
(uint32_t)WT_MIN(r->page_size * 10,
WT_MAX(r->page_size, btree->maxmempage / 2));
+
+ /*
+ * Ensure the disk image buffer is large enough for the max object, as
+ * corrected by the underlying block manager.
+ */
+ corrected_page_size = r->page_size;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size));
+
+ /*
+ * Clear the disk page header to ensure all of it is initialized, even
+ * the unused fields.
+ *
+ * In the case of fixed-length column-store, clear the entire buffer:
+ * fixed-length column-store sets bits in bytes, where the bytes are
+ * assumed to initially be 0.
+ */
+ memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
+ corrected_page_size : WT_PAGE_HEADER_SIZE);
+
+ /*
+ * Set the page type (the type doesn't change, and setting it later
+ * would require additional code in a few different places).
+ */
+ dsk = r->disk_image.mem;
+ dsk->type = page->type;
+
/*
* If we have to split, we want to choose a smaller page size for the
* split pages, because otherwise we could end up splitting one large
@@ -2109,28 +2099,22 @@ __rec_split_init(WT_SESSION_IMPL *session,
* creating overflow items and compacted data, for example, as those
* items have already been written to disk). So, the loop calls the
* helper functions when approaching a split boundary, and we save the
- * information at that point. We also save the boundary information at
- * the minimum split size. We maintain two chunks (each boundary
- * represents a chunk that gets written as a page) in the memory,
- * writing out the older one to the disk as a page when we need to make
- * space for a new chunk. On reaching the last chunk, if it turns out to
- * be smaller than the minimum split size, we go back into the
- * penultimate chunk and split at this minimum split size boundary. This
- * moves some data from the penultimate chunk to the last chunk, hence
- * increasing the size of the last page written without decreasing the
- * penultimate page size beyond the minimum split size.
+ * information at that point. That allows us to go back and split the
+ * page at the boundary points if we eventually overflow the maximum
+ * page size.
*
* Finally, all this doesn't matter for fixed-size column-store pages,
* raw compression, and salvage. Fixed-size column store pages can
* split under (very) rare circumstances, but they're allocated at a
* fixed page size, never anything smaller. In raw compression, the
- * underlying compression routine decides when we split, so it's not our
- * problem. In salvage, as noted above, we can't split at all.
+ * underlying compression routine decides when we split, so it's not
+ * our problem. In salvage, as noted above, we can't split at all.
*/
if (r->raw_compression || r->salvage != NULL) {
r->split_size = 0;
r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
- } else if (page->type == WT_PAGE_COL_FIX) {
+ }
+ else if (page->type == WT_PAGE_COL_FIX) {
r->split_size = r->page_size;
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
@@ -2138,53 +2122,32 @@ __rec_split_init(WT_SESSION_IMPL *session,
r->split_size = __wt_split_page_size(btree, r->page_size);
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
- r->min_split_size =
- __rec_min_split_page_size(btree, r->page_size);
}
-
- /*
- * Ensure the disk image buffer is large enough for the max object, as
- * corrected by the underlying block manager.
- *
- * The buffer that we build disk image in, needs to hold two chunks
- * worth of data. Since we want to support split_size more than the page
- * size (to allow for adjustments based on the compression), this buffer
- * should be greater of twice of split_size and page_size.
- */
- corrected_page_size = r->page_size;
- disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size);
- WT_RET(bm->write_size(bm, session, &corrected_page_size));
- WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size));
-
- /*
- * Clear the disk page header to ensure all of it is initialized, even
- * the unused fields.
- *
- * In the case of fixed-length column-store, clear the entire buffer:
- * fixed-length column-store sets bits in bytes, where the bytes are
- * assumed to initially be 0.
- */
- memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
- disk_img_buf_size : WT_PAGE_HEADER_SIZE);
-
- /*
- * Set the page type (the type doesn't change, and setting it later
- * would require additional code in a few different places).
- */
- dsk = r->disk_image.mem;
- dsk->type = page->type;
-
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
/* Initialize the first boundary. */
r->bnd_next = 0;
WT_RET(__rec_split_bnd_grow(session, r));
__rec_split_bnd_init(session, &r->bnd[0]);
- r->bnd[0].max_bnd_recno = recno;
+ r->bnd[0].recno = recno;
r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
- /* Initialize the entry counter. */
- r->entries = 0;
+ /*
+ * If the maximum page size is the same as the split page size, either
+ * because of the object type or application configuration, there isn't
+ * any need to maintain split boundaries within a larger page.
+ *
+ * No configuration for salvage here, because salvage can't split.
+ */
+ if (r->raw_compression)
+ r->bnd_state = SPLIT_TRACKING_RAW;
+ else if (max == r->split_size)
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ else
+ r->bnd_state = SPLIT_BOUNDARY;
+
+ /* Initialize the entry counters. */
+ r->entries = r->total_entries = 0;
/* Initialize the starting record number. */
r->recno = recno;
@@ -2387,112 +2350,19 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
{
WT_BM *bm;
WT_BTREE *btree;
- size_t corrected_page_size, inuse, len;
+ size_t corrected_page_size, len;
btree = S2BT(session);
bm = btree->bm;
len = WT_PTRDIFF(r->first_free, r->disk_image.mem);
- inuse = (len - r->bnd[r->bnd_next].offset) +
- WT_PAGE_HEADER_BYTE_SIZE(btree);
- corrected_page_size = inuse + add_len;
-
+ corrected_page_size = len + add_len;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- /* Need to account for buffer carrying two chunks worth of data */
- WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size));
-
+ WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size));
r->first_free = (uint8_t *)r->disk_image.mem + len;
- WT_ASSERT(session, corrected_page_size >= inuse);
- r->space_avail = corrected_page_size - inuse;
+ WT_ASSERT(session, corrected_page_size >= len);
+ r->space_avail = corrected_page_size - len;
WT_ASSERT(session, r->space_avail >= add_len);
-
- return (0);
-}
-
-/*
- * __rec_split_write_prev_and_shift_cur --
- * Write the previous split chunk to the disk as a page. Shift the contents
- * of the current chunk to the start of the buffer, making space for a new
- * chunk to be written.
- * If the caller asks for a chunk resizing, the boundary between the two
- * chunks is readjusted to the minimum split size boundary details stored
- * in the previous chunk, letting the current chunk grow at the cost of the
- * previous chunk.
- */
-static int
-__rec_split_write_prev_and_shift_cur(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks)
-{
- WT_BM *bm;
- WT_BOUNDARY *bnd_cur, *bnd_prev;
- WT_BTREE *btree;
- WT_PAGE_HEADER *dsk, *dsk_tmp;
- size_t cur_len, len;
- uint8_t *dsk_start;
-
- WT_ASSERT(session, r->bnd_next != 0);
-
- btree = S2BT(session);
- bm = btree->bm;
- bnd_cur = &r->bnd[r->bnd_next];
- bnd_prev = bnd_cur - 1;
- dsk = r->disk_image.mem;
- cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
-
- /*
- * Resize chunks if the current is smaller than the minimum, and there
- * are details on the minimum split size boundary available in the
- * previous boundary details.
- *
- * There is a possibility that we do not have a minimum boundary set, in
- * such a case we skip chunk resizing. Such a condition is possible for
- * instance when we are building the image in the buffer and the first
- * K/V pair is large enough that it surpasses both the minimum split
- * size and the split size the application has set. In such a case we
- * split the chunk without saving any minimum boundary.
- */
- if (resize_chunks &&
- cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) {
- bnd_cur->offset = bnd_prev->min_bnd_offset;
- bnd_cur->max_bnd_entries +=
- bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries;
- bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries;
- bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno;
-
- WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key,
- bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size));
-
- /* Update current chunk's length */
- cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
- }
-
- /*
- * Create an interim buffer if not already done to prepare the previous
- * chunk's disk image.
- */
- len = bnd_cur->offset;
- WT_RET(bm->write_size(bm, session, &len));
- if (r->interim_buf == NULL)
- WT_RET(__wt_scr_alloc(session, len, &r->interim_buf));
- else
- WT_RET(__wt_buf_init(session, r->interim_buf, len));
-
- dsk_tmp = r->interim_buf->mem;
- memcpy(dsk_tmp, dsk, bnd_cur->offset);
- dsk_tmp->recno = bnd_prev->max_bnd_recno;
- dsk_tmp->u.entries = bnd_prev->max_bnd_entries;
- dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset);
- r->interim_buf->size = dsk_tmp->mem_size;
- WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false));
-
- /* Shift the current chunk to the start of the buffer */
- dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
- (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len);
-
- /* Fix boundary offset */
- bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
- /* Fix where free points */
- r->first_free = dsk_start + cur_len;
return (0);
}
@@ -2512,9 +2382,6 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
btree = S2BT(session);
dsk = r->disk_image.mem;
- /* Fixed length col store can call with next_len 0 */
- WT_ASSERT(session, next_len == 0 || r->space_avail < next_len);
-
/*
* We should never split during salvage, and we're about to drop core
* because there's no parent page.
@@ -2524,58 +2391,147 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
"%s page too large, attempted split during salvage",
__wt_page_type_string(r->page->type));
- last = &r->bnd[r->bnd_next];
- inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) +
- WT_PAGE_HEADER_BYTE_SIZE(btree);
-
- /*
- * We can get here if the first key/value pair won't fit.
- * Additionally, grow the buffer to contain the current item if we
- * haven't already consumed a reasonable portion of a split chunk.
- */
- if (inuse < r->split_size / 2)
- goto done;
-
/* Hitting a page boundary resets the dictionary, in all cases. */
__rec_dictionary_reset(r);
- /* Set the number of entries for the just finished chunk. */
- last->max_bnd_entries = r->entries;
+ inuse = WT_PTRDIFF(r->first_free, dsk);
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY:
+ /*
+ * We can get here if the first key/value pair won't fit.
+ * Additionally, grow the buffer to contain the current item if
+ * we haven't already consumed a reasonable portion of a split
+ * chunk.
+ */
+ if (inuse < r->split_size / 2)
+ break;
- /*
- * In case of bulk load, write out chunks as we get them.
- * In other cases, we keep two chunks in memory at a given time. So, if
- * there is a previous chunk, write it out, making space in the buffer
- * for the next chunk to be written.
- */
- if (r->is_bulk_load) {
- dsk->recno = last->max_bnd_recno;
- dsk->u.entries = last->max_bnd_entries;
- dsk->mem_size = (uint32_t)inuse;
+ /*
+ * About to cross a split boundary but not yet forced to split
+ * into multiple pages. If we have to split, this is one of the
+ * split points, save information about where we are when the
+ * split would have happened.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /* Set the number of entries for the just finished chunk. */
+ last->entries = r->entries - r->total_entries;
+ r->total_entries = r->entries;
+
+ /* Set the key for the next chunk. */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /*
+ * Set the starting buffer offset and clear the entries (the
+ * latter not required, but cleaner).
+ */
+ next->offset = WT_PTRDIFF(r->first_free, dsk);
+ next->entries = 0;
+
+ /* Set the space available to another split-size chunk. */
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ /*
+ * Adjust the space available to handle two cases:
+ * - We don't have enough room for another full split-size
+ * chunk on the page.
+ * - We chose to fill past a page boundary because of a
+ * large item.
+ */
+ if (inuse + r->space_avail > r->page_size) {
+ r->space_avail =
+ r->page_size > inuse ? (r->page_size - inuse) : 0;
+
+ /* There are no further boundary points. */
+ r->bnd_state = SPLIT_MAX;
+ }
+
+ /*
+ * Return if the next object fits into this page, else we have
+ * to split the page.
+ */
+ if (r->space_avail >= next_len)
+ return (0);
+
+ /* FALLTHROUGH */
+ case SPLIT_MAX:
+ /*
+ * We're going to have to split and create multiple pages.
+ *
+ * Cycle through the saved split-point information, writing the
+ * split chunks we have tracked. The underlying fixup function
+ * sets the space available and other information, and copied
+ * any unwritten chunk of data to the beginning of the buffer.
+ */
+ WT_RET(__rec_split_fixup(session, r));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ break;
+ case SPLIT_TRACKING_OFF:
+ /*
+ * We can get here if the first key/value pair won't fit.
+ * Additionally, grow the buffer to contain the current item if
+ * we haven't already consumed a reasonable portion of a split
+ * chunk.
+ */
+ if (inuse < r->split_size / 2)
+ break;
+
+ /*
+ * The key/value pairs didn't fit into a single page, but either
+ * we've already noticed that and are now processing the rest of
+ * the pairs at split size boundaries, or the split size was the
+ * same as the page size, and we never bothered with split point
+ * information at all.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /*
+ * Set the key for the next chunk (before writing the block, a
+ * key range is needed in that code).
+ */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /* Clear the entries (not required, but cleaner). */
+ next->entries = 0;
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = last->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
r->disk_image.size = dsk->mem_size;
- WT_RET(__rec_split_write(
- session, r, last, &r->disk_image, false));
- /* Fix where free points */
+ WT_RET(
+ __rec_split_write(session, r, last, &r->disk_image, false));
+
+ /*
+ * Set the caller's entry count and buffer information for the
+ * next chunk. We only get here if we're not splitting or have
+ * already split, so it's split-size chunks from here on out.
+ */
+ r->entries = 0;
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
- } else if (r->bnd_next != 0)
- WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false));
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ break;
+ case SPLIT_TRACKING_RAW:
+ return (__wt_illegal_value(session, NULL));
+ }
- /* Prepare the next boundary */
- WT_RET(__rec_split_bnd_grow(session, r));
- r->bnd_next++;
- next = &r->bnd[r->bnd_next];
- next->offset = WT_PTRDIFF(r->first_free, dsk);
- /* Set the key for the next chunk. */
- next->max_bnd_recno = r->recno;
- if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF)
- WT_RET(__rec_split_row_promote(
- session, r, &next->max_bnd_key, dsk->type));
-
- r->entries = 0;
- /* Set the space available to another split-size chunk. */
- r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
-
-done: /*
+ /*
* Overflow values can be larger than the maximum page size but still be
* "on-page". If the next key/value pair is larger than space available
* after a split has happened (in other words, larger than the maximum
@@ -2593,66 +2549,6 @@ done: /*
}
/*
- * __rec_split_crossing_bnd --
- * Save the details for the minimum split size boundary or call for a
- * split.
- */
-static inline int
-__rec_split_crossing_bnd(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
-{
- WT_BOUNDARY *bnd;
- WT_BTREE *btree;
- WT_PAGE_HEADER *dsk;
- size_t min_bnd_offset;
-
- WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len));
-
- /*
- * If crossing the minimum split size boundary, store the boundary
- * details at the current location in the buffer. If we are crossing the
- * split boundary at the same time, possible when the next record is
- * large enough, just split at this point.
- */
- if (WT_CROSSING_MIN_BND(r, next_len) &&
- !WT_CROSSING_SPLIT_BND(r, next_len)) {
- btree = S2BT(session);
- bnd = &r->bnd[r->bnd_next];
- dsk = r->disk_image.mem;
- min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) -
- bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree);
- if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree))
- /*
- * This is possible if the first record doesn't fit in
- * the minimum split size, we write this record without
- * setting up any boundary here. We will get the
- * opportunity to setup a boundary before writing out
- * the next record.
- */
- return (0);
-
- WT_ASSERT(session, bnd->min_bnd_offset == 0);
-
- /*
- * Hitting a page boundary resets the dictionary, in all cases.
- */
- __rec_dictionary_reset(r);
-
- bnd->min_bnd_offset = min_bnd_offset;
- bnd->min_bnd_entries = r->entries;
- bnd->min_bnd_recno = r->recno;
- if (dsk->type == WT_PAGE_ROW_INT ||
- dsk->type == WT_PAGE_ROW_LEAF)
- WT_RET(__rec_split_row_promote(
- session, r, &bnd->min_bnd_key, dsk->type));
- return (0);
- }
-
- /* We are crossing a split boundary */
- return (__rec_split(session, r, next_len));
-}
-
-/*
* __rec_split_raw_worker --
* Handle the raw compression page reconciliation bookkeeping.
*/
@@ -2730,7 +2626,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
*/
recno = WT_RECNO_OOB;
if (dsk->type == WT_PAGE_COL_VAR)
- recno = last->max_bnd_recno;
+ recno = last->recno;
entry = max_image_slot = slots = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
@@ -2957,7 +2853,7 @@ no_slots:
*/
dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
dsk_dst = dst->mem;
- dsk_dst->recno = last->max_bnd_recno;
+ dsk_dst->recno = last->recno;
dsk_dst->mem_size =
r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
dsk_dst->u.entries = r->raw_entries[result_slots - 1];
@@ -2977,7 +2873,7 @@ no_slots:
WT_RET(__wt_strndup(session, dsk,
dsk_dst->mem_size, &last->disk_image));
disk_image = last->disk_image;
- disk_image->recno = last->max_bnd_recno;
+ disk_image->recno = last->recno;
disk_image->mem_size = dsk_dst->mem_size;
disk_image->u.entries = dsk_dst->u.entries;
}
@@ -3007,14 +2903,14 @@ no_slots:
*/
switch (dsk->type) {
case WT_PAGE_COL_INT:
- next->max_bnd_recno = r->raw_recnos[result_slots];
+ next->recno = r->raw_recnos[result_slots];
break;
case WT_PAGE_COL_VAR:
- next->max_bnd_recno = r->raw_recnos[result_slots - 1];
+ next->recno = r->raw_recnos[result_slots - 1];
break;
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- next->max_bnd_recno = WT_RECNO_OOB;
+ next->recno = WT_RECNO_OOB;
if (!last_block) {
/*
* Confirm there was uncompressed data remaining
@@ -3023,7 +2919,7 @@ no_slots:
*/
WT_ASSERT(session, len > 0);
WT_RET(__rec_split_row_promote_cell(
- session, dsk, &next->max_bnd_key));
+ session, dsk, &next->key));
}
break;
}
@@ -3035,7 +2931,7 @@ no_slots:
*/
WT_STAT_DATA_INCR(session, compress_raw_fail);
- dsk->recno = last->max_bnd_recno;
+ dsk->recno = last->recno;
dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
dsk->u.entries = r->entries;
r->disk_image.size = dsk->mem_size;
@@ -3112,9 +3008,35 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
static int
__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
- WT_BOUNDARY *bnd_cur, *bnd_prev;
+ WT_BOUNDARY *bnd;
WT_PAGE_HEADER *dsk;
- bool grow_bnd;
+
+ /* Adjust the boundary information based on our split status. */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY:
+ case SPLIT_MAX:
+ /*
+ * We never split, the reconciled page fit into a maximum page
+ * size. Change the first boundary slot to represent the full
+ * page (the first boundary slot is largely correct, just update
+ * the number of entries).
+ */
+ r->bnd_next = 0;
+ break;
+ case SPLIT_TRACKING_OFF:
+ /*
+ * If we have already split, or aren't tracking boundaries, put
+ * the remaining data in the next boundary slot.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ break;
+ case SPLIT_TRACKING_RAW:
+ /*
+ * We were configured for raw compression, and either we never
+ * wrote anything, or there's a remaindered block of data.
+ */
+ break;
+ }
/*
* We may arrive here with no entries to write if the page was entirely
@@ -3141,66 +3063,20 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
return (EBUSY);
}
- dsk = r->disk_image.mem;
+ /* Set the boundary reference and increment the count. */
+ bnd = &r->bnd[r->bnd_next++];
+ bnd->entries = r->entries;
- /* Set the number of entries for the just finished chunk. */
- bnd_cur = &r->bnd[r->bnd_next];
- bnd_cur->max_bnd_entries = r->entries;
-
- grow_bnd = true;
- /*
- * We can reach here even with raw_compression when the last split chunk
- * is too small to be sent for raw compression.
- */
- if (!r->is_bulk_load && !r->raw_compression) {
- if (WT_PTRDIFF(r->first_free, dsk) > r->page_size &&
- r->bnd_next != 0) {
- /*
- * We hold two boundaries worth of data in the buffer,
- * and this data doesn't fit in a single page. If the
- * last chunk is too small, readjust the boundary to a
- * pre-computed minimum.
- * Write out the penultimate chunk to the disk as a page
- */
- WT_RET(__rec_split_write_prev_and_shift_cur(
- session, r, true));
- } else
- if (r->bnd_next != 0) {
- /*
- * We have two boundaries, but the data in the
- * buffer can fit a single page. Merge the
- * boundaries to create a single chunk.
- */
- bnd_prev = bnd_cur - 1;
- bnd_prev->max_bnd_entries +=
- bnd_cur->max_bnd_entries;
- r->bnd_next--;
- grow_bnd = false;
- }
- }
-
- /*
- * We already have space for an extra boundary if we merged two
- * boundaries above, in that case we do not need to grow the boundary
- * structure.
- */
- if (grow_bnd)
- WT_RET(__rec_split_bnd_grow(session, r));
- bnd_cur = &r->bnd[r->bnd_next];
- r->bnd_next++;
-
- /*
- * Current boundary now has all the remaining data/last page now.
- * Let's write it to the disk
- */
- dsk->recno = bnd_cur->max_bnd_recno;
- dsk->u.entries = bnd_cur->max_bnd_entries;
+ /* Finalize the header information. */
+ dsk = r->disk_image.mem;
+ dsk->recno = bnd->recno;
+ dsk->u.entries = r->entries;
dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
r->disk_image.size = dsk->mem_size;
/* If this is a checkpoint, we're done, otherwise write the page. */
- return (__rec_is_checkpoint(session, r, bnd_cur) ?
- 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true));
+ return (__rec_is_checkpoint(session, r, bnd) ?
+ 0 : __rec_split_write(session, r, bnd, &r->disk_image, true));
}
/*
@@ -3234,6 +3110,98 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
+ * __rec_split_fixup --
+ * Fix up after crossing the maximum page boundary.
+ */
+static int
+__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ size_t i, len;
+ uint8_t *dsk_start, *p;
+
+ /*
+ * When we overflow physical limits of the page, we walk the list of
+ * split chunks we've created and write those pages out, then update
+ * the caller's information.
+ */
+ btree = S2BT(session);
+
+ /*
+ * The data isn't laid out on a page boundary or nul padded; copy it to
+ * a clean, aligned, padded buffer before writing it.
+ *
+ * Allocate a scratch buffer to hold the new disk image. Copy the disk
+ * page's header and block-manager space into the scratch buffer, most
+ * of the header information remains unchanged between the pages.
+ */
+ WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp));
+ dsk = tmp->mem;
+ memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree));
+
+ /*
+ * For each split chunk we've created, update the disk image and copy
+ * it into place.
+ */
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
+ /* Copy the page contents to the temporary buffer. */
+ len = (bnd + 1)->offset - bnd->offset;
+ memcpy(dsk_start,
+ (uint8_t *)r->disk_image.mem + bnd->offset, len);
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = bnd->recno;
+ dsk->u.entries = bnd->entries;
+ tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
+ dsk->mem_size = WT_STORE_SIZE(tmp->size);
+ WT_ERR(__rec_split_write(session, r, bnd, tmp, false));
+ }
+
+ /*
+ * There is probably a remnant in the working buffer that didn't get
+ * written, copy it down to the beginning of the working buffer.
+ *
+ * Confirm the remnant is no larger than a split-sized chunk, including
+ * header. We know that's the maximum sized remnant because we only have
+ * remnants if split switches from accumulating to a split boundary to
+ * accumulating to the end of the page (the other path here is when we
+ * hit a split boundary, there was room for another split chunk in the
+ * page, and the next item still wouldn't fit, in which case there is no
+ * remnant). So: we were accumulating to the end of the page and created
+ * a remnant. We know the remnant cannot be as large as a split-sized
+ * chunk, including header, because if there was room for that large a
+ * remnant, we wouldn't have switched from accumulating to a page end.
+ */
+ p = (uint8_t *)r->disk_image.mem + bnd->offset;
+ len = WT_PTRDIFF(r->first_free, p);
+ if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
+ WT_PANIC_ERR(session, EINVAL,
+ "Reconciliation remnant too large for the split buffer");
+ dsk = r->disk_image.mem;
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, p, len);
+
+ /*
+ * Fix up our caller's information, including updating the starting
+ * record number.
+ */
+ r->entries -= r->total_entries;
+ r->first_free = dsk_start + len;
+ WT_ASSERT(session,
+ r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len));
+ r->space_avail =
+ r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+
+err: __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
* __rec_split_write --
* Write a disk block out for the split helper functions.
*/
@@ -3270,6 +3238,8 @@ __rec_split_write(WT_SESSION_IMPL *session,
F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
}
+ bnd->entries = r->entries;
+
/* Initialize the address (set the page type for the parent). */
switch (dsk->type) {
case WT_PAGE_COL_FIX:
@@ -3315,8 +3285,7 @@ __rec_split_write(WT_SESSION_IMPL *session,
switch (page->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
- if (WT_INSERT_RECNO(supd->ins) >=
- (bnd + 1)->max_bnd_recno)
+ if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno)
goto supd_check_complete;
break;
case WT_PAGE_ROW_LEAF:
@@ -3327,8 +3296,8 @@ __rec_split_write(WT_SESSION_IMPL *session,
key->data = WT_INSERT_KEY(supd->ins);
key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
- WT_ERR(__wt_compare(session, btree->collator,
- key, &(bnd + 1)->max_bnd_key, &cmp));
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &(bnd + 1)->key, &cmp));
if (cmp >= 0)
goto supd_check_complete;
break;
@@ -3418,14 +3387,14 @@ supd_check_complete:
#ifdef HAVE_VERBOSE
/* Output a verbose message if we create a page without many entries */
- if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) &&
- bnd->max_bnd_entries < 6)
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6)
__wt_verbose(session, WT_VERB_SPLIT,
"Reconciliation creating a page with %" PRIu32
" entries, memory footprint %" WT_SIZET_FMT
- ", page count %" PRIu32 ", %s", bnd->max_bnd_entries,
- r->page->memory_footprint, r->bnd_next,
- F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint");
+ ", page count %" PRIu32 ", %s, split state: %d",
+ r->entries, r->page->memory_footprint, r->bnd_next,
+ F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint",
+ r->bnd_state);
#endif
WT_ERR(__wt_bt_write(session, buf, addr, &addr_size,
@@ -3711,12 +3680,11 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
cursor->value.data, cursor->value.size, (uint64_t)0));
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
- WT_RET(__rec_split_raw(
- session, r, key->len + val->len));
- } else
- if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
+ WT_RET(
+ __rec_split_raw(session, r, key->len + val->len));
+ else {
/*
* Turn off prefix compression until a full key written
* to the new page, and (unless already working with an
@@ -3728,9 +3696,10 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_cell_build_leaf_key(
session, r, NULL, 0, &ovfl_key));
}
- WT_RET(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
+
+ WT_RET(__rec_split(session, r, key->len + val->len));
}
+ }
/* Copy the key/value pair onto the page. */
__rec_copy_incr(session, r, key);
@@ -3771,10 +3740,6 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
* split.
*
* Boundary: split or write the page.
- *
- * No need to have a minimum split size boundary, all
- * pages are filled 100% except the last, allowing it to
- * grow in the future.
*/
__rec_incr(session, r, cbulk->entry,
__bitstr_size(
@@ -3879,12 +3844,10 @@ __wt_bulk_insert_var(
r, cbulk->last.data, cbulk->last.size, cbulk->rle));
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
- WT_RET(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CROSSING_SPLIT_BND(r, val->len))
- WT_RET(__rec_split_crossing_bnd(session, r, val->len));
+ if (val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
if (btree->dictionary)
@@ -4020,13 +3983,10 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
WT_CHILD_RELEASE_ERR(session, hazard, ref);
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
- WT_ERR(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, val->len))
- WT_ERR(__rec_split_crossing_bnd(
- session, r, val->len));
+ if (val->len > r->space_avail)
+ WT_ERR(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
__rec_copy_incr(session, r, val);
@@ -4068,13 +4028,10 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr->addr, addr->size, __rec_vtype(addr), r->recno);
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
- WT_RET(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, val->len))
- WT_RET(__rec_split_crossing_bnd(
- session, r, val->len));
+ if (val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
__rec_copy_incr(session, r, val);
@@ -4182,10 +4139,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
* split.
*
* Boundary: split or write the page.
- *
- * No need to have a minimum split size boundary, all
- * pages are filled 100% except the last, allowing it to
- * grow in the future.
*/
__rec_incr(session, r, entry,
__bitstr_size((size_t)entry * btree->bitcnt));
@@ -4342,13 +4295,10 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
session, r, value->data, value->size, rle));
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
- WT_RET(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, val->len))
- WT_RET(__rec_split_crossing_bnd(
- session, r, val->len));
+ if (val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
if (!deleted && !overflow_type && btree->dictionary)
@@ -5011,12 +4961,11 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->cell_zero = false;
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
WT_ERR(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
+ else {
/*
* In one path above, we copied address blocks
* from the page rather than building the actual
@@ -5028,10 +4977,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_IKEY_DATA(ikey), ikey->size));
key_onpage_ovfl = false;
}
-
- WT_ERR(__rec_split_crossing_bnd(
+ WT_ERR(__rec_split(
session, r, key->len + val->len));
}
+ }
/* Copy the key and value onto the page. */
__rec_copy_incr(session, r, key);
@@ -5081,14 +5030,10 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
- WT_RET(__rec_split_raw(
- session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len))
- WT_RET(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
+ if (key->len + val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, key->len + val->len) :
+ __rec_split(session, r, key->len + val->len));
/* Copy the key and value onto the page. */
__rec_copy_incr(session, r, key);
@@ -5417,17 +5362,16 @@ build:
}
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
WT_ERR(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
+ else {
/*
- * If we copied address blocks from the page
- * rather than building the actual key, we have
- * to build the key now because we are about to
- * promote it.
+ * In one path above, we copied address blocks
+ * from the page rather than building the actual
+ * key. In that case, we have to build the key
+ * now because we are about to promote it.
*/
if (key_onpage_ovfl) {
WT_ERR(__wt_dsk_cell_data_ref(session,
@@ -5446,13 +5390,14 @@ build:
if (!ovfl_key)
WT_ERR(
__rec_cell_build_leaf_key(
- session, r, NULL, 0,
- &ovfl_key));
+ session,
+ r, NULL, 0, &ovfl_key));
}
- WT_ERR(__rec_split_crossing_bnd(
+ WT_ERR(__rec_split(
session, r, key->len + val->len));
}
+ }
/* Copy the key/value pair onto the page. */
__rec_copy_incr(session, r, key);
@@ -5515,12 +5460,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
WT_RET(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
+ else {
/*
* Turn off prefix compression until a full key
* written to the new page, and (unless already
@@ -5532,13 +5476,14 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
if (!ovfl_key)
WT_RET(
__rec_cell_build_leaf_key(
- session, r, NULL, 0,
- &ovfl_key));
+ session,
+ r, NULL, 0, &ovfl_key));
}
- WT_RET(__rec_split_crossing_bnd(
+ WT_RET(__rec_split(
session, r, key->len + val->len));
}
+ }
/* Copy the key/value pair onto the page. */
__rec_copy_incr(session, r, key);
@@ -5650,14 +5595,13 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r)
__wt_verbose(session, WT_VERB_SPLIT,
"starting key %s",
__wt_buf_set_printable(
- session, bnd->max_bnd_key.data,
- bnd->max_bnd_key.size, tkey));
+ session, bnd->key.data, bnd->key.size, tkey));
break;
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
__wt_verbose(session, WT_VERB_SPLIT,
- "starting recno %" PRIu64, bnd->max_bnd_recno);
+ "starting recno %" PRIu64, bnd->recno);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -5919,10 +5863,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* We never set the first page's key, grab it from the original page. */
ref = r->ref;
if (__wt_ref_is_root(ref))
- WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1));
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
else {
__wt_ref_key(ref->home, ref, &p, &size);
- WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size));
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size));
}
/* Allocate, then initialize the array of replacement blocks. */
@@ -5930,8 +5874,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
for (multi = mod->mod_multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
- WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data,
- bnd->max_bnd_key.size, &multi->key.ikey));
+ WT_RET(__wt_row_ikey_alloc(session, 0,
+ bnd->key.data, bnd->key.size, &multi->key.ikey));
/*
* Copy any disk image. Don't take saved updates without a
@@ -5978,7 +5922,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
for (multi = mod->mod_multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
- multi->key.recno = bnd->max_bnd_recno;
+ multi->key.recno = bnd->recno;
/*
* Copy any disk image. Don't take saved updates without a
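
The rec_write.c changes above restore a simple state machine for tracking split boundaries (SPLIT_BOUNDARY, SPLIT_MAX, SPLIT_TRACKING_OFF). A condensed sketch of the transitions, with invented names and the details of __rec_split elided, might look like the following; it illustrates the control flow only, not the actual implementation:

#include <stddef.h>
#include <stdio.h>

/* Invented names; condenses the boundary-state transitions restored above. */
enum split_state { BOUNDARY, MAX_PAGE, TRACKING_OFF };

static enum split_state
next_state(enum split_state state, size_t inuse, size_t page_size,
    size_t space_avail)
{
	switch (state) {
	case BOUNDARY:
		/* Still saving boundaries: stop once no further full
		 * split-size chunk fits in the maximum page. */
		return (inuse + space_avail > page_size ? MAX_PAGE : BOUNDARY);
	case MAX_PAGE:
		/* Forced to split: saved chunks are written out and
		 * boundary tracking is turned off. */
		return (TRACKING_OFF);
	case TRACKING_OFF:
	default:
		/* From here on, split-size chunks are written as they fill. */
		return (TRACKING_OFF);
	}
}

int
main(void)
{
	enum split_state s = BOUNDARY;

	/* 24KB already used plus another 24KB split chunk exceeds a 32KB page. */
	s = next_state(s, 24 * 1024, 32 * 1024, 24 * 1024);
	s = next_state(s, 0, 0, 0);
	printf("%d\n", s);	/* Prints 2 (TRACKING_OFF). */
	return (0);
}
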
diff --git a/test/format/config.h b/test/format/config.h
index b5feb7a5321..e3e1e73a786 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -284,7 +284,7 @@ static CONFIG c[] = {
{ "split_pct",
"page split size as a percentage of the maximum page size",
- 0x0, 50, 100, 100, &g.c_split_pct, NULL },
+ 0x0, 40, 85, 85, &g.c_split_pct, NULL },
{ "statistics",
"maintain statistics", /* 20% */