diff options
Diffstat (limited to 'src/reconcile/rec_write.c')
-rw-r--r-- | src/reconcile/rec_write.c | 954 |
1 files changed, 505 insertions, 449 deletions
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 23f654caa70..e18d44f96ff 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -26,6 +26,11 @@ typedef struct { uint32_t flags; /* Caller's configuration */ WT_ITEM disk_image; /* Temporary disk-image buffer */ + /* + * Temporary buffer used to write out a disk image when managing two + * chunks worth of data in memory + */ + WT_ITEM *interim_buf; /* * Track start/stop write generation to decide if all changes to the @@ -127,6 +132,7 @@ typedef struct { * repeatedly split a packed page. */ uint32_t split_size; /* Split page size */ + uint32_t min_split_size; /* Minimum split page size */ /* * The problem with splits is we've done a lot of work by the time we @@ -151,16 +157,6 @@ typedef struct { */ size_t offset; /* Split's first byte */ - /* - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. These fields are used both - * to write the split chunk, and to create a new internal page - * to reference the split pages. - */ - uint64_t recno; /* Split's starting record */ - uint32_t entries; /* Split's entries */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -182,11 +178,36 @@ typedef struct { size_t supd_allocated; /* + * While reconciling pages, at any given time, we maintain two + * split chunks in the memory to be written out as pages. As we + * get to the last two chunks, if the last one turns out to be + * smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size + * boundary. This moves some data from the penultimate chunk to + * the last chunk, hence increasing the size of the last page + * written without decreasing the penultimate page size beyond + * the minimum split size. For this reason, we maintain both a + * maximum split percentage boundary and a minimum split + * percentage boundary. + * + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both to + * write the split chunk, and to create a new internal page to + * reference the split pages. + * * The key for a row-store page; no column-store key is needed * because the page's recno, stored in the recno field, is the * column-store key. */ - WT_ITEM key; /* Promoted row-store key */ + uint32_t max_bnd_entries; + uint64_t max_bnd_recno; + WT_ITEM max_bnd_key; + + size_t min_bnd_offset; + uint32_t min_bnd_entries; + uint64_t min_bnd_recno; + WT_ITEM min_bnd_key; } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_next_max; /* Maximum boundary slots used */ @@ -194,28 +215,6 @@ typedef struct { size_t bnd_allocated; /* Bytes allocated */ /* - * We track the total number of page entries copied into split chunks - * so we can easily figure out how many entries in the current split - * chunk. - */ - uint32_t total_entries; /* Total entries in splits */ - - /* - * And there's state information as to where in this process we are: - * (1) tracking split boundaries because we can still fit more split - * chunks into the maximum page size, (2) tracking the maximum page - * size boundary because we can't fit any more split chunks into the - * maximum page size, (3) not performing boundary checks because it's - * either not useful with the current page size configuration, or - * because we've already been forced to split. - */ - enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ - SPLIT_MAX=1, /* Next: the maximum page boundary */ - SPLIT_TRACKING_OFF=2, /* No boundary checks */ - SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ - bnd_state; - - /* * We track current information about the current record number, the * number of entries copied into the temporary buffer, where we are * in the temporary buffer, and how much memory remains. Those items @@ -293,6 +292,14 @@ typedef struct { uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; +#define WT_CROSSING_MIN_BND(r, next_len) \ + ((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 && \ + ((r)->space_avail - (next_len)) < \ + ((r)->split_size - (r)->min_split_size)) +#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) +#define WT_CHECK_CROSSING_BND(r, next_len) \ + (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) + static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool); static void __rec_cell_build_addr(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, u_int, uint64_t); @@ -314,6 +321,7 @@ static int __rec_col_var(WT_SESSION_IMPL *, static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t); static int __rec_destroy_session(WT_SESSION_IMPL *); +static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_row_leaf(WT_SESSION_IMPL *, @@ -323,7 +331,6 @@ static int __rec_row_leaf_insert( static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); -static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); @@ -968,6 +975,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) *(WT_RECONCILE **)reconcilep = NULL; __wt_buf_free(session, &r->disk_image); + __wt_scr_free(session, &r->interim_buf); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1032,7 +1040,8 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); - __wt_buf_free(session, &bnd->key); + __wt_buf_free(session, &bnd->max_bnd_key); + __wt_buf_free(session, &bnd->min_bnd_key); } __wt_free(session, r->bnd); r->bnd_next = 0; @@ -1927,8 +1936,8 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = WT_RECNO_OOB; - bnd->entries = 0; + bnd->max_bnd_recno = WT_RECNO_OOB; + bnd->max_bnd_entries = 0; __wt_free(session, bnd->addr.addr); WT_CLEAR(bnd->addr); @@ -1943,6 +1952,10 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->already_compressed = false; + bnd->min_bnd_offset = 0; + bnd->min_bnd_entries = 0; + bnd->min_bnd_recno = WT_RECNO_OOB; + /* * Don't touch the key, we re-use that memory in each new * reconciliation. @@ -1974,40 +1987,64 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __wt_split_page_size -- - * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. + * __rec_split_page_size_from_pct -- + * Given a split percentage, calculate split page size in bytes. */ -uint32_t -__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ +static uint32_t +__rec_split_page_size_from_pct( + int split_pct, uint32_t maxpagesize, uint32_t allocsize) { uintmax_t a; uint32_t split_size; /* * Ideally, the split page size is some percentage of the maximum page - * size rounded to an allocation unit (round to an allocation unit so - * we don't waste space when we write). + * size rounded to an allocation unit (round to an allocation unit so we + * don't waste space when we write). */ a = maxpagesize; /* Don't overflow. */ split_size = (uint32_t)WT_ALIGN_NEAREST( - (a * (u_int)btree->split_pct) / 100, btree->allocsize); + (a * (u_int)split_pct) / 100, allocsize); /* - * Respect the configured split percentage if the calculated split - * size is either zero or a full page. The user has either configured - * an allocation size that matches the page size, or a split - * percentage that is close to zero or one hundred. Rounding is going - * to provide a worse outcome than having a split point that doesn't - * fall on an allocation size boundary in those cases. + * Respect the configured split percentage if the calculated split size + * is either zero or a full page. The user has either configured an + * allocation size that matches the page size, or a split percentage + * that is close to zero or one hundred. Rounding is going to provide a + * worse outcome than having a split point that doesn't fall on an + * allocation size boundary in those cases. */ if (split_size == 0 || split_size == maxpagesize) - split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); + split_size = (uint32_t)((a * (u_int)split_pct) / 100); return (split_size); } /* + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. + */ +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + btree->split_pct, maxpagesize, btree->allocsize)); +} + +/* + * __rec_min_split_page_size -- + * Minimum split size boundary calculation: To track a boundary at the + * minimum split size that we could have split at instead of splitting at + * the split page size. + */ +static uint32_t +__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize)); +} + +/* * __rec_split_init -- * Initialization for the reconciliation split functions. */ @@ -2018,7 +2055,7 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_BM *bm; WT_BTREE *btree; WT_PAGE_HEADER *dsk; - size_t corrected_page_size; + size_t corrected_page_size, disk_img_buf_size; btree = S2BT(session); bm = btree->bm; @@ -2053,33 +2090,6 @@ __rec_split_init(WT_SESSION_IMPL *session, r->max_raw_page_size = r->page_size = (uint32_t)WT_MIN(r->page_size * 10, WT_MAX(r->page_size, btree->maxmempage / 2)); - - /* - * Ensure the disk image buffer is large enough for the max object, as - * corrected by the underlying block manager. - */ - corrected_page_size = r->page_size; - WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); - - /* - * Clear the disk page header to ensure all of it is initialized, even - * the unused fields. - * - * In the case of fixed-length column-store, clear the entire buffer: - * fixed-length column-store sets bits in bytes, where the bytes are - * assumed to initially be 0. - */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? - corrected_page_size : WT_PAGE_HEADER_SIZE); - - /* - * Set the page type (the type doesn't change, and setting it later - * would require additional code in a few different places). - */ - dsk = r->disk_image.mem; - dsk->type = page->type; - /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2099,22 +2109,28 @@ __rec_split_init(WT_SESSION_IMPL *session, * creating overflow items and compacted data, for example, as those * items have already been written to disk). So, the loop calls the * helper functions when approaching a split boundary, and we save the - * information at that point. That allows us to go back and split the - * page at the boundary points if we eventually overflow the maximum - * page size. + * information at that point. We also save the boundary information at + * the minimum split size. We maintain two chunks (each boundary + * represents a chunk that gets written as a page) in the memory, + * writing out the older one to the disk as a page when we need to make + * space for a new chunk. On reaching the last chunk, if it turns out to + * be smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size boundary. This + * moves some data from the penultimate chunk to the last chunk, hence + * increasing the size of the last page written without decreasing the + * penultimate page size beyond the minimum split size. * * Finally, all this doesn't matter for fixed-size column-store pages, * raw compression, and salvage. Fixed-size column store pages can * split under (very) rare circumstances, but they're allocated at a * fixed page size, never anything smaller. In raw compression, the - * underlying compression routine decides when we split, so it's not - * our problem. In salvage, as noted above, we can't split at all. + * underlying compression routine decides when we split, so it's not our + * problem. In salvage, as noted above, we can't split at all. */ if (r->raw_compression || r->salvage != NULL) { r->split_size = 0; r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - } - else if (page->type == WT_PAGE_COL_FIX) { + } else if (page->type == WT_PAGE_COL_FIX) { r->split_size = r->page_size; r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); @@ -2122,32 +2138,53 @@ __rec_split_init(WT_SESSION_IMPL *session, r->split_size = __wt_split_page_size(btree, r->page_size); r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + r->min_split_size = + __rec_min_split_page_size(btree, r->page_size); } + + /* + * Ensure the disk image buffer is large enough for the max object, as + * corrected by the underlying block manager. + * + * The buffer that we build disk image in, needs to hold two chunks + * worth of data. Since we want to support split_size more than the page + * size (to allow for adjustments based on the compression), this buffer + * should be greater of twice of split_size and page_size. + */ + corrected_page_size = r->page_size; + disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + + /* + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + disk_img_buf_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). + */ + dsk = r->disk_image.mem; + dsk->type = page->type; + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Initialize the first boundary. */ r->bnd_next = 0; WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); - r->bnd[0].recno = recno; + r->bnd[0].max_bnd_recno = recno; r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * If the maximum page size is the same as the split page size, either - * because of the object type or application configuration, there isn't - * any need to maintain split boundaries within a larger page. - * - * No configuration for salvage here, because salvage can't split. - */ - if (r->raw_compression) - r->bnd_state = SPLIT_TRACKING_RAW; - else if (max == r->split_size) - r->bnd_state = SPLIT_TRACKING_OFF; - else - r->bnd_state = SPLIT_BOUNDARY; - - /* Initialize the entry counters. */ - r->entries = r->total_entries = 0; + /* Initialize the entry counter. */ + r->entries = 0; /* Initialize the starting record number. */ r->recno = recno; @@ -2350,19 +2387,112 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, len; + size_t corrected_page_size, inuse, len; btree = S2BT(session); bm = btree->bm; len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - corrected_page_size = len + add_len; + inuse = (len - r->bnd[r->bnd_next].offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); + corrected_page_size = inuse + add_len; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size)); + /* Need to account for buffer carrying two chunks worth of data */ + WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + r->first_free = (uint8_t *)r->disk_image.mem + len; - WT_ASSERT(session, corrected_page_size >= len); - r->space_avail = corrected_page_size - len; + WT_ASSERT(session, corrected_page_size >= inuse); + r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); + + return (0); +} + +/* + * __rec_split_write_prev_and_shift_cur -- + * Write the previous split chunk to the disk as a page. Shift the contents + * of the current chunk to the start of the buffer, making space for a new + * chunk to be written. + * If the caller asks for a chunk resizing, the boundary between the two + * chunks is readjusted to the minimum split size boundary details stored + * in the previous chunk, letting the current chunk grow at the cost of the + * previous chunk. + */ +static int +__rec_split_write_prev_and_shift_cur( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +{ + WT_BM *bm; + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk, *dsk_tmp; + size_t cur_len, len; + uint8_t *dsk_start; + + WT_ASSERT(session, r->bnd_next != 0); + + btree = S2BT(session); + bm = btree->bm; + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + dsk = r->disk_image.mem; + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + + /* + * Resize chunks if the current is smaller than the minimum, and there + * are details on the minimum split size boundary available in the + * previous boundary details. + * + * There is a possibility that we do not have a minimum boundary set, in + * such a case we skip chunk resizing. Such a condition is possible for + * instance when we are building the image in the buffer and the first + * K/V pair is large enough that it surpasses both the minimum split + * size and the split size the application has set. In such a case we + * split the chunk without saving any minimum boundary. + */ + if (resize_chunks && + cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { + bnd_cur->offset = bnd_prev->min_bnd_offset; + bnd_cur->max_bnd_entries += + bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + + WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, + bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); + + /* Update current chunk's length */ + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + } + + /* + * Create an interim buffer if not already done to prepare the previous + * chunk's disk image. + */ + len = bnd_cur->offset; + WT_RET(bm->write_size(bm, session, &len)); + if (r->interim_buf == NULL) + WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); + else + WT_RET(__wt_buf_init(session, r->interim_buf, len)); + + dsk_tmp = r->interim_buf->mem; + memcpy(dsk_tmp, dsk, bnd_cur->offset); + dsk_tmp->recno = bnd_prev->max_bnd_recno; + dsk_tmp->u.entries = bnd_prev->max_bnd_entries; + dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); + r->interim_buf->size = dsk_tmp->mem_size; + WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); + + /* Shift the current chunk to the start of the buffer */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); + + /* Fix boundary offset */ + bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); + /* Fix where free points */ + r->first_free = dsk_start + cur_len; return (0); } @@ -2382,6 +2512,9 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) btree = S2BT(session); dsk = r->disk_image.mem; + /* Fixed length col store can call with next_len 0 */ + WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); + /* * We should never split during salvage, and we're about to drop core * because there's no parent page. @@ -2391,147 +2524,58 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - /* Hitting a page boundary resets the dictionary, in all cases. */ - __rec_dictionary_reset(r); - - inuse = WT_PTRDIFF(r->first_free, dsk); - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; - - /* - * About to cross a split boundary but not yet forced to split - * into multiple pages. If we have to split, this is one of the - * split points, save information about where we are when the - * split would have happened. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; - - /* Set the number of entries for the just finished chunk. */ - last->entries = r->entries - r->total_entries; - r->total_entries = r->entries; - - /* Set the key for the next chunk. */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); - - /* - * Set the starting buffer offset and clear the entries (the - * latter not required, but cleaner). - */ - next->offset = WT_PTRDIFF(r->first_free, dsk); - next->entries = 0; - - /* Set the space available to another split-size chunk. */ - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - - /* - * Adjust the space available to handle two cases: - * - We don't have enough room for another full split-size - * chunk on the page. - * - We chose to fill past a page boundary because of a - * large item. - */ - if (inuse + r->space_avail > r->page_size) { - r->space_avail = - r->page_size > inuse ? (r->page_size - inuse) : 0; - - /* There are no further boundary points. */ - r->bnd_state = SPLIT_MAX; - } - - /* - * Return if the next object fits into this page, else we have - * to split the page. - */ - if (r->space_avail >= next_len) - return (0); - - /* FALLTHROUGH */ - case SPLIT_MAX: - /* - * We're going to have to split and create multiple pages. - * - * Cycle through the saved split-point information, writing the - * split chunks we have tracked. The underlying fixup function - * sets the space available and other information, and copied - * any unwritten chunk of data to the beginning of the buffer. - */ - WT_RET(__rec_split_fixup(session, r)); - - /* We're done saving split chunks. */ - r->bnd_state = SPLIT_TRACKING_OFF; - break; - case SPLIT_TRACKING_OFF: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; + last = &r->bnd[r->bnd_next]; + inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * The key/value pairs didn't fit into a single page, but either - * we've already noticed that and are now processing the rest of - * the pairs at split size boundaries, or the split size was the - * same as the page size, and we never bothered with split point - * information at all. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current item if we + * haven't already consumed a reasonable portion of a split chunk. + */ + if (inuse < r->split_size / 2) + goto done; - /* - * Set the key for the next chunk (before writing the block, a - * key range is needed in that code). - */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); + /* Hitting a page boundary resets the dictionary, in all cases. */ + __rec_dictionary_reset(r); - /* Clear the entries (not required, but cleaner). */ - next->entries = 0; + /* Set the number of entries for the just finished chunk. */ + last->max_bnd_entries = r->entries; - /* Finalize the header information and write the page. */ - dsk->recno = last->recno; - dsk->u.entries = r->entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); + /* + * In case of bulk load, write out chunks as we get them. + * In other cases, we keep two chunks in memory at a given time. So, if + * there is a previous chunk, write it out, making space in the buffer + * for the next chunk to be written. + */ + if (r->is_bulk_load) { + dsk->recno = last->max_bnd_recno; + dsk->u.entries = last->max_bnd_entries; + dsk->mem_size = (uint32_t)inuse; r->disk_image.size = dsk->mem_size; - WT_RET( - __rec_split_write(session, r, last, &r->disk_image, false)); - - /* - * Set the caller's entry count and buffer information for the - * next chunk. We only get here if we're not splitting or have - * already split, so it's split-size chunks from here on out. - */ - r->entries = 0; + WT_RET(__rec_split_write( + session, r, last, &r->disk_image, false)); + /* Fix where free points */ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - break; - case SPLIT_TRACKING_RAW: - return (__wt_illegal_value(session, NULL)); - } + } else if (r->bnd_next != 0) + WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); - /* + /* Prepare the next boundary */ + WT_RET(__rec_split_bnd_grow(session, r)); + r->bnd_next++; + next = &r->bnd[r->bnd_next]; + next->offset = WT_PTRDIFF(r->first_free, dsk); + /* Set the key for the next chunk. */ + next->max_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->max_bnd_key, dsk->type)); + + r->entries = 0; + /* Set the space available to another split-size chunk. */ + r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + +done: /* * Overflow values can be larger than the maximum page size but still be * "on-page". If the next key/value pair is larger than space available * after a split has happened (in other words, larger than the maximum @@ -2549,6 +2593,66 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) } /* + * __rec_split_crossing_bnd -- + * Save the details for the minimum split size boundary or call for a + * split. + */ +static inline int +__rec_split_crossing_bnd( + WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t min_bnd_offset; + + WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len)); + + /* + * If crossing the minimum split size boundary, store the boundary + * details at the current location in the buffer. If we are crossing the + * split boundary at the same time, possible when the next record is + * large enough, just split at this point. + */ + if (WT_CROSSING_MIN_BND(r, next_len) && + !WT_CROSSING_SPLIT_BND(r, next_len)) { + btree = S2BT(session); + bnd = &r->bnd[r->bnd_next]; + dsk = r->disk_image.mem; + min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - + bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) + /* + * This is possible if the first record doesn't fit in + * the minimum split size, we write this record without + * setting up any boundary here. We will get the + * opportunity to setup a boundary before writing out + * the next record. + */ + return (0); + + WT_ASSERT(session, bnd->min_bnd_offset == 0); + + /* + * Hitting a page boundary resets the dictionary, in all cases. + */ + __rec_dictionary_reset(r); + + bnd->min_bnd_offset = min_bnd_offset; + bnd->min_bnd_entries = r->entries; + bnd->min_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &bnd->min_bnd_key, dsk->type)); + return (0); + } + + /* We are crossing a split boundary */ + return (__rec_split(session, r, next_len)); +} + +/* * __rec_split_raw_worker -- * Handle the raw compression page reconciliation bookkeeping. */ @@ -2626,7 +2730,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, */ recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) - recno = last->recno; + recno = last->max_bnd_recno; entry = max_image_slot = slots = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { @@ -2853,7 +2957,7 @@ no_slots: */ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; dsk_dst = dst->mem; - dsk_dst->recno = last->recno; + dsk_dst->recno = last->max_bnd_recno; dsk_dst->mem_size = r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP; dsk_dst->u.entries = r->raw_entries[result_slots - 1]; @@ -2873,7 +2977,7 @@ no_slots: WT_RET(__wt_strndup(session, dsk, dsk_dst->mem_size, &last->disk_image)); disk_image = last->disk_image; - disk_image->recno = last->recno; + disk_image->recno = last->max_bnd_recno; disk_image->mem_size = dsk_dst->mem_size; disk_image->u.entries = dsk_dst->u.entries; } @@ -2903,14 +3007,14 @@ no_slots: */ switch (dsk->type) { case WT_PAGE_COL_INT: - next->recno = r->raw_recnos[result_slots]; + next->max_bnd_recno = r->raw_recnos[result_slots]; break; case WT_PAGE_COL_VAR: - next->recno = r->raw_recnos[result_slots - 1]; + next->max_bnd_recno = r->raw_recnos[result_slots - 1]; break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = WT_RECNO_OOB; + next->max_bnd_recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -2919,7 +3023,7 @@ no_slots: */ WT_ASSERT(session, len > 0); WT_RET(__rec_split_row_promote_cell( - session, dsk, &next->key)); + session, dsk, &next->max_bnd_key)); } break; } @@ -2931,7 +3035,7 @@ no_slots: */ WT_STAT_DATA_INCR(session, compress_raw_fail); - dsk->recno = last->recno; + dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; r->disk_image.size = dsk->mem_size; @@ -3008,35 +3112,9 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd; + WT_BOUNDARY *bnd_cur, *bnd_prev; WT_PAGE_HEADER *dsk; - - /* Adjust the boundary information based on our split status. */ - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - case SPLIT_MAX: - /* - * We never split, the reconciled page fit into a maximum page - * size. Change the first boundary slot to represent the full - * page (the first boundary slot is largely correct, just update - * the number of entries). - */ - r->bnd_next = 0; - break; - case SPLIT_TRACKING_OFF: - /* - * If we have already split, or aren't tracking boundaries, put - * the remaining data in the next boundary slot. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - break; - case SPLIT_TRACKING_RAW: - /* - * We were configured for raw compression, and either we never - * wrote anything, or there's a remaindered block of data. - */ - break; - } + bool grow_bnd; /* * We may arrive here with no entries to write if the page was entirely @@ -3063,20 +3141,66 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - /* Set the boundary reference and increment the count. */ - bnd = &r->bnd[r->bnd_next++]; - bnd->entries = r->entries; - - /* Finalize the header information. */ dsk = r->disk_image.mem; - dsk->recno = bnd->recno; - dsk->u.entries = r->entries; + + /* Set the number of entries for the just finished chunk. */ + bnd_cur = &r->bnd[r->bnd_next]; + bnd_cur->max_bnd_entries = r->entries; + + grow_bnd = true; + /* + * We can reach here even with raw_compression when the last split chunk + * is too small to be sent for raw compression. + */ + if (!r->is_bulk_load && !r->raw_compression) { + if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && + r->bnd_next != 0) { + /* + * We hold two boundaries worth of data in the buffer, + * and this data doesn't fit in a single page. If the + * last chunk is too small, readjust the boundary to a + * pre-computed minimum. + * Write out the penultimate chunk to the disk as a page + */ + WT_RET(__rec_split_write_prev_and_shift_cur( + session, r, true)); + } else + if (r->bnd_next != 0) { + /* + * We have two boundaries, but the data in the + * buffer can fit a single page. Merge the + * boundaries to create a single chunk. + */ + bnd_prev = bnd_cur - 1; + bnd_prev->max_bnd_entries += + bnd_cur->max_bnd_entries; + r->bnd_next--; + grow_bnd = false; + } + } + + /* + * We already have space for an extra boundary if we merged two + * boundaries above, in that case we do not need to grow the boundary + * structure. + */ + if (grow_bnd) + WT_RET(__rec_split_bnd_grow(session, r)); + bnd_cur = &r->bnd[r->bnd_next]; + r->bnd_next++; + + /* + * Current boundary now has all the remaining data/last page now. + * Let's write it to the disk + */ + dsk->recno = bnd_cur->max_bnd_recno; + dsk->u.entries = bnd_cur->max_bnd_entries; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); r->disk_image.size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ - return (__rec_is_checkpoint(session, r, bnd) ? - 0 : __rec_split_write(session, r, bnd, &r->disk_image, true)); + return (__rec_is_checkpoint(session, r, bnd_cur) ? + 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); } /* @@ -3110,98 +3234,6 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __rec_split_fixup -- - * Fix up after crossing the maximum page boundary. - */ -static int -__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) -{ - WT_BOUNDARY *bnd; - WT_BTREE *btree; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_PAGE_HEADER *dsk; - size_t i, len; - uint8_t *dsk_start, *p; - - /* - * When we overflow physical limits of the page, we walk the list of - * split chunks we've created and write those pages out, then update - * the caller's information. - */ - btree = S2BT(session); - - /* - * The data isn't laid out on a page boundary or nul padded; copy it to - * a clean, aligned, padded buffer before writing it. - * - * Allocate a scratch buffer to hold the new disk image. Copy the disk - * page's header and block-manager space into the scratch buffer, most - * of the header information remains unchanged between the pages. - */ - WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); - dsk = tmp->mem; - memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); - - /* - * For each split chunk we've created, update the disk image and copy - * it into place. - */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { - /* Copy the page contents to the temporary buffer. */ - len = (bnd + 1)->offset - bnd->offset; - memcpy(dsk_start, - (uint8_t *)r->disk_image.mem + bnd->offset, len); - - /* Finalize the header information and write the page. */ - dsk->recno = bnd->recno; - dsk->u.entries = bnd->entries; - tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; - dsk->mem_size = WT_STORE_SIZE(tmp->size); - WT_ERR(__rec_split_write(session, r, bnd, tmp, false)); - } - - /* - * There is probably a remnant in the working buffer that didn't get - * written, copy it down to the beginning of the working buffer. - * - * Confirm the remnant is no larger than a split-sized chunk, including - * header. We know that's the maximum sized remnant because we only have - * remnants if split switches from accumulating to a split boundary to - * accumulating to the end of the page (the other path here is when we - * hit a split boundary, there was room for another split chunk in the - * page, and the next item still wouldn't fit, in which case there is no - * remnant). So: we were accumulating to the end of the page and created - * a remnant. We know the remnant cannot be as large as a split-sized - * chunk, including header, because if there was room for that large a - * remnant, we wouldn't have switched from accumulating to a page end. - */ - p = (uint8_t *)r->disk_image.mem + bnd->offset; - len = WT_PTRDIFF(r->first_free, p); - if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - WT_PANIC_ERR(session, EINVAL, - "Reconciliation remnant too large for the split buffer"); - dsk = r->disk_image.mem; - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, p, len); - - /* - * Fix up our caller's information, including updating the starting - * record number. - */ - r->entries -= r->total_entries; - r->first_free = dsk_start + len; - WT_ASSERT(session, - r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); - r->space_avail = - r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* * __rec_split_write -- * Write a disk block out for the split helper functions. */ @@ -3238,8 +3270,6 @@ __rec_split_write(WT_SESSION_IMPL *session, F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } - bnd->entries = r->entries; - /* Initialize the address (set the page type for the parent). */ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -3285,7 +3315,8 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + if (WT_INSERT_RECNO(supd->ins) >= + (bnd + 1)->max_bnd_recno) goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: @@ -3296,8 +3327,8 @@ __rec_split_write(WT_SESSION_IMPL *session, key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); } - WT_ERR(__wt_compare(session, - btree->collator, key, &(bnd + 1)->key, &cmp)); + WT_ERR(__wt_compare(session, btree->collator, + key, &(bnd + 1)->max_bnd_key, &cmp)); if (cmp >= 0) goto supd_check_complete; break; @@ -3387,14 +3418,14 @@ supd_check_complete: #ifdef HAVE_VERBOSE /* Output a verbose message if we create a page without many entries */ - if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6) + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && + bnd->max_bnd_entries < 6) __wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s, split state: %d", - r->entries, r->page->memory_footprint, r->bnd_next, - F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint", - r->bnd_state); + ", page count %" PRIu32 ", %s", bnd->max_bnd_entries, + r->page->memory_footprint, r->bnd_next, + F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint"); #endif WT_ERR(__wt_bt_write(session, buf, addr, &addr_size, @@ -3680,11 +3711,12 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) cursor->value.data, cursor->value.size, (uint64_t)0)); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) - WT_RET( - __rec_split_raw(session, r, key->len + val->len)); - else { + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3696,10 +3728,9 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } - - WT_RET(__rec_split(session, r, key->len + val->len)); + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -3740,6 +3771,10 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, cbulk->entry, __bitstr_size( @@ -3844,10 +3879,12 @@ __wt_bulk_insert_var( r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd(session, r, val->len)); /* Copy the value onto the page. */ if (btree->dictionary) @@ -3983,10 +4020,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_ERR(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_ERR(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_ERR(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4028,10 +4068,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4139,6 +4182,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); @@ -4295,10 +4342,13 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -4961,11 +5011,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * In one path above, we copied address blocks * from the page rather than building the actual @@ -4977,10 +5028,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_IKEY_DATA(ikey), ikey->size)); key_onpage_ovfl = false; } - WT_ERR(__rec_split( + + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5030,10 +5081,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, key->len + val->len) : - __rec_split(session, r, key->len + val->len)); + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5362,16 +5417,17 @@ build: } /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* - * In one path above, we copied address blocks - * from the page rather than building the actual - * key. In that case, we have to build the key - * now because we are about to promote it. + * If we copied address blocks from the page + * rather than building the actual key, we have + * to build the key now because we are about to + * promote it. */ if (key_onpage_ovfl) { WT_ERR(__wt_dsk_cell_data_ref(session, @@ -5390,14 +5446,13 @@ build: if (!ovfl_key) WT_ERR( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_ERR(__rec_split( + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5460,11 +5515,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -5476,14 +5532,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (!ovfl_key) WT_RET( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_RET(__rec_split( + WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5595,13 +5650,14 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) __wt_verbose(session, WT_VERB_SPLIT, "starting key %s", __wt_buf_set_printable( - session, bnd->key.data, bnd->key.size, tkey)); + session, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, tkey)); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: __wt_verbose(session, WT_VERB_SPLIT, - "starting recno %" PRIu64, bnd->recno); + "starting recno %" PRIu64, bnd->max_bnd_recno); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -5863,10 +5919,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* We never set the first page's key, grab it from the original page. */ ref = r->ref; if (__wt_ref_is_root(ref)) - WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1)); else { __wt_ref_key(ref->home, ref, &p, &size); - WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size)); } /* Allocate, then initialize the array of replacement blocks. */ @@ -5874,8 +5930,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey_alloc(session, 0, - bnd->key.data, bnd->key.size, &multi->key.ikey)); + WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, &multi->key.ikey)); /* * Copy any disk image. Don't take saved updates without a @@ -5922,7 +5978,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - multi->key.recno = bnd->recno; + multi->key.recno = bnd->max_bnd_recno; /* * Copy any disk image. Don't take saved updates without a |