author    Sulabh Mahajan <sulabh.mahajan@mongodb.com>    2017-06-21 09:27:00 +1000
committer GitHub <noreply@github.com>    2017-06-21 09:27:00 +1000
commit    a79e471dada3a1a1aa298575882df4f4fb3aafaa (patch)
tree      74aaa324d8f1fa32cae016a2284c61dc687c9194
parent    c455dcfd99c4311838a194df917b63ceb61876f3 (diff)
download  mongo-a79e471dada3a1a1aa298575882df4f4fb3aafaa.tar.gz
WT-3251 remove interim buffer when splitting during reconciliation (#3453)
* Don't initialize second buffer unless needed
* When moving data from previous to current image, grow buffer if needed
* Fix a bug caused by keeping pointer across realloc
* Address Dave's comments
* Address Keith's review comments
* Address comments
-rw-r--r--  src/reconcile/rec_write.c  407
1 file changed, 226 insertions, 181 deletions
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index f7df73c4ecb..4c79893bd94 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -25,12 +25,25 @@ typedef struct {
WT_PAGE *page;
uint32_t flags; /* Caller's configuration */
- WT_ITEM disk_image; /* Temporary disk-image buffer */
/*
- * Temporary buffer used to write out a disk image when managing two
- * chunks worth of data in memory
- */
- WT_ITEM *interim_buf;
+	 * Reconciliation can end up requiring two temporary disk image
+	 * buffers if a page split is involved. These two disk images are
+	 * pointed to by the current and previous image pointers. During
+	 * initialization the first image is allocated and pointed to by
+	 * the current image pointer. If and when a split is involved, the
+	 * second image gets allocated and is pointed to by the current
+	 * image pointer, and the previous image pointer is made to refer
+	 * to the first image. Two images are kept in memory so that data
+	 * can be redistributed between them if the last split chunk ends
+	 * up smaller than the minimum required. As reconciliation
+	 * generates more split chunks, the image referred to by the
+	 * previous image pointer is written to disk, and the current and
+	 * previous image pointers are swapped, making space for another
+	 * split chunk to be reconciled in the buffer that was just
+	 * written out.
+ */
+ WT_ITEM disk_image[2]; /* Temporary disk-image buffers */
+ WT_ITEM *cur_img_ptr;
+ WT_ITEM *prev_img_ptr;
/*
* Track start/stop write generation to decide if all changes to the
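The struct comment above describes a ping-pong arrangement between two disk-image buffers. The following is a rough, self-contained sketch of that idea (toy types, names and sizes, not WiredTiger code): a loop fills the "current" image, and at each split boundary it flushes whatever the "previous" pointer refers to (bringing the second buffer into use the first time), then swaps the two pointers; the last two chunks remain in memory at the end.

/*
 * Toy sketch of the two-buffer split scheme; hypothetical names/sizes.
 */
#include <stdio.h>

#define CHUNK_SIZE 8

struct image {
	char data[CHUNK_SIZE];
	size_t len;
};

/* Stand-in for writing a finished split chunk to disk. */
static void
flush_chunk(const struct image *img)
{
	printf("write chunk: %.*s\n", (int)img->len, img->data);
}

int
main(void)
{
	struct image bufs[2] = {{{0}, 0}, {{0}, 0}};
	struct image *cur = &bufs[0], *prev = NULL, *tmp;
	const char *input = "abcdefghijklmnop";
	size_t i;

	for (i = 0; input[i] != '\0'; ++i) {
		if (cur->len == CHUNK_SIZE) {
			/*
			 * Split boundary: write the previous chunk if there
			 * is one, otherwise bring the second buffer into use.
			 */
			if (prev != NULL)
				flush_chunk(prev);
			else
				prev = &bufs[1];
			/* Swap current and previous, reusing the flushed one. */
			tmp = prev;
			prev = cur;
			cur = tmp;
			cur->len = 0;
		}
		cur->data[cur->len++] = input[i];
	}

	/*
	 * The last two chunks are still in memory here and could be merged
	 * or rebalanced before the final writes, as the real code does.
	 */
	if (prev != NULL)
		flush_chunk(prev);
	flush_chunk(cur);
	return (0);
}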
@@ -146,17 +159,6 @@ typedef struct {
* that references all of our split pages.
*/
struct __rec_boundary {
- /*
- * Offset is the byte offset in the initial split buffer of the
- * first byte of the split chunk, recorded before we decide to
- * split the page; the difference between chunk[1]'s offset and
- * chunk[0]'s offset is chunk[0]'s length.
- *
- * Once we split a page, we stop filling in offset values, we're
- * writing the split chunks as we find them.
- */
- size_t offset; /* Split's first byte */
-
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
uint32_t checksum; /* Split's checksum */
@@ -832,7 +834,8 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->last = &r->_last;
/* Disk buffers need to be aligned for writing. */
- F_SET(&r->disk_image, WT_ITEM_ALIGNED);
+ F_SET(&r->disk_image[0], WT_ITEM_ALIGNED);
+ F_SET(&r->disk_image[1], WT_ITEM_ALIGNED);
}
/* Reconciliation is not re-entrant, make sure that doesn't happen. */
@@ -977,8 +980,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
return;
*(WT_RECONCILE **)reconcilep = NULL;
- __wt_buf_free(session, &r->disk_image);
- __wt_scr_free(session, &r->interim_buf);
+ __wt_buf_free(session, &r->disk_image[0]);
+ __wt_buf_free(session, &r->disk_image[1]);
__wt_free(session, r->raw_entries);
__wt_free(session, r->raw_offsets);
@@ -1766,7 +1769,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
*/
WT_ASSERT(session, r->space_avail >= size);
WT_ASSERT(session, WT_BLOCK_FITS(
- r->first_free, size, r->disk_image.mem, r->disk_image.memsize));
+ r->first_free, size, r->cur_img_ptr->mem, r->cur_img_ptr->memsize));
r->entries += v;
r->space_avail -= size;
@@ -1853,7 +1856,7 @@ __rec_dict_replace(
* copy cell instead.
*/
if (dp->offset == 0)
- dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem);
+ dp->offset = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem);
else {
/*
* The offset is the byte offset from this cell to the previous,
@@ -1861,7 +1864,7 @@ __rec_dict_replace(
* page.
*/
offset = (uint64_t)WT_PTRDIFF(r->first_free,
- (uint8_t *)r->disk_image.mem + dp->offset);
+ (uint8_t *)r->cur_img_ptr->mem + dp->offset);
val->len = val->cell_len =
__wt_cell_pack_copy(&val->cell, rle, offset);
val->buf.data = NULL;
@@ -1997,7 +2000,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
static void
__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
{
- bnd->offset = 0;
bnd->max_bnd_recno = WT_RECNO_OOB;
bnd->max_bnd_entries = 0;
@@ -2210,15 +2212,14 @@ __rec_split_init(WT_SESSION_IMPL *session,
* Ensure the disk image buffer is large enough for the max object, as
* corrected by the underlying block manager.
*
- * The buffer that we build disk image in, needs to hold two chunks
- * worth of data. Since we want to support split_size more than the page
- * size (to allow for adjustments based on the compression), this buffer
- * should be greater of twice of split_size and page_size.
+	 * Since we want to support a split_size larger than the page size (to
+	 * allow for adjustments based on the compression), this buffer should
+	 * be the greater of split_size and page_size.
*/
corrected_page_size = r->page_size;
- disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size);
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size));
+ disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size);
+ WT_RET(__wt_buf_init(session, &r->disk_image[0], disk_img_buf_size));
/*
* Clear the disk page header to ensure all of it is initialized, even
@@ -2228,15 +2229,17 @@ __rec_split_init(WT_SESSION_IMPL *session,
* fixed-length column-store sets bits in bytes, where the bytes are
* assumed to initially be 0.
*/
- memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
+ memset(r->disk_image[0].mem, 0, page->type == WT_PAGE_COL_FIX ?
disk_img_buf_size : WT_PAGE_HEADER_SIZE);
/*
* Set the page type (the type doesn't change, and setting it later
* would require additional code in a few different places).
*/
- dsk = r->disk_image.mem;
+ dsk = r->disk_image[0].mem;
dsk->type = page->type;
+ r->cur_img_ptr = &r->disk_image[0];
+ r->prev_img_ptr = NULL;
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
@@ -2245,7 +2248,6 @@ __rec_split_init(WT_SESSION_IMPL *session,
WT_RET(__rec_split_bnd_grow(session, r));
__rec_split_bnd_init(session, &r->bnd[0]);
r->bnd[0].max_bnd_recno = recno;
- r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
/* Initialize the entry counter. */
r->entries = 0;
@@ -2451,21 +2453,18 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
{
WT_BM *bm;
WT_BTREE *btree;
- size_t corrected_page_size, inuse, len;
+ size_t corrected_page_size, inuse;
btree = S2BT(session);
bm = btree->bm;
- len = WT_PTRDIFF(r->first_free, r->disk_image.mem);
- inuse = (len - r->bnd[r->bnd_next].offset) +
- WT_PAGE_HEADER_BYTE_SIZE(btree);
+ inuse = WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem);
corrected_page_size = inuse + add_len;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- /* Need to account for buffer carrying two chunks worth of data */
- WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size));
+ WT_RET(__wt_buf_grow(session, r->cur_img_ptr, corrected_page_size));
- r->first_free = (uint8_t *)r->disk_image.mem + len;
+ r->first_free = (uint8_t *)r->cur_img_ptr->mem + inuse;
WT_ASSERT(session, corrected_page_size >= inuse);
r->space_avail = corrected_page_size - inuse;
WT_ASSERT(session, r->space_avail >= add_len);
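One of the commit-message bullets fixes a bug caused by keeping a pointer across a realloc; the rewritten __rec_split_grow above accordingly re-derives r->first_free from the buffer's base after growing it. Below is a minimal standalone illustration of that hazard using plain libc realloc and made-up sizes, not the WiredTiger buffer API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	size_t inuse;
	char *mem, *first_free, *grown;

	if ((mem = malloc(16)) == NULL)
		return (1);
	memcpy(mem, "0123456789", 10);
	first_free = mem + 10;		/* Raw pointer into the buffer. */

	/* Save the offset before growing: the base address may change. */
	inuse = (size_t)(first_free - mem);
	if ((grown = realloc(mem, 4096)) == NULL) {
		free(mem);
		return (1);
	}
	mem = grown;
	/* Re-derive the pointer from the new base; the old one may dangle. */
	first_free = mem + inuse;

	printf("%zu bytes in use, next free byte at offset %zu\n",
	    inuse, (size_t)(first_free - mem));
	free(mem);
	return (0);
}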
@@ -2474,89 +2473,55 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
}
/*
- * __rec_split_write_prev_and_shift_cur --
- * Write the previous split chunk to the disk as a page. Shift the contents
- * of the current chunk to the start of the buffer, making space for a new
- * chunk to be written.
- * If the caller asks for a chunk resizing, the boundary between the two
- * chunks is readjusted to the minimum split size boundary details stored
- * in the previous chunk, letting the current chunk grow at the cost of the
- * previous chunk.
+ * __rec_split_write_prev_and_swap_buf --
+ *	If there is a previous split chunk held in memory, write it to disk
+ *	as a page. If there isn't one, this is the first split and the second
+ *	buffer needs to be initialized. In either case, swap the previous and
+ *	current buffer pointers.
*/
static int
-__rec_split_write_prev_and_shift_cur(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks)
+__rec_split_write_prev_and_swap_buf(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
- WT_BM *bm;
- WT_BOUNDARY *bnd_cur, *bnd_prev;
- WT_BTREE *btree;
- WT_PAGE_HEADER *dsk, *dsk_tmp;
- size_t cur_len, len;
- uint8_t *dsk_start;
-
- WT_ASSERT(session, r->bnd_next != 0);
-
- btree = S2BT(session);
- bm = btree->bm;
- bnd_cur = &r->bnd[r->bnd_next];
- bnd_prev = bnd_cur - 1;
- dsk = r->disk_image.mem;
- cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
-
- /*
- * Resize chunks if the current is smaller than the minimum, and there
- * are details on the minimum split size boundary available in the
- * previous boundary details.
- *
- * There is a possibility that we do not have a minimum boundary set, in
- * such a case we skip chunk resizing. Such a condition is possible for
- * instance when we are building the image in the buffer and the first
- * K/V pair is large enough that it surpasses both the minimum split
- * size and the split size the application has set. In such a case we
- * split the chunk without saving any minimum boundary.
- */
- if (resize_chunks &&
- cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) {
- bnd_cur->offset = bnd_prev->min_bnd_offset;
- bnd_cur->max_bnd_entries +=
- bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries;
- bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries;
- bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno;
-
- WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key,
- bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size));
-
- /* Update current chunk's length */
- cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
+ WT_BOUNDARY *bnd_prev;
+ WT_ITEM *tmp_img_ptr;
+ WT_PAGE_HEADER *dsk;
+ size_t disk_img_size;
+
+ WT_ASSERT(session, (r->prev_img_ptr == NULL && r->bnd_next == 0) ||
+ (r->prev_img_ptr != NULL && r->bnd_next != 0));
+
+ /* Write previous chunk, if there is one */
+ if (r->prev_img_ptr != NULL) {
+ bnd_prev = &r->bnd[r->bnd_next - 1];
+ dsk = r->prev_img_ptr->mem;
+ dsk->recno = bnd_prev->max_bnd_recno;
+ dsk->u.entries = bnd_prev->max_bnd_entries;
+ dsk->mem_size = (uint32_t)bnd_prev->size;
+ r->prev_img_ptr->size = dsk->mem_size;
+ WT_RET(__rec_split_write(session,
+ r, bnd_prev, r->prev_img_ptr, false));
+ } else {
+ /*
+ * If we do not have a previous buffer, we should initialize the
+ * second buffer before proceeding. We will create the second
+ * buffer of the same size as the current buffer.
+ */
+ disk_img_size = r->cur_img_ptr->memsize;
+ WT_RET(__wt_buf_init(session,
+ &r->disk_image[1], disk_img_size));
+ r->prev_img_ptr = &r->disk_image[1];
+ dsk = r->prev_img_ptr->mem;
+ memset(dsk, 0,
+ r->page->type == WT_PAGE_COL_FIX ?
+ disk_img_size : WT_PAGE_HEADER_SIZE);
+ dsk->type = r->page->type;
}
- /*
- * Create an interim buffer if not already done to prepare the previous
- * chunk's disk image.
- */
- len = bnd_cur->offset;
- WT_RET(bm->write_size(bm, session, &len));
- if (r->interim_buf == NULL)
- WT_RET(__wt_scr_alloc(session, len, &r->interim_buf));
- else
- WT_RET(__wt_buf_init(session, r->interim_buf, len));
-
- dsk_tmp = r->interim_buf->mem;
- memcpy(dsk_tmp, dsk, bnd_cur->offset);
- dsk_tmp->recno = bnd_prev->max_bnd_recno;
- dsk_tmp->u.entries = bnd_prev->max_bnd_entries;
- dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset);
- r->interim_buf->size = dsk_tmp->mem_size;
- WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false));
-
- /* Shift the current chunk to the start of the buffer */
- dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
- (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len);
-
- /* Fix boundary offset */
- bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
- /* Fix where free points */
- r->first_free = dsk_start + cur_len;
+ /* swap previous and current buffers */
+ tmp_img_ptr = r->prev_img_ptr;
+ r->prev_img_ptr = r->cur_img_ptr;
+ r->cur_img_ptr = tmp_img_ptr;
+
return (0);
}
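The function above allocates the second disk image lazily, only once a split first requires two chunks to be held in memory, and sizes it to match the current image. A small sketch of that allocate-on-first-use pattern follows, with a toy struct and calloc standing in for __wt_buf_init; the names are hypothetical, not WiredTiger code.

#include <stdlib.h>

struct image {
	void *mem;
	size_t memsize;
};

/* Return the peer buffer, allocating it on first use to match cur's size. */
static struct image *
lazy_second_image(const struct image *cur, struct image *other)
{
	if (other->mem == NULL) {
		if ((other->mem = calloc(1, cur->memsize)) == NULL)
			return (NULL);
		other->memsize = cur->memsize;
	}
	return (other);
}

int
main(void)
{
	struct image bufs[2] = {{NULL, 0}, {NULL, 0}};
	struct image *cur = &bufs[0], *prev = NULL;

	if ((cur->mem = calloc(1, 4096)) == NULL)
		return (1);
	cur->memsize = 4096;

	/* First split: the second buffer comes into existence only now. */
	if ((prev = lazy_second_image(cur, &bufs[1])) == NULL)
		return (1);

	free(bufs[0].mem);
	free(bufs[1].mem);
	return (0);
}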
@@ -2574,7 +2539,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
size_t inuse;
btree = S2BT(session);
- dsk = r->disk_image.mem;
+ dsk = r->cur_img_ptr->mem;
/* Fixed length col store can call with next_len 0 */
WT_ASSERT(session, next_len == 0 || r->space_avail < next_len);
@@ -2588,9 +2553,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
"%s page too large, attempted split during salvage",
__wt_page_type_string(r->page->type));
- last = &r->bnd[r->bnd_next];
- inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) +
- WT_PAGE_HEADER_BYTE_SIZE(btree);
+ inuse = WT_PTRDIFF(r->first_free, dsk);
/*
* We can get here if the first key/value pair won't fit.
@@ -2603,8 +2566,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
/* All page boundaries reset the dictionary. */
__rec_dictionary_reset(r);
- /* Set the number of entries for the just finished chunk. */
+ /* Set the number of entries and size for the just finished chunk. */
+ last = &r->bnd[r->bnd_next];
last->max_bnd_entries = r->entries;
+ last->size = (uint32_t)inuse;
/*
* In case of bulk load, write out chunks as we get them. Otherwise we
@@ -2616,19 +2581,22 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
dsk->recno = last->max_bnd_recno;
dsk->u.entries = last->max_bnd_entries;
dsk->mem_size = (uint32_t)inuse;
- r->disk_image.size = dsk->mem_size;
- WT_RET(__rec_split_write(
- session, r, last, &r->disk_image, false));
- /* Fix where free points */
- r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
- } else if (r->bnd_next != 0)
- WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false));
+ r->cur_img_ptr->size = dsk->mem_size;
+ WT_RET(__rec_split_write(session,
+ r, last, r->cur_img_ptr, false));
+ } else {
+ WT_RET(__rec_split_write_prev_and_swap_buf(session, r));
+ /* current image we are writing to has changed */
+ dsk = r->cur_img_ptr->mem;
+ }
+
+ /* Fix where free points */
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
/* Prepare the next boundary */
WT_RET(__rec_split_bnd_grow(session, r));
r->bnd_next++;
next = &r->bnd[r->bnd_next];
- next->offset = WT_PTRDIFF(r->first_free, dsk);
/* Set the key for the next chunk. */
next->max_bnd_recno = r->recno;
if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF)
@@ -2687,9 +2655,8 @@ __rec_split_crossing_bnd(
!WT_CROSSING_SPLIT_BND(r, next_len)) {
btree = S2BT(session);
bnd = &r->bnd[r->bnd_next];
- dsk = r->disk_image.mem;
- min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) -
- bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree);
+ dsk = r->cur_img_ptr->mem;
+ min_bnd_offset = WT_PTRDIFF(r->first_free, dsk);
if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree))
/*
* This is possible if the first record doesn't fit in
@@ -2750,7 +2717,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
unpack = &_unpack;
compressor = btree->compressor;
dst = &r->raw_destination;
- dsk = r->disk_image.mem;
+ dsk = r->cur_img_ptr->mem;
WT_RET(__rec_split_bnd_grow(session, r));
last = &r->bnd[r->bnd_next];
@@ -3066,7 +3033,7 @@ no_slots:
r->first_free = dsk_start + len;
r->space_avail += r->raw_offsets[result_slots];
WT_ASSERT(session, r->first_free + r->space_avail <=
- (uint8_t *)r->disk_image.mem + r->disk_image.memsize);
+ (uint8_t *)r->cur_img_ptr->mem + r->cur_img_ptr->memsize);
/*
* Set the key for the next block (before writing the block, a
@@ -3105,13 +3072,13 @@ no_slots:
dsk->recno = last->max_bnd_recno;
dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
dsk->u.entries = r->entries;
- r->disk_image.size = dsk->mem_size;
+ r->cur_img_ptr->size = dsk->mem_size;
r->entries = 0;
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
- write_ref = &r->disk_image;
+ write_ref = r->cur_img_ptr;
last->already_compressed = false;
} else {
/*
@@ -3139,7 +3106,7 @@ no_slots:
last_block && __rec_is_checkpoint(session, r, last)) {
if (write_ref == dst)
WT_RET(__wt_buf_set(
- session, &r->disk_image, dst->mem, dst->size));
+ session, r->cur_img_ptr, dst->mem, dst->size));
} else
WT_RET(
__rec_split_write(session, r, last, write_ref, last_block));
@@ -3173,15 +3140,120 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
}
/*
+ * __rec_split_finish_process_prev --
+ * If the two split chunks together fit in a single page, merge them into
+ * one. If they do not fit in a single page but the last is smaller than
+ * the minimum desired, move some data from the penultimate chunk to the
+ * last chunk and write out the previous/penultimate. Finally, update the
+ * pointer to the current image buffer. After this function exits, we will
+ * have one (last) buffer in memory, pointed to by the current image
+ * pointer.
+ */
+static int
+__rec_split_finish_process_prev(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *chunks_merged)
+{
+ WT_BOUNDARY *bnd_cur, *bnd_prev;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t len_to_move;
+ uint32_t combined_size;
+ uint8_t *cur_dsk_start;
+
+ WT_ASSERT(session, r->prev_img_ptr != NULL);
+
+ btree = S2BT(session);
+ bnd_cur = &r->bnd[r->bnd_next];
+ bnd_prev = bnd_cur - 1;
+ *chunks_merged = false;
+ /*
+ * The sizes referred to in the boundary structure include the header,
+ * so when calculating the combined size, make sure not to include the
+ * header twice.
+ */
+ combined_size = bnd_prev->size +
+ (bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree));
+
+ if (combined_size <= r->page_size) {
+ /*
+		 * We have two boundaries, but the data in the buffers fits in
+		 * a single page. Merge the boundaries into a single chunk.
+ */
+ dsk = r->cur_img_ptr->mem;
+ memcpy((uint8_t *)r->prev_img_ptr->mem + bnd_prev->size,
+ WT_PAGE_HEADER_BYTE(btree, dsk),
+ bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree));
+ bnd_prev->size = combined_size;
+ bnd_prev->max_bnd_entries += bnd_cur->max_bnd_entries;
+ r->bnd_next--;
+ *chunks_merged = true;
+ } else {
+ if (bnd_cur->size < r->min_split_size &&
+ bnd_prev->min_bnd_offset != 0 ) {
+ /*
+			 * The last chunk, pointed to by the current image
+			 * pointer, holds less than the minimum amount of
+			 * data. Move any data beyond the minimum split
+			 * boundary from the previous image into it.
+ */
+ len_to_move = bnd_prev->size - bnd_prev->min_bnd_offset;
+ /* Grow current buffer if it is not large enough */
+ if (r->space_avail < len_to_move)
+ WT_RET(__rec_split_grow(session,
+ r, len_to_move));
+ cur_dsk_start = WT_PAGE_HEADER_BYTE(btree,
+ r->cur_img_ptr->mem);
+
+ /*
+			 * Shift the contents of the current buffer to make
+			 * space for the data that will be prepended to it
+			 * from the previous buffer.
+ */
+ memmove(cur_dsk_start + len_to_move,
+ cur_dsk_start, bnd_cur->size -
+ WT_PAGE_HEADER_BYTE_SIZE(btree));
+ /*
+			 * Copy any data beyond the minimum from the previous
+			 * buffer to the start of the current buffer.
+ */
+ memcpy(cur_dsk_start, (uint8_t *)r->prev_img_ptr->mem +
+ bnd_prev->min_bnd_offset, len_to_move);
+
+ /* Update boundary information */
+ bnd_cur->size += len_to_move;
+ bnd_prev->size -= len_to_move;
+ bnd_cur->max_bnd_entries += bnd_prev->max_bnd_entries -
+ bnd_prev->min_bnd_entries;
+ bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries;
+ bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno;
+ WT_RET(__wt_buf_set(session,
+ &bnd_cur->max_bnd_key, bnd_prev->min_bnd_key.data,
+ bnd_prev->min_bnd_key.size));
+ }
+
+ /* Write out the previous image */
+ WT_RET(__rec_split_write_prev_and_swap_buf(session, r));
+ }
+
+ /*
+	 * At this point, there is only one disk image in memory, pointed to
+	 * by the previous image pointer. Update the current image pointer to
+	 * refer to this image.
+ */
+ r->cur_img_ptr = r->prev_img_ptr;
+ return (0);
+}
+
+/*
* __rec_split_finish_std --
* Finish processing a page, standard version.
*/
static int
__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
- WT_BOUNDARY *bnd_cur, *bnd_prev;
+ WT_BOUNDARY *bnd_cur;
WT_PAGE_HEADER *dsk;
- bool grow_bnd;
+ bool chunks_merged;
/*
* We may arrive here with no entries to write if the page was entirely
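__rec_split_finish_process_prev above decides, for the final two chunks, whether to merge them into a single page or to top up an undersized last chunk with the part of the previous chunk that lies beyond its minimum split boundary. The toy model below walks that arithmetic with made-up header, page and minimum-split sizes; it is an illustration of the decision only, not WiredTiger code.

#include <stdio.h>

#define HDR		64	/* stand-in for WT_PAGE_HEADER_BYTE_SIZE */
#define PAGE_SIZE	4096
#define MIN_SPLIT	2048

int
main(void)
{
	unsigned prev_size = 3800;	/* previous chunk, includes header */
	unsigned prev_min_off = 2100;	/* minimum split boundary offset */
	unsigned cur_size = 900;	/* last chunk, includes header */
	unsigned combined, moved;

	/* The header is counted once when the chunks are glued together. */
	combined = prev_size + (cur_size - HDR);
	if (combined <= PAGE_SIZE) {
		printf("merge into one %u-byte page\n", combined);
		return (0);
	}
	if (cur_size < MIN_SPLIT && prev_min_off != 0) {
		/* Move everything past the minimum boundary to the last chunk. */
		moved = prev_size - prev_min_off;
		prev_size -= moved;
		cur_size += moved;
	}
	printf("write prev (%u bytes), keep last (%u bytes)\n",
	    prev_size, cur_size);
	return (0);
}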
@@ -3208,50 +3280,22 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
return (EBUSY);
}
- dsk = r->disk_image.mem;
-
- /* Set the number of entries for the just finished chunk. */
+ /* Set the number of entries and size for the just finished chunk. */
bnd_cur = &r->bnd[r->bnd_next];
bnd_cur->max_bnd_entries = r->entries;
+ bnd_cur->size = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem);
- grow_bnd = true;
- /*
- * We can reach here even with raw_compression when the last split chunk
- * is too small to be sent for raw compression.
- */
- if (!r->is_bulk_load && !r->raw_compression) {
- if (WT_PTRDIFF(r->first_free, dsk) > r->page_size &&
- r->bnd_next != 0) {
- /*
- * We hold two boundaries worth of data in the buffer,
- * and this data doesn't fit in a single page. If the
- * last chunk is too small, readjust the boundary to a
- * pre-computed minimum.
- * Write out the penultimate chunk to the disk as a page
- */
- WT_RET(__rec_split_write_prev_and_shift_cur(
- session, r, true));
- } else
- if (r->bnd_next != 0) {
- /*
- * We have two boundaries, but the data in the
- * buffer can fit a single page. Merge the
- * boundaries to create a single chunk.
- */
- bnd_prev = bnd_cur - 1;
- bnd_prev->max_bnd_entries +=
- bnd_cur->max_bnd_entries;
- r->bnd_next--;
- grow_bnd = false;
- }
- }
+ chunks_merged = false;
+ if (r->prev_img_ptr != NULL)
+ WT_RET(__rec_split_finish_process_prev(session,
+ r, &chunks_merged));
/*
* We already have space for an extra boundary if we merged two
* boundaries above, in that case we do not need to grow the boundary
* structure.
*/
- if (grow_bnd)
+ if (!chunks_merged)
WT_RET(__rec_split_bnd_grow(session, r));
bnd_cur = &r->bnd[r->bnd_next];
r->bnd_next++;
@@ -3260,14 +3304,15 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* Current boundary now has all the remaining data/last page now.
* Let's write it to the disk
*/
+ dsk = r->cur_img_ptr->mem;
dsk->recno = bnd_cur->max_bnd_recno;
dsk->u.entries = bnd_cur->max_bnd_entries;
- dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
- r->disk_image.size = dsk->mem_size;
+ dsk->mem_size = bnd_cur->size;
+ r->cur_img_ptr->size = dsk->mem_size;
/* If this is a checkpoint, we're done, otherwise write the page. */
return (__rec_is_checkpoint(session, r, bnd_cur) ?
- 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true));
+ 0 : __rec_split_write(session, r, bnd_cur, r->cur_img_ptr, true));
}
/*
@@ -3289,7 +3334,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (r->raw_compression && r->entries != 0) {
while (r->entries != 0) {
data_size =
- WT_PTRDIFF(r->first_free, r->disk_image.mem);
+ WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem);
if (data_size <= btree->allocsize)
break;
WT_RET(__rec_split_raw_worker(session, r, 0, true));
@@ -5882,7 +5927,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* write the buffer so we know what to do here.
*/
if (bnd->addr.addr == NULL)
- WT_RET(__wt_bt_write(session, &r->disk_image,
+ WT_RET(__wt_bt_write(session, r->cur_img_ptr,
NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING),
bnd->already_compressed));
else {
@@ -6546,7 +6591,7 @@ __rec_dictionary_lookup(
for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
dp != NULL && dp->hash == hash; dp = dp->next[0]) {
WT_RET(__wt_cell_pack_data_match(
- (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset),
+ (WT_CELL *)((uint8_t *)r->cur_img_ptr->mem + dp->offset),
&val->cell, val->buf.data, &match));
if (match) {
WT_STAT_DATA_INCR(session, rec_dictionary);