diff options
author | Sulabh Mahajan <sulabh.mahajan@mongodb.com> | 2017-06-21 09:27:00 +1000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-21 09:27:00 +1000 |
commit | a79e471dada3a1a1aa298575882df4f4fb3aafaa (patch) | |
tree | 74aaa324d8f1fa32cae016a2284c61dc687c9194 | |
parent | c455dcfd99c4311838a194df917b63ceb61876f3 (diff) | |
download | mongo-a79e471dada3a1a1aa298575882df4f4fb3aafaa.tar.gz |
WT-3251 remove interim buffer when splitting during reconciliation (#3453)
* Don't initialize second buffer unless needed
* When moving data from previous to current image, grow buffer if needed
* Fix a bug caused by keeping pointer across realloc
* Address Dave's comments
* Address Keith's review comments
* Address comments
-rw-r--r-- | src/reconcile/rec_write.c | 407 |
1 file changed, 226 insertions, 181 deletions
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index f7df73c4ecb..4c79893bd94 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -25,12 +25,25 @@ typedef struct { WT_PAGE *page; uint32_t flags; /* Caller's configuration */ - WT_ITEM disk_image; /* Temporary disk-image buffer */ /* - * Temporary buffer used to write out a disk image when managing two - * chunks worth of data in memory - */ - WT_ITEM *interim_buf; + * Reconciliation can end up requiring two temporary disk image buffers + * if a page split is involved. These two disk images are pointed to by + * current and the previous image pointers. During initialization the + * first image is allocated and pointed to by the current image pointer. + * If and when a split is involved the second image gets allocated and + * is pointed to by the current image pointer. The previous image + * pointer is made to refer the first image at this point. Two images + * are kept in memory to redistribute data among them in case the last + * split chunk ends up being smaller than the minimum required. As + * reconciliation generates more split chunks, the image referred to by + * the previous image pointer is written to the disk, the current and + * the previous image pointers are swapped, making space for another + * split chunk to be reconciled in the buffer that was just written out + * to the disk. + */ + WT_ITEM disk_image[2]; /* Temporary disk-image buffers */ + WT_ITEM *cur_img_ptr; + WT_ITEM *prev_img_ptr; /* * Track start/stop write generation to decide if all changes to the @@ -146,17 +159,6 @@ typedef struct { * that references all of our split pages. */ struct __rec_boundary { - /* - * Offset is the byte offset in the initial split buffer of the - * first byte of the split chunk, recorded before we decide to - * split the page; the difference between chunk[1]'s offset and - * chunk[0]'s offset is chunk[0]'s length. 
- * - * Once we split a page, we stop filling in offset values, we're - * writing the split chunks as we find them. - */ - size_t offset; /* Split's first byte */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -832,7 +834,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->last = &r->_last; /* Disk buffers need to be aligned for writing. */ - F_SET(&r->disk_image, WT_ITEM_ALIGNED); + F_SET(&r->disk_image[0], WT_ITEM_ALIGNED); + F_SET(&r->disk_image[1], WT_ITEM_ALIGNED); } /* Reconciliation is not re-entrant, make sure that doesn't happen. */ @@ -977,8 +980,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) return; *(WT_RECONCILE **)reconcilep = NULL; - __wt_buf_free(session, &r->disk_image); - __wt_scr_free(session, &r->interim_buf); + __wt_buf_free(session, &r->disk_image[0]); + __wt_buf_free(session, &r->disk_image[1]); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1766,7 +1769,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) */ WT_ASSERT(session, r->space_avail >= size); WT_ASSERT(session, WT_BLOCK_FITS( - r->first_free, size, r->disk_image.mem, r->disk_image.memsize)); + r->first_free, size, r->cur_img_ptr->mem, r->cur_img_ptr->memsize)); r->entries += v; r->space_avail -= size; @@ -1853,7 +1856,7 @@ __rec_dict_replace( * copy cell instead. */ if (dp->offset == 0) - dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem); + dp->offset = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); else { /* * The offset is the byte offset from this cell to the previous, @@ -1861,7 +1864,7 @@ __rec_dict_replace( * page. 
*/ offset = (uint64_t)WT_PTRDIFF(r->first_free, - (uint8_t *)r->disk_image.mem + dp->offset); + (uint8_t *)r->cur_img_ptr->mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy(&val->cell, rle, offset); val->buf.data = NULL; @@ -1997,7 +2000,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { - bnd->offset = 0; bnd->max_bnd_recno = WT_RECNO_OOB; bnd->max_bnd_entries = 0; @@ -2210,15 +2212,14 @@ __rec_split_init(WT_SESSION_IMPL *session, * Ensure the disk image buffer is large enough for the max object, as * corrected by the underlying block manager. * - * The buffer that we build disk image in, needs to hold two chunks - * worth of data. Since we want to support split_size more than the page - * size (to allow for adjustments based on the compression), this buffer - * should be greater of twice of split_size and page_size. + * Since we want to support split_size more than the page size (to allow + * for adjustments based on the compression), this buffer should be + * greater of split_size and page_size. */ corrected_page_size = r->page_size; - disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size); + WT_RET(__wt_buf_init(session, &r->disk_image[0], disk_img_buf_size)); /* * Clear the disk page header to ensure all of it is initialized, even @@ -2228,15 +2229,17 @@ __rec_split_init(WT_SESSION_IMPL *session, * fixed-length column-store sets bits in bytes, where the bytes are * assumed to initially be 0. */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + memset(r->disk_image[0].mem, 0, page->type == WT_PAGE_COL_FIX ? 
disk_img_buf_size : WT_PAGE_HEADER_SIZE); /* * Set the page type (the type doesn't change, and setting it later * would require additional code in a few different places). */ - dsk = r->disk_image.mem; + dsk = r->disk_image[0].mem; dsk->type = page->type; + r->cur_img_ptr = &r->disk_image[0]; + r->prev_img_ptr = NULL; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -2245,7 +2248,6 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); r->bnd[0].max_bnd_recno = recno; - r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); /* Initialize the entry counter. */ r->entries = 0; @@ -2451,21 +2453,18 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, inuse, len; + size_t corrected_page_size, inuse; btree = S2BT(session); bm = btree->bm; - len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - inuse = (len - r->bnd[r->bnd_next].offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); corrected_page_size = inuse + add_len; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - /* Need to account for buffer carrying two chunks worth of data */ - WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + WT_RET(__wt_buf_grow(session, r->cur_img_ptr, corrected_page_size)); - r->first_free = (uint8_t *)r->disk_image.mem + len; + r->first_free = (uint8_t *)r->cur_img_ptr->mem + inuse; WT_ASSERT(session, corrected_page_size >= inuse); r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); @@ -2474,89 +2473,55 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) } /* - * __rec_split_write_prev_and_shift_cur -- - * Write the previous split chunk to the disk as a page. Shift the contents - * of the current chunk to the start of the buffer, making space for a new - * chunk to be written. 
- * If the caller asks for a chunk resizing, the boundary between the two - * chunks is readjusted to the minimum split size boundary details stored - * in the previous chunk, letting the current chunk grow at the cost of the - * previous chunk. + * __rec_split_write_prev_and_swap_buf -- + * If there is a previous split chunk held in the memory, write it to the + * disk as a page. If there isn't one, this is the first time we are + * splitting and need to initialize a second buffer. Also, swap the + * previous and the current buffer pointers. */ static int -__rec_split_write_prev_and_shift_cur( - WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +__rec_split_write_prev_and_swap_buf(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BM *bm; - WT_BOUNDARY *bnd_cur, *bnd_prev; - WT_BTREE *btree; - WT_PAGE_HEADER *dsk, *dsk_tmp; - size_t cur_len, len; - uint8_t *dsk_start; - - WT_ASSERT(session, r->bnd_next != 0); - - btree = S2BT(session); - bm = btree->bm; - bnd_cur = &r->bnd[r->bnd_next]; - bnd_prev = bnd_cur - 1; - dsk = r->disk_image.mem; - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; - - /* - * Resize chunks if the current is smaller than the minimum, and there - * are details on the minimum split size boundary available in the - * previous boundary details. - * - * There is a possibility that we do not have a minimum boundary set, in - * such a case we skip chunk resizing. Such a condition is possible for - * instance when we are building the image in the buffer and the first - * K/V pair is large enough that it surpasses both the minimum split - * size and the split size the application has set. In such a case we - * split the chunk without saving any minimum boundary. 
- */ - if (resize_chunks && - cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { - bnd_cur->offset = bnd_prev->min_bnd_offset; - bnd_cur->max_bnd_entries += - bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; - bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; - bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; - - WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, - bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); - - /* Update current chunk's length */ - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + WT_BOUNDARY *bnd_prev; + WT_ITEM *tmp_img_ptr; + WT_PAGE_HEADER *dsk; + size_t disk_img_size; + + WT_ASSERT(session, (r->prev_img_ptr == NULL && r->bnd_next == 0) || + (r->prev_img_ptr != NULL && r->bnd_next != 0)); + + /* Write previous chunk, if there is one */ + if (r->prev_img_ptr != NULL) { + bnd_prev = &r->bnd[r->bnd_next - 1]; + dsk = r->prev_img_ptr->mem; + dsk->recno = bnd_prev->max_bnd_recno; + dsk->u.entries = bnd_prev->max_bnd_entries; + dsk->mem_size = (uint32_t)bnd_prev->size; + r->prev_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, bnd_prev, r->prev_img_ptr, false)); + } else { + /* + * If we do not have a previous buffer, we should initialize the + * second buffer before proceeding. We will create the second + * buffer of the same size as the current buffer. + */ + disk_img_size = r->cur_img_ptr->memsize; + WT_RET(__wt_buf_init(session, + &r->disk_image[1], disk_img_size)); + r->prev_img_ptr = &r->disk_image[1]; + dsk = r->prev_img_ptr->mem; + memset(dsk, 0, + r->page->type == WT_PAGE_COL_FIX ? + disk_img_size : WT_PAGE_HEADER_SIZE); + dsk->type = r->page->type; } - /* - * Create an interim buffer if not already done to prepare the previous - * chunk's disk image. 
- */ - len = bnd_cur->offset; - WT_RET(bm->write_size(bm, session, &len)); - if (r->interim_buf == NULL) - WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); - else - WT_RET(__wt_buf_init(session, r->interim_buf, len)); - - dsk_tmp = r->interim_buf->mem; - memcpy(dsk_tmp, dsk, bnd_cur->offset); - dsk_tmp->recno = bnd_prev->max_bnd_recno; - dsk_tmp->u.entries = bnd_prev->max_bnd_entries; - dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); - r->interim_buf->size = dsk_tmp->mem_size; - WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); - - /* Shift the current chunk to the start of the buffer */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); - - /* Fix boundary offset */ - bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* Fix where free points */ - r->first_free = dsk_start + cur_len; + /* swap previous and current buffers */ + tmp_img_ptr = r->prev_img_ptr; + r->prev_img_ptr = r->cur_img_ptr; + r->cur_img_ptr = tmp_img_ptr; + return (0); } @@ -2574,7 +2539,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) size_t inuse; btree = S2BT(session); - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; /* Fixed length col store can call with next_len 0 */ WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); @@ -2588,9 +2553,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - last = &r->bnd[r->bnd_next]; - inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, dsk); /* * We can get here if the first key/value pair won't fit. @@ -2603,8 +2566,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* All page boundaries reset the dictionary. 
*/ __rec_dictionary_reset(r); - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ + last = &r->bnd[r->bnd_next]; last->max_bnd_entries = r->entries; + last->size = (uint32_t)inuse; /* * In case of bulk load, write out chunks as we get them. Otherwise we @@ -2616,19 +2581,22 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) dsk->recno = last->max_bnd_recno; dsk->u.entries = last->max_bnd_entries; dsk->mem_size = (uint32_t)inuse; - r->disk_image.size = dsk->mem_size; - WT_RET(__rec_split_write( - session, r, last, &r->disk_image, false)); - /* Fix where free points */ - r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - } else if (r->bnd_next != 0) - WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); + r->cur_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, last, r->cur_img_ptr, false)); + } else { + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + /* current image we are writing to has changed */ + dsk = r->cur_img_ptr->mem; + } + + /* Fix where free points */ + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Prepare the next boundary */ WT_RET(__rec_split_bnd_grow(session, r)); r->bnd_next++; next = &r->bnd[r->bnd_next]; - next->offset = WT_PTRDIFF(r->first_free, dsk); /* Set the key for the next chunk. 
*/ next->max_bnd_recno = r->recno; if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) @@ -2687,9 +2655,8 @@ __rec_split_crossing_bnd( !WT_CROSSING_SPLIT_BND(r, next_len)) { btree = S2BT(session); bnd = &r->bnd[r->bnd_next]; - dsk = r->disk_image.mem; - min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - - bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + dsk = r->cur_img_ptr->mem; + min_bnd_offset = WT_PTRDIFF(r->first_free, dsk); if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) /* * This is possible if the first record doesn't fit in @@ -2750,7 +2717,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, unpack = &_unpack; compressor = btree->compressor; dst = &r->raw_destination; - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next]; @@ -3066,7 +3033,7 @@ no_slots: r->first_free = dsk_start + len; r->space_avail += r->raw_offsets[result_slots]; WT_ASSERT(session, r->first_free + r->space_avail <= - (uint8_t *)r->disk_image.mem + r->disk_image.memsize); + (uint8_t *)r->cur_img_ptr->mem + r->cur_img_ptr->memsize); /* * Set the key for the next block (before writing the block, a @@ -3105,13 +3072,13 @@ no_slots: dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; - r->disk_image.size = dsk->mem_size; + r->cur_img_ptr->size = dsk->mem_size; r->entries = 0; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - write_ref = &r->disk_image; + write_ref = r->cur_img_ptr; last->already_compressed = false; } else { /* @@ -3139,7 +3106,7 @@ no_slots: last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( - session, &r->disk_image, dst->mem, dst->size)); + session, r->cur_img_ptr, dst->mem, dst->size)); } else WT_RET( __rec_split_write(session, r, last, write_ref, last_block)); @@ -3173,15 +3140,120 @@ 
__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) } /* + * __rec_split_finish_process_prev -- + * If the two split chunks together fit in a single page, merge them into + * one. If they do not fit in a single page but the last is smaller than + * the minimum desired, move some data from the penultimate chunk to the + * last chunk and write out the previous/penultimate. Finally, update the + * pointer to the current image buffer. After this function exits, we will + * have one (last) buffer in memory, pointed to by the current image + * pointer. + */ +static int +__rec_split_finish_process_prev( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *chunks_merged) +{ + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t len_to_move; + uint32_t combined_size; + uint8_t *cur_dsk_start; + + WT_ASSERT(session, r->prev_img_ptr != NULL); + + btree = S2BT(session); + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + *chunks_merged = false; + /* + * The sizes referred to in the boundary structure include the header, + * so when calculating the combined size, make sure not to include the + * header twice. + */ + combined_size = bnd_prev->size + + (bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + + if (combined_size <= r->page_size) { + /* + * We have two boundaries, but the data in the buffers can fit a + * single page. Merge the boundaries and create a single chunk. + */ + dsk = r->cur_img_ptr->mem; + memcpy((uint8_t *)r->prev_img_ptr->mem + bnd_prev->size, + WT_PAGE_HEADER_BYTE(btree, dsk), + bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + bnd_prev->size = combined_size; + bnd_prev->max_bnd_entries += bnd_cur->max_bnd_entries; + r->bnd_next--; + *chunks_merged = true; + } else { + if (bnd_cur->size < r->min_split_size && + bnd_prev->min_bnd_offset != 0 ) { + /* + * The last chunk, pointed to by the current image + * pointer, has less than the minimum data. 
Let's move + * any data more than the minimum from the previous + * image into the current. + */ + len_to_move = bnd_prev->size - bnd_prev->min_bnd_offset; + /* Grow current buffer if it is not large enough */ + if (r->space_avail < len_to_move) + WT_RET(__rec_split_grow(session, + r, len_to_move)); + cur_dsk_start = WT_PAGE_HEADER_BYTE(btree, + r->cur_img_ptr->mem); + + /* + * Shift the contents of the current buffer to make + * space for the data that will be prepended into the + * current buffer + */ + memmove(cur_dsk_start + len_to_move, + cur_dsk_start, bnd_cur->size - + WT_PAGE_HEADER_BYTE_SIZE(btree)); + /* + * copy any data more than the minimum, from the + * previous buffer to the start of the current. + */ + memcpy(cur_dsk_start, (uint8_t *)r->prev_img_ptr->mem + + bnd_prev->min_bnd_offset, len_to_move); + + /* Update boundary information */ + bnd_cur->size += len_to_move; + bnd_prev->size -= len_to_move; + bnd_cur->max_bnd_entries += bnd_prev->max_bnd_entries - + bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + WT_RET(__wt_buf_set(session, + &bnd_cur->max_bnd_key, bnd_prev->min_bnd_key.data, + bnd_prev->min_bnd_key.size)); + } + + /* Write out the previous image */ + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + } + + /* + * At this point, there is only one disk image in the memory, pointed to + * by the previous image pointer. Update the current image pointer to + * this image. + */ + r->cur_img_ptr = r->prev_img_ptr; + return (0); +} + +/* * __rec_split_finish_std -- * Finish processing a page, standard version. 
*/ static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BOUNDARY *bnd_cur; WT_PAGE_HEADER *dsk; - bool grow_bnd; + bool chunks_merged; /* * We may arrive here with no entries to write if the page was entirely @@ -3208,50 +3280,22 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - dsk = r->disk_image.mem; - - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ bnd_cur = &r->bnd[r->bnd_next]; bnd_cur->max_bnd_entries = r->entries; + bnd_cur->size = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); - grow_bnd = true; - /* - * We can reach here even with raw_compression when the last split chunk - * is too small to be sent for raw compression. - */ - if (!r->is_bulk_load && !r->raw_compression) { - if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && - r->bnd_next != 0) { - /* - * We hold two boundaries worth of data in the buffer, - * and this data doesn't fit in a single page. If the - * last chunk is too small, readjust the boundary to a - * pre-computed minimum. - * Write out the penultimate chunk to the disk as a page - */ - WT_RET(__rec_split_write_prev_and_shift_cur( - session, r, true)); - } else - if (r->bnd_next != 0) { - /* - * We have two boundaries, but the data in the - * buffer can fit a single page. Merge the - * boundaries to create a single chunk. - */ - bnd_prev = bnd_cur - 1; - bnd_prev->max_bnd_entries += - bnd_cur->max_bnd_entries; - r->bnd_next--; - grow_bnd = false; - } - } + chunks_merged = false; + if (r->prev_img_ptr != NULL) + WT_RET(__rec_split_finish_process_prev(session, + r, &chunks_merged)); /* * We already have space for an extra boundary if we merged two * boundaries above, in that case we do not need to grow the boundary * structure. 
*/ - if (grow_bnd) + if (!chunks_merged) WT_RET(__rec_split_bnd_grow(session, r)); bnd_cur = &r->bnd[r->bnd_next]; r->bnd_next++; @@ -3260,14 +3304,15 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Current boundary now has all the remaining data/last page now. * Let's write it to the disk */ + dsk = r->cur_img_ptr->mem; dsk->recno = bnd_cur->max_bnd_recno; dsk->u.entries = bnd_cur->max_bnd_entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); - r->disk_image.size = dsk->mem_size; + dsk->mem_size = bnd_cur->size; + r->cur_img_ptr->size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ return (__rec_is_checkpoint(session, r, bnd_cur) ? - 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); + 0 : __rec_split_write(session, r, bnd_cur, r->cur_img_ptr, true)); } /* @@ -3289,7 +3334,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (r->raw_compression && r->entries != 0) { while (r->entries != 0) { data_size = - WT_PTRDIFF(r->first_free, r->disk_image.mem); + WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); if (data_size <= btree->allocsize) break; WT_RET(__rec_split_raw_worker(session, r, 0, true)); @@ -5882,7 +5927,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * write the buffer so we know what to do here. */ if (bnd->addr.addr == NULL) - WT_RET(__wt_bt_write(session, &r->disk_image, + WT_RET(__wt_bt_write(session, r->cur_img_ptr, NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); else { @@ -6546,7 +6591,7 @@ __rec_dictionary_lookup( for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); dp != NULL && dp->hash == hash; dp = dp->next[0]) { WT_RET(__wt_cell_pack_data_match( - (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset), + (WT_CELL *)((uint8_t *)r->cur_img_ptr->mem + dp->offset), &val->cell, val->buf.data, &match)); if (match) { WT_STAT_DATA_INCR(session, rec_dictionary); |