Diffstat (limited to 'src/reconcile/rec_write.c')
-rw-r--r-- | src/reconcile/rec_write.c | 707
1 files changed, 444 insertions, 263 deletions
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 6f95b84d292..1c266496ec8 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -25,12 +25,25 @@ typedef struct { WT_PAGE *page; uint32_t flags; /* Caller's configuration */ - WT_ITEM disk_image; /* Temporary disk-image buffer */ /* - * Temporary buffer used to write out a disk image when managing two - * chunks worth of data in memory + * Reconciliation can end up requiring two temporary disk image buffers + * if a page split is involved. These two disk images are pointed to by + * current and the previous image pointers. During initialization the + * first image is allocated and pointed to by the current image pointer. + * If and when a split is involved the second image gets allocated and + * is pointed to by the current image pointer. The previous image + * pointer is made to refer the first image at this point. Two images + * are kept in memory to redistribute data among them in case the last + * split chunk ends up being smaller than the minimum required. As + * reconciliation generates more split chunks, the image referred to by + * the previous image pointer is written to the disk, the current and + * the previous image pointers are swapped, making space for another + * split chunk to be reconciled in the buffer that was just written out + * to the disk. */ - WT_ITEM *interim_buf; + WT_ITEM disk_image[2]; /* Temporary disk-image buffers */ + WT_ITEM *cur_img_ptr; + WT_ITEM *prev_img_ptr; /* * Track start/stop write generation to decide if all changes to the @@ -48,9 +61,9 @@ typedef struct { /* Track the page's maximum transaction ID. */ uint64_t max_txn; - /* Track if all updates were skipped. */ - uint64_t update_cnt; - uint64_t update_skip_cnt; + uint64_t update_mem_all; /* Total update memory size */ + uint64_t update_mem_saved; /* Saved update memory size */ + uint64_t update_mem_uncommitted;/* Uncommitted update memory size */ /* * When we can't mark the page clean (for example, checkpoint found some @@ -146,17 +159,6 @@ typedef struct { * that references all of our split pages. */ struct __rec_boundary { - /* - * Offset is the byte offset in the initial split buffer of the - * first byte of the split chunk, recorded before we decide to - * split the page; the difference between chunk[1]'s offset and - * chunk[0]'s offset is chunk[0]'s length. - * - * Once we split a page, we stop filling in offset values, we're - * writing the split chunks as we find them. 
- */ - size_t offset; /* Split's first byte */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -338,7 +340,8 @@ static int __rec_split_write(WT_SESSION_IMPL *, WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, bool); static int __rec_update_las( WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *); -static int __rec_write_check_complete(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_write_check_complete( + WT_SESSION_IMPL *, WT_RECONCILE *, bool *); static int __rec_write_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); @@ -351,6 +354,7 @@ static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); static int __rec_dictionary_lookup( WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **); static void __rec_dictionary_reset(WT_RECONCILE *); +static void __rec_verbose_lookaside_write(WT_SESSION_IMPL *); /* * __wt_reconcile -- @@ -386,7 +390,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * In-memory splits: reconciliation of an internal page cannot handle * a child page splitting during the reconciliation. */ - __wt_writelock(session, &page->page_lock); + WT_PAGE_LOCK(session, page); oldest_id = __wt_txn_oldest_id(session); if (LF_ISSET(WT_EVICTING)) @@ -405,7 +409,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Initialize the reconciliation structure for each new run. */ if ((ret = __rec_write_init( session, ref, flags, salvage, &session->reconcile)) != 0) { - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); return (ret); } r = session->reconcile; @@ -437,7 +441,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Checks for a successful reconciliation. */ if (ret == 0) - ret = __rec_write_check_complete(session, r); + ret = __rec_write_check_complete(session, r, lookaside_retryp); /* Wrap up the page reconciliation. */ if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0) @@ -446,15 +450,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(__rec_write_wrapup_err(session, r, page)); /* Release the reconciliation lock. */ - __wt_writeunlock(session, &page->page_lock); - - /* - * If our caller can configure lookaside table reconciliation, flag if - * that's worth trying. The lookaside table doesn't help if we skipped - * updates, it can only help with older readers preventing eviction. - */ - if (lookaside_retryp != NULL && r->update_cnt == r->update_skip_cnt) - *lookaside_retryp = true; + WT_PAGE_UNLOCK(session, page); /* Update statistics. 
*/ WT_STAT_CONN_INCR(session, rec_pages); @@ -526,10 +522,8 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, static inline bool __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_CONNECTION_IMPL *conn; WT_BTREE *btree; - conn = S2C(session); btree = S2BT(session); /* @@ -550,7 +544,8 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) return (false); if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen && - r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen && + r->orig_txn_checkpoint_gen == + __wt_gen(session, WT_GEN_CHECKPOINT) && r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen) return (false); return (true); @@ -558,13 +553,21 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * __rec_write_check_complete -- - * Check that reconciliation should complete + * Check that reconciliation should complete. */ static int -__rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_write_check_complete( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *lookaside_retryp) { - WT_BOUNDARY *bnd; - size_t i; + /* + * Tests in this function are lookaside tests and tests to decide if + * rewriting a page in memory is worth doing. In-memory configurations + * can't use a lookaside table, and we ignore page rewrite desirability + * checks for in-memory eviction because a small cache can force us to + * rewrite every possible page. + */ + if (F_ISSET(r, WT_EVICT_IN_MEMORY)) + return (0); /* * If we have used the lookaside table, check for a lookaside table and @@ -574,19 +577,62 @@ __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); /* - * If we are doing update/restore based eviction, confirm part of the - * page is being discarded, or at least 10% of the updates won't have - * to be re-instantiated. Otherwise, it isn't progress, don't bother. + * Eviction can configure lookaside table reconciliation, consider if + * it's worth giving up this reconciliation attempt and falling back to + * using the lookaside table. We continue with evict/restore if + * switching to the lookaside doesn't make sense for any reason: we + * won't retry an evict/restore reconciliation until/unless the + * transactional system moves forward, so at worst it's a single wasted + * effort. + * + * First, check if the lookaside table is a possible alternative. */ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { - for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) - if (bnd->supd == NULL) - break; - if (i == r->bnd_entries && - r->update_cnt / 10 >= r->update_skip_cnt) - return (EBUSY); - } - return (0); + if (lookaside_retryp == NULL) + return (0); + + /* + * We only suggest lookaside if currently in an evict/restore attempt + * and some updates were saved. Our caller sets the evict/restore flag + * based on various conditions (like if this is a leaf page), which is + * why we're testing that flag instead of a set of other conditions. + * If no updates were saved, eviction will succeed without needing to + * restore anything. + */ + if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE) || r->bnd->supd == NULL) + return (0); + + /* + * Check if this reconciliation attempt is making progress. If there's + * any sign of progress, don't fall back to the lookaside table. + * + * Check if the current reconciliation split, in which case we'll + * likely get to write at least one of the blocks. If that page is + * empty, that's also progress. 
+ */ + if (r->bnd_next != 1) + return (0); + + /* + * Check if the current reconciliation applied some updates, in which + * case evict/restore should gain us some space. + */ + if (r->update_mem_saved != r->update_mem_all) + return (0); + + /* + * Check if lookaside eviction is possible. If any of the updates we + * saw were uncommitted, the lookaside table cannot be used: it only + * helps with older readers preventing eviction. + */ + if (r->update_mem_uncommitted != 0) + return (0); + + /* + * The current evict/restore approach shows no signs of being useful, + * lookaside is possible, suggest the lookaside table. + */ + *lookaside_retryp = true; + return (EBUSY); } /* @@ -810,12 +856,10 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep) { WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_PAGE *page; WT_RECONCILE *r; btree = S2BT(session); - conn = S2C(session); page = ref->page; if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { @@ -829,7 +873,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->last = &r->_last; /* Disk buffers need to be aligned for writing. */ - F_SET(&r->disk_image, WT_ITEM_ALIGNED); + F_SET(&r->disk_image[0], WT_ITEM_ALIGNED); + F_SET(&r->disk_image[1], WT_ITEM_ALIGNED); } /* Reconciliation is not re-entrant, make sure that doesn't happen. */ @@ -845,7 +890,7 @@ __rec_write_init(WT_SESSION_IMPL *session, * These are all ordered reads, but we only need one. */ r->orig_btree_checkpoint_gen = btree->checkpoint_gen; - r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen; + r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); /* @@ -891,7 +936,7 @@ __rec_write_init(WT_SESSION_IMPL *session, r->max_txn = WT_TXN_NONE; /* Track if all updates were skipped. */ - r->update_cnt = r->update_skip_cnt = 0; + r->update_mem_all = r->update_mem_saved = r->update_mem_uncommitted = 0; /* Track if the page can be marked clean. */ r->leave_dirty = false; @@ -974,8 +1019,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) return; *(WT_RECONCILE **)reconcilep = NULL; - __wt_buf_free(session, &r->disk_image); - __wt_scr_free(session, &r->interim_buf); + __wt_buf_free(session, &r->disk_image[0]); + __wt_buf_free(session, &r->disk_image[1]); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1115,7 +1160,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_DECL_ITEM(tmp); WT_PAGE *page; WT_UPDATE *append, *upd, *upd_list; - size_t notused; + size_t notused, update_mem; uint64_t max_txn, min_txn, txnid; bool append_origv, skipped; @@ -1136,36 +1181,62 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } else upd_list = ins->upd; - ++r->update_cnt; - for (skipped = false, - max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, - upd = upd_list; upd != NULL; upd = upd->next) { - if ((txnid = upd->txnid) == WT_TXN_ABORTED) - continue; + skipped = false; + update_mem = 0; + max_txn = WT_TXN_NONE; + min_txn = UINT64_MAX; - /* Track the largest/smallest transaction IDs on the list. */ - if (WT_TXNID_LT(max_txn, txnid)) - max_txn = txnid; - if (WT_TXNID_LT(txnid, min_txn)) - min_txn = txnid; + if (F_ISSET(r, WT_EVICTING)) { + /* Discard obsolete updates. */ + if ((upd = __wt_update_obsolete_check( + session, page, upd_list->next)) != NULL) + __wt_update_obsolete_free(session, page, upd); + + for (upd = upd_list; upd != NULL; upd = upd->next) { + /* Track the total memory in the update chain. 
*/ + update_mem += WT_UPDATE_MEMSIZE(upd); + + if ((txnid = upd->txnid) == WT_TXN_ABORTED) + continue; - /* - * Find the first update we can use. - */ - if (F_ISSET(r, WT_EVICTING)) { /* + * Track the largest/smallest transaction IDs on the + * list. + */ + if (WT_TXNID_LT(max_txn, txnid)) + max_txn = txnid; + if (WT_TXNID_LT(txnid, min_txn)) + min_txn = txnid; + + /* + * Find the first update we can use. + * * Eviction can write any committed update. * * When reconciling for eviction, track whether any * uncommitted updates are found. + * + * When reconciling for eviction, track the memory held + * by the update chain. */ if (__wt_txn_committed(session, txnid)) { if (*updp == NULL) *updp = upd; } else skipped = true; - } else { + } + } else + for (upd = upd_list; upd != NULL; upd = upd->next) { + if ((txnid = upd->txnid) == WT_TXN_ABORTED) + continue; + + /* Track the largest transaction ID on the list. */ + if (WT_TXNID_LT(max_txn, txnid)) + max_txn = txnid; + /* + * Find the first update we can use. + * * Checkpoint can only write updates visible as of its * snapshot. * @@ -1180,7 +1251,12 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, skipped = true; } } - } + + /* Reconciliation should never see a reserved update. */ + WT_ASSERT(session, + *updp == NULL || (*updp)->type != WT_UPDATE_RESERVED); + + r->update_mem_all += update_mem; /* * If all of the updates were aborted, quit. This test is not strictly @@ -1227,12 +1303,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, txnid != S2C(session)->txn_global.checkpoint_txnid || WT_SESSION_IS_CHECKPOINT(session)); #endif - - /* - * Track how many update chains we saw vs. how many update - * chains had an entry we skipped. - */ - ++r->update_skip_cnt; return (0); } @@ -1276,6 +1346,23 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) return (EBUSY); + /* + * Track the memory required by the update chain. + * + * A page with no uncommitted (skipped) updates, that can't be evicted + * because some updates aren't yet globally visible, can be evicted by + * writing previous versions of the updates to the lookaside file. That + * test is just checking if the skipped updates memory is zero. + * + * If that's not possible (there are skipped updates), we can rewrite + * the pages in-memory, but we don't want to unless there's memory to + * recover. That test is comparing the memory we'd recover to the memory + * we'd have to re-instantiate as part of the rewrite. + */ + r->update_mem_saved += update_mem; + if (skipped) + r->update_mem_uncommitted += update_mem; + append_origv = false; if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { /* @@ -1353,14 +1440,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * place a deleted record at the end of the update list. 
*/ if (vpack == NULL || vpack->type == WT_CELL_DEL) - WT_RET(__wt_update_alloc( - session, NULL, &append, ¬used)); + WT_RET(__wt_update_alloc(session, + NULL, &append, ¬used, WT_UPDATE_DELETED)); else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); if ((ret = __wt_page_cell_data_ref( session, page, vpack, tmp)) == 0) - ret = __wt_update_alloc( - session, tmp, &append, ¬used); + ret = __wt_update_alloc(session, + tmp, &append, ¬used, WT_UPDATE_STANDARD); __wt_scr_free(session, &tmp); WT_RET(ret); } @@ -1721,7 +1808,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) */ WT_ASSERT(session, r->space_avail >= size); WT_ASSERT(session, WT_BLOCK_FITS( - r->first_free, size, r->disk_image.mem, r->disk_image.memsize)); + r->first_free, size, r->cur_img_ptr->mem, r->cur_img_ptr->memsize)); r->entries += v; r->space_avail -= size; @@ -1808,7 +1895,7 @@ __rec_dict_replace( * copy cell instead. */ if (dp->offset == 0) - dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem); + dp->offset = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); else { /* * The offset is the byte offset from this cell to the previous, @@ -1816,7 +1903,7 @@ __rec_dict_replace( * page. */ offset = (uint64_t)WT_PTRDIFF(r->first_free, - (uint8_t *)r->disk_image.mem + dp->offset); + (uint8_t *)r->cur_img_ptr->mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy(&val->cell, rle, offset); val->buf.data = NULL; @@ -1952,7 +2039,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { - bnd->offset = 0; bnd->max_bnd_recno = WT_RECNO_OOB; bnd->max_bnd_entries = 0; @@ -2105,8 +2191,8 @@ __rec_split_init(WT_SESSION_IMPL *session, r->page_size = r->page_size_orig = max; if (r->raw_compression) r->max_raw_page_size = r->page_size = - (uint32_t)WT_MIN(r->page_size * 10, - WT_MAX(r->page_size, btree->maxmempage / 2)); + (uint32_t)WT_MIN((uint64_t)r->page_size * 10, + WT_MAX((uint64_t)r->page_size, btree->maxmempage / 2)); /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2165,15 +2251,14 @@ __rec_split_init(WT_SESSION_IMPL *session, * Ensure the disk image buffer is large enough for the max object, as * corrected by the underlying block manager. * - * The buffer that we build disk image in, needs to hold two chunks - * worth of data. Since we want to support split_size more than the page - * size (to allow for adjustments based on the compression), this buffer - * should be greater of twice of split_size and page_size. + * Since we want to support split_size more than the page size (to allow + * for adjustments based on the compression), this buffer should be + * greater of split_size and page_size. */ corrected_page_size = r->page_size; - disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size); + WT_RET(__wt_buf_init(session, &r->disk_image[0], disk_img_buf_size)); /* * Clear the disk page header to ensure all of it is initialized, even @@ -2183,15 +2268,17 @@ __rec_split_init(WT_SESSION_IMPL *session, * fixed-length column-store sets bits in bytes, where the bytes are * assumed to initially be 0. */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? 
+ memset(r->disk_image[0].mem, 0, page->type == WT_PAGE_COL_FIX ? disk_img_buf_size : WT_PAGE_HEADER_SIZE); /* * Set the page type (the type doesn't change, and setting it later * would require additional code in a few different places). */ - dsk = r->disk_image.mem; + dsk = r->disk_image[0].mem; dsk->type = page->type; + r->cur_img_ptr = &r->disk_image[0]; + r->prev_img_ptr = NULL; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -2200,7 +2287,6 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); r->bnd[0].max_bnd_recno = recno; - r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); /* Initialize the entry counter. */ r->entries = 0; @@ -2406,21 +2492,18 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, inuse, len; + size_t corrected_page_size, inuse; btree = S2BT(session); bm = btree->bm; - len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - inuse = (len - r->bnd[r->bnd_next].offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); corrected_page_size = inuse + add_len; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - /* Need to account for buffer carrying two chunks worth of data */ - WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + WT_RET(__wt_buf_grow(session, r->cur_img_ptr, corrected_page_size)); - r->first_free = (uint8_t *)r->disk_image.mem + len; + r->first_free = (uint8_t *)r->cur_img_ptr->mem + inuse; WT_ASSERT(session, corrected_page_size >= inuse); r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); @@ -2429,89 +2512,55 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) } /* - * __rec_split_write_prev_and_shift_cur -- - * Write the previous split chunk to the disk as a page. Shift the contents - * of the current chunk to the start of the buffer, making space for a new - * chunk to be written. - * If the caller asks for a chunk resizing, the boundary between the two - * chunks is readjusted to the minimum split size boundary details stored - * in the previous chunk, letting the current chunk grow at the cost of the - * previous chunk. + * __rec_split_write_prev_and_swap_buf -- + * If there is a previous split chunk held in the memory, write it to the + * disk as a page. If there isn't one, this is the first time we are + * splitting and need to initialize a second buffer. Also, swap the + * previous and the current buffer pointers. */ static int -__rec_split_write_prev_and_shift_cur( - WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +__rec_split_write_prev_and_swap_buf(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BM *bm; - WT_BOUNDARY *bnd_cur, *bnd_prev; - WT_BTREE *btree; - WT_PAGE_HEADER *dsk, *dsk_tmp; - size_t cur_len, len; - uint8_t *dsk_start; - - WT_ASSERT(session, r->bnd_next != 0); - - btree = S2BT(session); - bm = btree->bm; - bnd_cur = &r->bnd[r->bnd_next]; - bnd_prev = bnd_cur - 1; - dsk = r->disk_image.mem; - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; - - /* - * Resize chunks if the current is smaller than the minimum, and there - * are details on the minimum split size boundary available in the - * previous boundary details. - * - * There is a possibility that we do not have a minimum boundary set, in - * such a case we skip chunk resizing. 
Such a condition is possible for - * instance when we are building the image in the buffer and the first - * K/V pair is large enough that it surpasses both the minimum split - * size and the split size the application has set. In such a case we - * split the chunk without saving any minimum boundary. - */ - if (resize_chunks && - cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { - bnd_cur->offset = bnd_prev->min_bnd_offset; - bnd_cur->max_bnd_entries += - bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; - bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; - bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; - - WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, - bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); - - /* Update current chunk's length */ - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + WT_BOUNDARY *bnd_prev; + WT_ITEM *tmp_img_ptr; + WT_PAGE_HEADER *dsk; + size_t disk_img_size; + + WT_ASSERT(session, (r->prev_img_ptr == NULL && r->bnd_next == 0) || + (r->prev_img_ptr != NULL && r->bnd_next != 0)); + + /* Write previous chunk, if there is one */ + if (r->prev_img_ptr != NULL) { + bnd_prev = &r->bnd[r->bnd_next - 1]; + dsk = r->prev_img_ptr->mem; + dsk->recno = bnd_prev->max_bnd_recno; + dsk->u.entries = bnd_prev->max_bnd_entries; + dsk->mem_size = (uint32_t)bnd_prev->size; + r->prev_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, bnd_prev, r->prev_img_ptr, false)); + } else { + /* + * If we do not have a previous buffer, we should initialize the + * second buffer before proceeding. We will create the second + * buffer of the same size as the current buffer. + */ + disk_img_size = r->cur_img_ptr->memsize; + WT_RET(__wt_buf_init(session, + &r->disk_image[1], disk_img_size)); + r->prev_img_ptr = &r->disk_image[1]; + dsk = r->prev_img_ptr->mem; + memset(dsk, 0, + r->page->type == WT_PAGE_COL_FIX ? + disk_img_size : WT_PAGE_HEADER_SIZE); + dsk->type = r->page->type; } - /* - * Create an interim buffer if not already done to prepare the previous - * chunk's disk image. 
- */ - len = bnd_cur->offset; - WT_RET(bm->write_size(bm, session, &len)); - if (r->interim_buf == NULL) - WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); - else - WT_RET(__wt_buf_init(session, r->interim_buf, len)); - - dsk_tmp = r->interim_buf->mem; - memcpy(dsk_tmp, dsk, bnd_cur->offset); - dsk_tmp->recno = bnd_prev->max_bnd_recno; - dsk_tmp->u.entries = bnd_prev->max_bnd_entries; - dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); - r->interim_buf->size = dsk_tmp->mem_size; - WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); - - /* Shift the current chunk to the start of the buffer */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); - - /* Fix boundary offset */ - bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* Fix where free points */ - r->first_free = dsk_start + cur_len; + /* swap previous and current buffers */ + tmp_img_ptr = r->prev_img_ptr; + r->prev_img_ptr = r->cur_img_ptr; + r->cur_img_ptr = tmp_img_ptr; + return (0); } @@ -2529,7 +2578,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) size_t inuse; btree = S2BT(session); - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; /* Fixed length col store can call with next_len 0 */ WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); @@ -2543,9 +2592,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - last = &r->bnd[r->bnd_next]; - inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, dsk); /* * We can get here if the first key/value pair won't fit. @@ -2558,8 +2605,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* All page boundaries reset the dictionary. */ __rec_dictionary_reset(r); - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ + last = &r->bnd[r->bnd_next]; last->max_bnd_entries = r->entries; + last->size = (uint32_t)inuse; /* * In case of bulk load, write out chunks as we get them. Otherwise we @@ -2571,19 +2620,22 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) dsk->recno = last->max_bnd_recno; dsk->u.entries = last->max_bnd_entries; dsk->mem_size = (uint32_t)inuse; - r->disk_image.size = dsk->mem_size; - WT_RET(__rec_split_write( - session, r, last, &r->disk_image, false)); - /* Fix where free points */ - r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - } else if (r->bnd_next != 0) - WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); + r->cur_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, last, r->cur_img_ptr, false)); + } else { + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + /* current image we are writing to has changed */ + dsk = r->cur_img_ptr->mem; + } + + /* Fix where free points */ + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Prepare the next boundary */ WT_RET(__rec_split_bnd_grow(session, r)); r->bnd_next++; next = &r->bnd[r->bnd_next]; - next->offset = WT_PTRDIFF(r->first_free, dsk); /* Set the key for the next chunk. 
*/ next->max_bnd_recno = r->recno; if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) @@ -2642,9 +2694,8 @@ __rec_split_crossing_bnd( !WT_CROSSING_SPLIT_BND(r, next_len)) { btree = S2BT(session); bnd = &r->bnd[r->bnd_next]; - dsk = r->disk_image.mem; - min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - - bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + dsk = r->cur_img_ptr->mem; + min_bnd_offset = WT_PTRDIFF(r->first_free, dsk); if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) /* * This is possible if the first record doesn't fit in @@ -2705,7 +2756,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, unpack = &_unpack; compressor = btree->compressor; dst = &r->raw_destination; - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next]; @@ -3021,7 +3072,7 @@ no_slots: r->first_free = dsk_start + len; r->space_avail += r->raw_offsets[result_slots]; WT_ASSERT(session, r->first_free + r->space_avail <= - (uint8_t *)r->disk_image.mem + r->disk_image.memsize); + (uint8_t *)r->cur_img_ptr->mem + r->cur_img_ptr->memsize); /* * Set the key for the next block (before writing the block, a @@ -3060,13 +3111,13 @@ no_slots: dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; - r->disk_image.size = dsk->mem_size; + r->cur_img_ptr->size = dsk->mem_size; r->entries = 0; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - write_ref = &r->disk_image; + write_ref = r->cur_img_ptr; last->already_compressed = false; } else { /* @@ -3094,7 +3145,7 @@ no_slots: last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( - session, &r->disk_image, dst->mem, dst->size)); + session, r->cur_img_ptr, dst->mem, dst->size)); } else WT_RET( __rec_split_write(session, r, last, write_ref, last_block)); @@ -3128,15 +3179,120 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) } /* + * __rec_split_finish_process_prev -- + * If the two split chunks together fit in a single page, merge them into + * one. If they do not fit in a single page but the last is smaller than + * the minimum desired, move some data from the penultimate chunk to the + * last chunk and write out the previous/penultimate. Finally, update the + * pointer to the current image buffer. After this function exits, we will + * have one (last) buffer in memory, pointed to by the current image + * pointer. + */ +static int +__rec_split_finish_process_prev( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *chunks_merged) +{ + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t len_to_move; + uint32_t combined_size; + uint8_t *cur_dsk_start; + + WT_ASSERT(session, r->prev_img_ptr != NULL); + + btree = S2BT(session); + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + *chunks_merged = false; + /* + * The sizes referred to in the boundary structure include the header, + * so when calculating the combined size, make sure not to include the + * header twice. + */ + combined_size = bnd_prev->size + + (bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + + if (combined_size <= r->page_size) { + /* + * We have two boundaries, but the data in the buffers can fit a + * single page. Merge the boundaries and create a single chunk. 
+ */ + dsk = r->cur_img_ptr->mem; + memcpy((uint8_t *)r->prev_img_ptr->mem + bnd_prev->size, + WT_PAGE_HEADER_BYTE(btree, dsk), + bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + bnd_prev->size = combined_size; + bnd_prev->max_bnd_entries += bnd_cur->max_bnd_entries; + r->bnd_next--; + *chunks_merged = true; + } else { + if (bnd_cur->size < r->min_split_size && + bnd_prev->min_bnd_offset != 0 ) { + /* + * The last chunk, pointed to by the current image + * pointer, has less than the minimum data. Let's move + * any data more than the minimum from the previous + * image into the current. + */ + len_to_move = bnd_prev->size - bnd_prev->min_bnd_offset; + /* Grow current buffer if it is not large enough */ + if (r->space_avail < len_to_move) + WT_RET(__rec_split_grow(session, + r, len_to_move)); + cur_dsk_start = WT_PAGE_HEADER_BYTE(btree, + r->cur_img_ptr->mem); + + /* + * Shift the contents of the current buffer to make + * space for the data that will be prepended into the + * current buffer + */ + memmove(cur_dsk_start + len_to_move, + cur_dsk_start, bnd_cur->size - + WT_PAGE_HEADER_BYTE_SIZE(btree)); + /* + * copy any data more than the minimum, from the + * previous buffer to the start of the current. + */ + memcpy(cur_dsk_start, (uint8_t *)r->prev_img_ptr->mem + + bnd_prev->min_bnd_offset, len_to_move); + + /* Update boundary information */ + bnd_cur->size += (uint32_t)len_to_move; + bnd_prev->size -= (uint32_t)len_to_move; + bnd_cur->max_bnd_entries += bnd_prev->max_bnd_entries - + bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + WT_RET(__wt_buf_set(session, + &bnd_cur->max_bnd_key, bnd_prev->min_bnd_key.data, + bnd_prev->min_bnd_key.size)); + } + + /* Write out the previous image */ + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + } + + /* + * At this point, there is only one disk image in the memory, pointed to + * by the previous image pointer. Update the current image pointer to + * this image. + */ + r->cur_img_ptr = r->prev_img_ptr; + return (0); +} + +/* * __rec_split_finish_std -- * Finish processing a page, standard version. */ static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BOUNDARY *bnd_cur; WT_PAGE_HEADER *dsk; - bool grow_bnd; + bool chunks_merged; /* * We may arrive here with no entries to write if the page was entirely @@ -3163,50 +3319,22 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - dsk = r->disk_image.mem; - - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ bnd_cur = &r->bnd[r->bnd_next]; bnd_cur->max_bnd_entries = r->entries; + bnd_cur->size = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); - grow_bnd = true; - /* - * We can reach here even with raw_compression when the last split chunk - * is too small to be sent for raw compression. - */ - if (!r->is_bulk_load && !r->raw_compression) { - if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && - r->bnd_next != 0) { - /* - * We hold two boundaries worth of data in the buffer, - * and this data doesn't fit in a single page. If the - * last chunk is too small, readjust the boundary to a - * pre-computed minimum. 
- * Write out the penultimate chunk to the disk as a page - */ - WT_RET(__rec_split_write_prev_and_shift_cur( - session, r, true)); - } else - if (r->bnd_next != 0) { - /* - * We have two boundaries, but the data in the - * buffer can fit a single page. Merge the - * boundaries to create a single chunk. - */ - bnd_prev = bnd_cur - 1; - bnd_prev->max_bnd_entries += - bnd_cur->max_bnd_entries; - r->bnd_next--; - grow_bnd = false; - } - } + chunks_merged = false; + if (r->prev_img_ptr != NULL) + WT_RET(__rec_split_finish_process_prev(session, + r, &chunks_merged)); /* * We already have space for an extra boundary if we merged two * boundaries above, in that case we do not need to grow the boundary * structure. */ - if (grow_bnd) + if (!chunks_merged) WT_RET(__rec_split_bnd_grow(session, r)); bnd_cur = &r->bnd[r->bnd_next]; r->bnd_next++; @@ -3215,14 +3343,15 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Current boundary now has all the remaining data/last page now. * Let's write it to the disk */ + dsk = r->cur_img_ptr->mem; dsk->recno = bnd_cur->max_bnd_recno; dsk->u.entries = bnd_cur->max_bnd_entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); - r->disk_image.size = dsk->mem_size; + dsk->mem_size = bnd_cur->size; + r->cur_img_ptr->size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ return (__rec_is_checkpoint(session, r, bnd_cur) ? - 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); + 0 : __rec_split_write(session, r, bnd_cur, r->cur_img_ptr, true)); } /* @@ -3244,7 +3373,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (r->raw_compression && r->entries != 0) { while (r->entries != 0) { data_size = - WT_PTRDIFF(r->first_free, r->disk_image.mem); + WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); if (data_size <= btree->allocsize) break; WT_RET(__rec_split_raw_worker(session, r, 0, true)); @@ -3523,8 +3652,7 @@ __rec_update_las(WT_SESSION_IMPL *session, WT_PAGE *page; WT_SAVE_UPD *list; WT_UPDATE *upd; - uint64_t las_counter; - int64_t insert_cnt; + uint64_t insert_cnt, las_counter; uint32_t i, session_flags, slot; uint8_t *p; @@ -3613,20 +3741,24 @@ __rec_update_las(WT_SESSION_IMPL *session, /* * Walk the list of updates, storing each key/value pair into - * the lookaside table. + * the lookaside table. Skipped reserved items, they're never + * restored, obviously. 
*/ do { + if (upd->type == WT_UPDATE_RESERVED) + continue; + cursor->set_key(cursor, btree_id, &las_addr, ++las_counter, list->onpage_txn, key); - if (WT_UPDATE_DELETED_ISSET(upd)) + if (upd->type == WT_UPDATE_DELETED) las_value.size = 0; else { las_value.data = WT_UPDATE_DATA(upd); las_value.size = upd->size; } cursor->set_value( - cursor, upd->txnid, upd->size, &las_value); + cursor, upd->txnid, upd->type, &las_value); WT_ERR(cursor->insert(cursor)); ++insert_cnt; @@ -3635,9 +3767,11 @@ __rec_update_las(WT_SESSION_IMPL *session, err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - if (insert_cnt > 0) - (void)__wt_atomic_addi64( + if (insert_cnt > 0) { + (void)__wt_atomic_add64( &S2C(session)->las_record_cnt, insert_cnt); + __rec_verbose_lookaside_write(session); + } __wt_scr_free(session, &key); return (ret); @@ -4389,8 +4523,7 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_RET(__rec_split_raw(session, r, val->len)); } else if (WT_CHECK_CROSSING_BND(r, val->len)) - WT_RET(__rec_split_crossing_bnd( - session, r, val->len)); + WT_RET(__rec_split_crossing_bnd(session, r, val->len)); /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -4553,7 +4686,7 @@ record_loop: /* update_no_copy = true; /* No data copy */ repeat_count = 1; /* Single record */ - deleted = WT_UPDATE_DELETED_ISSET(upd); + deleted = upd->type == WT_UPDATE_DELETED; if (!deleted) { data = WT_UPDATE_DATA(upd); size = upd->size; @@ -4788,7 +4921,7 @@ compare: /* } } else { deleted = upd == NULL || - WT_UPDATE_DELETED_ISSET(upd); + upd->type == WT_UPDATE_DELETED; if (!deleted) { data = WT_UPDATE_DATA(upd); size = upd->size; @@ -5333,7 +5466,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, __wt_ovfl_cache(session, page, rip, vpack)); /* If this key/value pair was deleted, we're done. */ - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { /* * Overflow keys referencing discarded values * are no longer useful, discard the backing @@ -5543,7 +5676,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { /* Look for an update. */ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd)) + if (upd == NULL || upd->type == WT_UPDATE_DELETED) continue; if (upd->size == 0) /* Build value cell. */ @@ -5833,7 +5966,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * write the buffer so we know what to do here. */ if (bnd->addr.addr == NULL) - WT_RET(__wt_bt_write(session, &r->disk_image, + WT_RET(__wt_bt_write(session, r->cur_img_ptr, NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); else { @@ -6497,7 +6630,7 @@ __rec_dictionary_lookup( for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); dp != NULL && dp->hash == hash; dp = dp->next[0]) { WT_RET(__wt_cell_pack_data_match( - (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset), + (WT_CELL *)((uint8_t *)r->cur_img_ptr->mem + dp->offset), &val->cell, val->buf.data, &match)); if (match) { WT_STAT_DATA_INCR(session, rec_dictionary); @@ -6530,3 +6663,51 @@ __rec_dictionary_lookup( *dpp = next; return (0); } + +/* + * __rec_verbose_lookaside_write -- + * Create a verbose message to display once per checkpoint with details + * about the cache state when performing a lookaside table write. 
+ */ +static void +__rec_verbose_lookaside_write(WT_SESSION_IMPL *session) +{ +#ifdef HAVE_VERBOSE + WT_CONNECTION_IMPL *conn; + uint64_t ckpt_gen_current, ckpt_gen_last; + uint32_t pct_dirty, pct_full; + + if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) return; + + conn = S2C(session); + ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); + ckpt_gen_last = conn->las_verb_gen_write; + + /* + * This message is throttled to one per checkpoint. To do this we + * track the generation of the last checkpoint for which the message + * was printed and check against the current checkpoint generation. + */ + if (ckpt_gen_current > ckpt_gen_last) { + /* + * Attempt to atomically replace the last checkpoint generation + * for which this message was printed. If the atomic swap fails + * we have raced and the winning thread will print the message. + */ + if (__wt_atomic_casv64(&conn->las_verb_gen_write, + ckpt_gen_last, ckpt_gen_current)) { + (void)__wt_eviction_clean_needed(session, &pct_full); + (void)__wt_eviction_dirty_needed(session, &pct_dirty); + + __wt_verbose(session, WT_VERB_LOOKASIDE, + "Page reconciliation triggered lookaside write. " + "Entries now in lookaside file: %" PRIu64 ", " + "cache dirty: %" PRIu32 "%% , " + "cache use: %" PRIu32 "%%", + conn->las_record_cnt, pct_dirty, pct_full); + } + } +#else + WT_UNUSED(session); +#endif +}
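The double-buffering described in the patch's new header comment is easiest to follow end to end. Below is a minimal standalone sketch of the same pattern, not WiredTiger code, using invented names (struct writer, flush_chunk(), chunk_boundary()): records are appended to the image the current pointer references; at each split boundary the previously held image is written out (or the second image is brought into service the first time) and the two pointers are swapped, so the just-finished chunk stays in memory until the final chunk can be merged with or rebalanced against it.

/*
 * Standalone sketch (not WiredTiger code) of the two-image scheme the patch
 * introduces: build chunks in "cur", hold the finished chunk in "prev", and
 * write a chunk only once the next one is complete.
 */
#include <stdio.h>
#include <string.h>

#define IMG_SIZE 16                     /* tiny "page" size for the demo */

struct writer {
        char img[2][IMG_SIZE];          /* two temporary disk images */
        char *cur;                      /* image currently being filled */
        char *prev;                     /* finished image held in memory */
        size_t used;                    /* bytes used in the current image */
};

/* Stand-in for __rec_split_write(): just print the finished chunk. */
static void
flush_chunk(const char *img, size_t len)
{
        printf("write chunk: %.*s\n", (int)len, img);
}

/* Mirrors the role of __rec_split_write_prev_and_swap_buf(), on plain buffers. */
static void
chunk_boundary(struct writer *w)
{
        char *tmp;

        if (w->prev != NULL)
                flush_chunk(w->prev, IMG_SIZE); /* write the held chunk */
        else
                w->prev = w->img[1];    /* first split: enlist image 2 */

        /*
         * Swap the pointers: the finished chunk stays in memory as
         * "previous", the buffer just written (or newly enlisted) is
         * refilled as "current".
         */
        tmp = w->prev;
        w->prev = w->cur;
        w->cur = tmp;
        w->used = 0;
}

static void
add_record(struct writer *w, char c)
{
        if (w->used == IMG_SIZE)
                chunk_boundary(w);
        w->cur[w->used++] = c;
}

int
main(void)
{
        struct writer w;
        int i;

        memset(&w, 0, sizeof(w));
        w.cur = w.img[0];

        for (i = 0; i < 40; ++i)
                add_record(&w, (char)('a' + i % 26));

        /* Finish: the real code may merge or rebalance these two instead. */
        if (w.prev != NULL)
                flush_chunk(w.prev, IMG_SIZE);
        flush_chunk(w.cur, w.used);
        return (0);
}

In the patch itself the same roles are played by r->cur_img_ptr, r->prev_img_ptr and __rec_split_write_prev_and_swap_buf().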
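__rec_split_finish_process_prev() is the consumer of that held chunk when reconciliation finishes. The sketch below shows its two outcomes on plain byte buffers, under assumed names (struct chunk, finish_chunks()) and ignoring the page header, buffer growth and the key/recno bookkeeping the real function also handles: merge the chunks when they fit a single page, otherwise shift everything past the previous chunk's minimum split boundary into the undersized last chunk.

/* Sketch only: plain byte buffers stand in for the two disk images. */
#include <stddef.h>
#include <string.h>

struct chunk {
        unsigned char data[64];
        size_t size;                    /* bytes used */
        size_t min_off;                 /* minimum split boundary, 0 if unset */
};

/* Returns 1 if cur was merged into prev, 0 if both chunks remain. */
static int
finish_chunks(struct chunk *prev, struct chunk *cur,
    size_t page_size, size_t min_split_size)
{
        size_t move;

        if (prev->size + cur->size <= page_size) {
                /* Everything fits in one page: merge into a single chunk. */
                memcpy(prev->data + prev->size, cur->data, cur->size);
                prev->size += cur->size;
                return (1);
        }
        if (cur->size < min_split_size && prev->min_off != 0) {
                /*
                 * The last chunk is too small: move whatever lies past the
                 * previous chunk's minimum boundary into it.  (The real
                 * function grows the destination buffer first if needed.)
                 */
                move = prev->size - prev->min_off;
                memmove(cur->data + move, cur->data, cur->size);
                memcpy(cur->data, prev->data + prev->min_off, move);
                cur->size += move;
                prev->size = prev->min_off;
        }
        return (0);
}

int
main(void)
{
        struct chunk prev = { .size = 40, .min_off = 30 };
        struct chunk cur = { .size = 5 };

        memset(prev.data, 'p', prev.size);
        memset(cur.data, 'c', cur.size);

        /* 40 + 5 exceeds a 32-byte page and 5 < 20, so rebalance. */
        return (finish_chunks(&prev, &cur, 32, 20) == 0 &&
            prev.size == 30 && cur.size == 15 ? 0 : 1);
}

In the diff, the same decision also carries the entry counts, the starting recno and the split key from the previous boundary to the last one.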
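The rewritten __rec_write_check_complete() replaces the old ratio test (at least 10% of updates skipped) with a sequence of progress checks before suggesting a lookaside retry. Here is a compilable sketch of just that decision, using an invented rec_summary struct in place of the corresponding WT_RECONCILE fields and omitting the in-memory-eviction and lookaside/checkpoint tests the real function performs first.

/*
 * Sketch: rec_summary is an invented stand-in for the WT_RECONCILE fields the
 * patch consults (bnd_next, update_mem_all/saved/uncommitted, the
 * evict/restore flag and the saved-update list).
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct rec_summary {
        bool evict_update_restore;      /* evict/restore attempt */
        bool saved_updates;             /* some updates were saved */
        uint32_t chunks;                /* split chunks generated */
        uint64_t mem_all;               /* update memory seen */
        uint64_t mem_saved;             /* update memory saved */
        uint64_t mem_uncommitted;       /* uncommitted update memory */
};

static int
check_complete(const struct rec_summary *r, bool *lookaside_retryp)
{
        /* The caller may not support a lookaside retry at all. */
        if (lookaside_retryp == NULL)
                return (0);
        /* Only consider it for evict/restore attempts that saved updates. */
        if (!r->evict_update_restore || !r->saved_updates)
                return (0);
        /* A split, or an empty page, is progress. */
        if (r->chunks != 1)
                return (0);
        /* Writing some of the update memory is progress. */
        if (r->mem_saved != r->mem_all)
                return (0);
        /* Uncommitted updates rule the lookaside table out. */
        if (r->mem_uncommitted != 0)
                return (0);
        /* No progress and lookaside is possible: suggest the retry. */
        *lookaside_retryp = true;
        return (EBUSY);
}

int
main(void)
{
        struct rec_summary r = { true, true, 1, 100, 100, 0 };
        bool retry = false;

        return (check_complete(&r, &retry) == EBUSY && retry ? 0 : 1);
}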
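Finally, __rec_verbose_lookaside_write() throttles its message to one per checkpoint by publishing the checkpoint generation with a compare-and-swap. The same idea in standalone form, with C11 atomics standing in for __wt_atomic_casv64() and an invented report_once_per_gen() in place of the verbose plumbing:

/* Sketch: C11 atomics stand in for __wt_atomic_casv64(). */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t last_reported_gen;      /* like las_verb_gen_write */

static void
report_once_per_gen(uint64_t current_gen)
{
        uint64_t last = atomic_load(&last_reported_gen);

        if (current_gen <= last)
                return;
        /* Only the thread that wins the swap prints for this generation. */
        if (atomic_compare_exchange_strong(
            &last_reported_gen, &last, current_gen))
                printf("lookaside write, checkpoint generation %" PRIu64 "\n",
                    current_gen);
}

int
main(void)
{
        report_once_per_gen(1);
        report_once_per_gen(1);         /* suppressed: gen 1 already seen */
        report_once_per_gen(2);
        return (0);
}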