diff options
Diffstat (limited to 'src/reconcile/rec_write.c')
-rw-r--r-- | src/reconcile/rec_write.c | 319 |
1 files changed, 178 insertions, 141 deletions
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index b49946bb10e..b96b34594b0 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -115,6 +115,7 @@ typedef struct { */ uint32_t page_size; /* Set page size */ uint32_t page_size_orig; /* Saved set page size */ + uint32_t max_raw_page_size; /* Max page size with raw compression */ /* * Second, the split size: if we're doing the page layout, split to a @@ -159,9 +160,16 @@ typedef struct { WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t cksum; /* Split's checksum */ + void *disk_image; /* Split's disk image */ /* + * Raw compression, the disk image being written is already + * compressed. + */ + bool already_compressed; + + /* * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and * WT_EVICT_LOOKASIDE configurations. */ @@ -175,13 +183,6 @@ typedef struct { * column-store key. */ WT_ITEM key; /* Promoted row-store key */ - - /* - * During wrapup, after reconciling the root page, we write a - * final block as part of a checkpoint. If raw compression - * was configured, that block may have already been compressed. - */ - bool already_compressed; } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_next_max; /* Maximum boundary slots used */ @@ -445,17 +446,32 @@ __wt_reconcile(WT_SESSION_IMPL *session, } /* - * Clean up reconciliation resources: some workloads have millions of - * boundary structures, and if associated with an application session - * pulled into doing forced eviction, they won't be discarded for the - * life of the session (or until session.reset is called). Discard all - * of the reconciliation resources if an application thread, not doing - * a checkpoint. - */ - __rec_bnd_cleanup(session, r, - F_ISSET(session, WT_SESSION_INTERNAL) || - WT_SESSION_IS_CHECKPOINT(session) ? false : true); + * When application threads perform eviction, don't cache block manager + * or reconciliation structures (even across calls), we can have a + * significant number of application threads doing eviction at the same + * time with large items. We ignore checkpoints, once the checkpoint + * completes, all unnecessary session resources will be discarded. + * + * Even in application threads doing checkpoints or in internal threads + * doing any reconciliation, clean up reconciliation resources. Some + * workloads have millions of boundary structures in a reconciliation + * and we don't want to tie that memory down, even across calls. + */ + if (WT_SESSION_IS_CHECKPOINT(session) || + F_ISSET(session, WT_SESSION_INTERNAL)) + __rec_bnd_cleanup(session, r, false); + else { + /* + * Clean up the underlying block manager memory too: it's not + * reconciliation, but threads discarding reconciliation + * structures want to clean up the block manager's structures + * as well, and there's no obvious place to do that. + */ + if (session->block_manager_cleanup != NULL) + WT_TRET(session->block_manager_cleanup(session)); + WT_TRET(__rec_destroy_session(session)); + } WT_RET(ret); /* @@ -652,7 +668,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) WT_ASSERT(session, mod->mod_multi[i].supd == NULL); WT_ERR(__wt_multi_to_ref(session, - next, &mod->mod_multi[i], &pindex->index[i], NULL)); + next, &mod->mod_multi[i], &pindex->index[i], NULL, false)); pindex->index[i]->home = next; } @@ -1135,8 +1151,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) || - __wt_txn_visible_all(session, max_txn))) + __wt_txn_visible_all(session, max_txn))) { +#ifdef HAVE_DIAGNOSTIC + /* + * The checkpoint transaction is special. Make sure we never + * write (metadata) updates from a checkpoint in a concurrent + * session. + */ + txnid = *updp == NULL ? WT_TXN_NONE : (*updp)->txnid; + WT_ASSERT(session, txnid == WT_TXN_NONE || + txnid != S2C(session)->txn_global.checkpoint_txnid || + WT_SESSION_IS_CHECKPOINT(session)); +#endif return (0); + } /* * In some cases, there had better not be skipped updates or updates not @@ -1845,18 +1873,19 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) WT_CLEAR(bnd->addr); bnd->size = 0; bnd->cksum = 0; + __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); bnd->supd_next = 0; bnd->supd_allocated = 0; + bnd->already_compressed = false; + /* * Don't touch the key, we re-use that memory in each new * reconciliation. */ - - bnd->already_compressed = false; } /* @@ -1950,10 +1979,19 @@ __rec_split_init(WT_SESSION_IMPL *session, * additional data because we don't know how well it will compress, and * we don't want to increment our way up to the amount of data needed by * the application to successfully compress to the target page size. + * Ideally accumulate data several times the page size without + * approaching the memory page maximum, but at least have data worth + * one page. + * + * There are cases when we grow the page size to accommodate large + * records, in those cases we split the pages once they have crossed + * the maximum size for a page with raw compression. */ r->page_size = r->page_size_orig = max; if (r->raw_compression) - r->page_size *= 10; + r->max_raw_page_size = r->page_size = + (uint32_t)WT_MIN(r->page_size * 10, + WT_MAX(r->page_size, btree->maxmempage / 2)); /* * Ensure the disk image buffer is large enough for the max object, as @@ -2295,7 +2333,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* Hitting a page boundary resets the dictionary, in all cases. */ __rec_dictionary_reset(r); - inuse = WT_PTRDIFF32(r->first_free, dsk); + inuse = WT_PTRDIFF(r->first_free, dsk); switch (r->bnd_state) { case SPLIT_BOUNDARY: /* @@ -2465,7 +2503,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_COMPRESSOR *compressor; WT_DECL_RET; WT_ITEM *dst, *write_ref; - WT_PAGE_HEADER *dsk, *dsk_dst; + WT_PAGE_HEADER *dsk, *dsk_dst, *disk_image; WT_SESSION *wt_session; size_t corrected_page_size, extra_skip, len, result_len; uint64_t recno; @@ -2582,11 +2620,9 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, /* * Don't create an image so large that any future update will - * cause a split in memory. Use half of the maximum size so - * we split very compressible pages that have reached the - * maximum size in memory into two equal blocks. + * cause a split in memory. */ - if (len > (size_t)btree->maxmempage / 2) + if (max_image_slot == 0 && len > (size_t)r->max_raw_page_size) max_image_slot = slots; } @@ -2648,7 +2684,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, r->page_size_orig, btree->split_pct, WT_BLOCK_COMPRESS_SKIP + extra_skip, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, r->raw_offsets, - no_more_rows || max_image_slot == 0 ? slots : max_image_slot, + max_image_slot == 0 ? slots : max_image_slot, (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP, result_len, no_more_rows || max_image_slot != 0, @@ -2751,7 +2787,8 @@ no_slots: if (result_slots != 0) { /* - * We have a block, finalize the header information. + * We have a block, finalize the compressed disk image's header + * information. */ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; dsk_dst = dst->mem; @@ -2761,6 +2798,26 @@ no_slots: dsk_dst->u.entries = r->raw_entries[result_slots - 1]; /* + * Optionally keep the disk image in cache. Update the initial + * page-header fields to reflect the actual data being written. + * + * If updates are saved and need to be restored, we have to keep + * a copy of the disk image. Unfortunately, we don't yet know if + * there are updates to restore for the key range covered by the + * disk image just created. If there are any saved updates, take + * a copy of the disk image, it's freed later if not needed. + */ + if (F_ISSET(r, WT_EVICT_SCRUB) || + (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && r->supd_next > 0)) { + WT_RET(__wt_strndup(session, dsk, + dsk_dst->mem_size, &last->disk_image)); + disk_image = last->disk_image; + disk_image->recno = last->recno; + disk_image->mem_size = dsk_dst->mem_size; + disk_image->u.entries = dsk_dst->u.entries; + } + + /* * There is likely a remnant in the working buffer that didn't * get compressed; copy it down to the start of the buffer and * update the starting record number, free space and so on. @@ -2874,48 +2931,6 @@ split_grow: /* } /* - * __rec_raw_decompress -- - * Decompress a raw-compressed image. - */ -static int -__rec_raw_decompress( - WT_SESSION_IMPL *session, const void *image, size_t size, void *retp) -{ - WT_BTREE *btree; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_PAGE_HEADER const *dsk; - size_t result_len; - - btree = S2BT(session); - dsk = image; - - /* - * We skipped an update and we can't write a block, but unfortunately, - * the block has already been compressed. Decompress the block so we - * can subsequently re-instantiate it in memory. - */ - WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp)); - memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP); - WT_ERR(btree->compressor->decompress(btree->compressor, - &session->iface, - (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP, - size - WT_BLOCK_COMPRESS_SKIP, - (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP, - dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, - &result_len)); - if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) - WT_ERR(__wt_illegal_value(session, btree->dhandle->name)); - - WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp)); - WT_ASSERT(session, __wt_verify_dsk_image(session, - "[raw evict split]", tmp->data, dsk->mem_size, false) == 0); - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* * __rec_split_raw -- * Raw compression split routine. */ @@ -3022,7 +3037,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (r->raw_compression && r->entries != 0) { while (r->entries != 0) { data_size = - WT_PTRDIFF32(r->first_free, r->disk_image.mem); + WT_PTRDIFF(r->first_free, r->disk_image.mem); if (data_size <= btree->allocsize) break; WT_RET(__rec_split_raw_worker(session, r, 0, true)); @@ -3145,14 +3160,13 @@ __rec_split_write(WT_SESSION_IMPL *session, uint32_t bnd_slot, i, j; int cmp; uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; + bool need_image; btree = S2BT(session); dsk = buf->mem; page = r->page; mod = page->modify; - WT_RET(__wt_scr_alloc(session, 0, &key)); - /* Set the zero-length value flag in the page header. */ if (dsk->type == WT_PAGE_ROW_LEAF) { F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE); @@ -3163,6 +3177,8 @@ __rec_split_write(WT_SESSION_IMPL *session, F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } + bnd->entries = r->entries; + /* Initialize the address (set the page type for the parent). */ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -3176,9 +3192,8 @@ __rec_split_write(WT_SESSION_IMPL *session, case WT_PAGE_ROW_INT: bnd->addr.type = WT_ADDR_INT; break; - WT_ILLEGAL_VALUE_ERR(session); + WT_ILLEGAL_VALUE(session); } - bnd->size = (uint32_t)buf->size; bnd->cksum = 0; @@ -3190,6 +3205,8 @@ __rec_split_write(WT_SESSION_IMPL *session, * This code requires a key be filled in for the next block (or the * last block flag be set, if there's no next block). */ + if (page->type == WT_PAGE_ROW_LEAF) + WT_RET(__wt_scr_alloc(session, 0, &key)); for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) { /* The last block gets all remaining saved updates. */ if (last_block) { @@ -3254,33 +3271,11 @@ supd_check_complete: * image, we can't actually write it. Instead, we will re-instantiate * the page using the disk image and any list of updates we skipped. */ - if (F_ISSET(r, WT_EVICT_IN_MEMORY) || - (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)) { - - /* Statistics tracking that we used update/restore. */ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) - r->cache_write_restore = true; - - /* - * If the buffer is compressed (raw compression was configured), - * we have to decompress it so we can instantiate it later. It's - * a slow and convoluted path, but it's also a rare one and it's - * not worth making it faster. Else, the disk image is ready, - * copy it into place for later. It's possible the disk image - * has no items; we have to flag that for verification, it's a - * special case since read/writing empty pages isn't generally - * allowed. - */ - if (bnd->already_compressed) - WT_ERR(__rec_raw_decompress( - session, buf->data, buf->size, &bnd->disk_image)); - else { - WT_ERR(__wt_strndup( - session, buf->data, buf->size, &bnd->disk_image)); - WT_ASSERT(session, __wt_verify_dsk_image(session, - "[evict split]", buf->data, buf->size, true) == 0); - } - goto done; + if (F_ISSET(r, WT_EVICT_IN_MEMORY)) + goto copy_image; + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + r->cache_write_restore = true; + goto copy_image; } /* @@ -3324,13 +3319,11 @@ supd_check_complete: bnd->addr = multi->addr; WT_STAT_FAST_DATA_INCR(session, rec_page_match); - goto done; + goto copy_image; } } } - bnd->entries = r->entries; - #ifdef HAVE_VERBOSE /* Output a verbose message if we create a page without many entries */ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6) @@ -3343,8 +3336,8 @@ supd_check_complete: r->bnd_state)); #endif - WT_ERR(__wt_bt_write(session, - buf, addr, &addr_size, false, bnd->already_compressed)); + WT_ERR(__wt_bt_write(session, buf, addr, &addr_size, + false, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); bnd->addr.size = (uint8_t)addr_size; @@ -3354,9 +3347,29 @@ supd_check_complete: * the database's lookaside store. */ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) - ret = __rec_update_las(session, r, btree->id, bnd); + WT_ERR(__rec_update_las(session, r, btree->id, bnd)); + +copy_image: + /* + * If re-instantiating this page in memory (either because eviction + * wants to, or because we skipped updates to build the disk image), + * save a copy of the disk image. + * + * Raw compression might have already saved a copy of the disk image + * before we could know if we skipped updates to create it, and now + * we know if we're going to need it. + * + * Copy the disk image if we need a copy and don't already have one, + * discard any already saved copy we don't need. + */ + need_image = F_ISSET(r, WT_EVICT_SCRUB) || + (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL); + if (need_image && bnd->disk_image == NULL) + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &bnd->disk_image)); + if (!need_image) + __wt_free(session, bnd->disk_image); -done: err: __wt_scr_free(session, &key); return (ret); } @@ -3556,8 +3569,9 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_PAGE *parent; WT_RECONCILE *r; - r = cbulk->reconcile; btree = S2BT(session); + if ((r = cbulk->reconcile) == NULL) + return (0); switch (btree->type) { case BTREE_COL_FIX: @@ -5601,9 +5615,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_btree_block_free(session, mod->mod_replace.addr, mod->mod_replace.size)); - /* Discard the replacement page's address. */ + /* Discard the replacement page's address and disk image. */ __wt_free(session, mod->mod_replace.addr); mod->mod_replace.size = 0; + __wt_free(session, mod->mod_disk_image); break; WT_ILLEGAL_VALUE(session); } @@ -5651,26 +5666,33 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = &r->bnd[0]; /* - * If saving/restoring changes for this page and there's only - * one block, there's nothing to write. This is an in-memory - * configuration or a special case of forced eviction: set up + * If in-memory, or saving/restoring changes for this page and + * there's only one block, there's nothing to write. Set up * a single block as if to split, then use that disk image to - * rewrite the page in memory. + * rewrite the page in memory. This is separate from simple + * replacements where eviction has decided to retain the page + * in memory because the latter can't handle update lists and + * splits can. */ - if (bnd->disk_image != NULL) + if (F_ISSET(r, WT_EVICT_IN_MEMORY) || + (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)) goto split; /* - * If this is a root page, then we don't have an address and we - * have to create a sync point. The address was cleared when - * we were about to write the buffer so we know what to do here. + * A root page, we don't have an address and we have to create + * a sync point. The address was cleared when we were about to + * write the buffer so we know what to do here. */ if (bnd->addr.addr == NULL) WT_RET(__wt_bt_write(session, &r->disk_image, - NULL, NULL, true, bnd->already_compressed)); + NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING), + bnd->already_compressed)); else { mod->mod_replace = bnd->addr; bnd->addr.addr = NULL; + + mod->mod_disk_image = bnd->disk_image; + bnd->disk_image = NULL; } mod->rec_result = WT_PM_REC_REPLACE; @@ -5805,19 +5827,26 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); - if (bnd->disk_image == NULL) { - multi->addr = bnd->addr; - multi->addr.reuse = 0; - multi->size = bnd->size; - multi->cksum = bnd->cksum; - bnd->addr.addr = NULL; - } else { + /* + * Copy any disk image. Don't take saved updates without a + * disk image (which happens if they have been saved to the + * lookaside table): they should be discarded along with the + * original page. + */ + multi->disk_image = bnd->disk_image; + bnd->disk_image = NULL; + if (multi->disk_image != NULL) { multi->supd = bnd->supd; multi->supd_entries = bnd->supd_next; bnd->supd = NULL; - multi->disk_image = bnd->disk_image; - bnd->disk_image = NULL; } + + /* Copy any address. */ + multi->addr = bnd->addr; + multi->addr.reuse = 0; + multi->size = bnd->size; + multi->cksum = bnd->cksum; + bnd->addr.addr = NULL; } mod->mod_multi_entries = r->bnd_next; @@ -5845,19 +5874,26 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { multi->key.recno = bnd->recno; - if (bnd->disk_image == NULL) { - multi->addr = bnd->addr; - multi->addr.reuse = 0; - multi->size = bnd->size; - multi->cksum = bnd->cksum; - bnd->addr.addr = NULL; - } else { + /* + * Copy any disk image. Don't take saved updates without a + * disk image (which happens if they have been saved to the + * lookaside table): they should be discarded along with the + * original page. + */ + multi->disk_image = bnd->disk_image; + bnd->disk_image = NULL; + if (multi->disk_image != NULL) { multi->supd = bnd->supd; multi->supd_entries = bnd->supd_next; bnd->supd = NULL; - multi->disk_image = bnd->disk_image; - bnd->disk_image = NULL; } + + /* Copy any address. */ + multi->addr = bnd->addr; + multi->addr.reuse = 0; + multi->size = bnd->size; + multi->cksum = bnd->cksum; + bnd->addr.addr = NULL; } mod->mod_multi_entries = r->bnd_next; @@ -6133,7 +6169,8 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session, /* Write the buffer. */ addr = buf; - WT_ERR(__wt_bt_write(session, tmp, addr, &size, false, false)); + WT_ERR(__wt_bt_write(session, tmp, + addr, &size, false, F_ISSET(r, WT_CHECKPOINTING), false)); /* * Track the overflow record (unless it's a bulk load, which |