Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile/rec_write.c')
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_write.c  370
1 file changed, 201 insertions, 169 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 6bcb5457385..a9912628942 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -115,6 +115,7 @@ typedef struct {
*/
uint32_t page_size; /* Set page size */
uint32_t page_size_orig; /* Saved set page size */
+ uint32_t max_raw_page_size; /* Max page size with raw compression */
/*
* Second, the split size: if we're doing the page layout, split to a
@@ -158,10 +159,17 @@ typedef struct {
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
- uint32_t cksum; /* Split's checksum */
+ uint32_t checksum; /* Split's checksum */
+
void *disk_image; /* Split's disk image */
/*
+ * Raw compression, the disk image being written is already
+ * compressed.
+ */
+ bool already_compressed;
+
+ /*
* Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
* WT_EVICT_LOOKASIDE configurations.
*/
@@ -175,13 +183,6 @@ typedef struct {
* column-store key.
*/
WT_ITEM key; /* Promoted row-store key */
-
- /*
- * During wrapup, after reconciling the root page, we write a
- * final block as part of a checkpoint. If raw compression
- * was configured, that block may have already been compressed.
- */
- bool already_compressed;
} *bnd; /* Saved boundaries */
uint32_t bnd_next; /* Next boundary slot */
uint32_t bnd_next_max; /* Maximum boundary slots used */
@@ -356,8 +357,8 @@ __wt_reconcile(WT_SESSION_IMPL *session,
page = ref->page;
mod = page->modify;
- WT_RET(__wt_verbose(session,
- WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
+ __wt_verbose(session,
+ WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type));
/* We shouldn't get called with a clean page, that's an error. */
WT_ASSERT(session, __wt_page_is_modified(page));
@@ -371,21 +372,26 @@ __wt_reconcile(WT_SESSION_IMPL *session,
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
- WT_RET(__wt_fair_lock(session, &page->page_lock));
+ __wt_writelock(session, &page->page_lock);
+
+ oldest_id = __wt_txn_oldest_id(session);
+ if (LF_ISSET(WT_EVICTING))
+ mod->last_eviction_id = oldest_id;
+#ifdef HAVE_DIAGNOSTIC
/*
* Check that transaction time always moves forward for a given page.
* If this check fails, reconciliation can free something that a future
* reconciliation will need.
*/
- oldest_id = __wt_txn_oldest_id(session);
WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
mod->last_oldest_id = oldest_id;
+#endif
/* Initialize the reconciliation structure for each new run. */
if ((ret = __rec_write_init(
session, ref, flags, salvage, &session->reconcile)) != 0) {
- WT_TRET(__wt_fair_unlock(session, &page->page_lock));
+ __wt_writeunlock(session, &page->page_lock);
return (ret);
}
r = session->reconcile;
@@ -426,7 +432,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the reconciliation lock. */
- WT_TRET(__wt_fair_unlock(session, &page->page_lock));
+ __wt_writeunlock(session, &page->page_lock);
/* Update statistics. */
WT_STAT_FAST_CONN_INCR(session, rec_pages);
@@ -445,17 +451,32 @@ __wt_reconcile(WT_SESSION_IMPL *session,
}
/*
- * Clean up reconciliation resources: some workloads have millions of
- * boundary structures, and if associated with an application session
- * pulled into doing forced eviction, they won't be discarded for the
- * life of the session (or until session.reset is called). Discard all
- * of the reconciliation resources if an application thread, not doing
- * a checkpoint.
- */
- __rec_bnd_cleanup(session, r,
- F_ISSET(session, WT_SESSION_INTERNAL) ||
- WT_SESSION_IS_CHECKPOINT(session) ? false : true);
+ * When application threads perform eviction, don't cache block manager
+ * or reconciliation structures (even across calls), we can have a
+ * significant number of application threads doing eviction at the same
+ * time with large items. We ignore checkpoints, once the checkpoint
+ * completes, all unnecessary session resources will be discarded.
+ *
+ * Even in application threads doing checkpoints or in internal threads
+ * doing any reconciliation, clean up reconciliation resources. Some
+ * workloads have millions of boundary structures in a reconciliation
+ * and we don't want to tie that memory down, even across calls.
+ */
+ if (WT_SESSION_IS_CHECKPOINT(session) ||
+ F_ISSET(session, WT_SESSION_INTERNAL))
+ __rec_bnd_cleanup(session, r, false);
+ else {
+ /*
+ * Clean up the underlying block manager memory too: it's not
+ * reconciliation, but threads discarding reconciliation
+ * structures want to clean up the block manager's structures
+ * as well, and there's no obvious place to do that.
+ */
+ if (session->block_manager_cleanup != NULL)
+ WT_TRET(session->block_manager_cleanup(session));
+ WT_TRET(__rec_destroy_session(session));
+ }
WT_RET(ret);
/*
@@ -624,8 +645,8 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
WT_ILLEGAL_VALUE(session);
}
- WT_RET(__wt_verbose(session, WT_VERB_SPLIT,
- "root page split -> %" PRIu32 " pages", mod->mod_multi_entries));
+ __wt_verbose(session, WT_VERB_SPLIT,
+ "root page split -> %" PRIu32 " pages", mod->mod_multi_entries);
/*
* Create a new root page, initialize the array of child references,
@@ -652,7 +673,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
WT_ASSERT(session, mod->mod_multi[i].supd == NULL);
WT_ERR(__wt_multi_to_ref(session,
- next, &mod->mod_multi[i], &pindex->index[i], NULL));
+ next, &mod->mod_multi[i], &pindex->index[i], NULL, false));
pindex->index[i]->home = next;
}
@@ -1136,15 +1157,17 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (!skipped &&
(F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
__wt_txn_visible_all(session, max_txn))) {
+#ifdef HAVE_DIAGNOSTIC
/*
* The checkpoint transaction is special. Make sure we never
* write (metadata) updates from a checkpoint in a concurrent
* session.
*/
- WT_ASSERT(session, *updp == NULL ||
- (txnid = (*updp)->txnid) == WT_TXN_NONE ||
+ txnid = *updp == NULL ? WT_TXN_NONE : (*updp)->txnid;
+ WT_ASSERT(session, txnid == WT_TXN_NONE ||
txnid != S2C(session)->txn_global.checkpoint_txnid ||
WT_SESSION_IS_CHECKPOINT(session));
+#endif
return (0);
}
@@ -1854,19 +1877,20 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
__wt_free(session, bnd->addr.addr);
WT_CLEAR(bnd->addr);
bnd->size = 0;
- bnd->cksum = 0;
+ bnd->checksum = 0;
+
__wt_free(session, bnd->disk_image);
__wt_free(session, bnd->supd);
bnd->supd_next = 0;
bnd->supd_allocated = 0;
+ bnd->already_compressed = false;
+
/*
* Don't touch the key, we re-use that memory in each new
* reconciliation.
*/
-
- bnd->already_compressed = false;
}
/*
@@ -1960,10 +1984,19 @@ __rec_split_init(WT_SESSION_IMPL *session,
* additional data because we don't know how well it will compress, and
* we don't want to increment our way up to the amount of data needed by
* the application to successfully compress to the target page size.
+ * Ideally accumulate data several times the page size without
+ * approaching the memory page maximum, but at least have data worth
+ * one page.
+ *
+ * There are cases when we grow the page size to accommodate large
+ * records, in those cases we split the pages once they have crossed
+ * the maximum size for a page with raw compression.
*/
r->page_size = r->page_size_orig = max;
if (r->raw_compression)
- r->page_size *= 10;
+ r->max_raw_page_size = r->page_size =
+ (uint32_t)WT_MIN(r->page_size * 10,
+ WT_MAX(r->page_size, btree->maxmempage / 2));
/*
* Ensure the disk image buffer is large enough for the max object, as
@@ -2305,7 +2338,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
/* Hitting a page boundary resets the dictionary, in all cases. */
__rec_dictionary_reset(r);
- inuse = WT_PTRDIFF32(r->first_free, dsk);
+ inuse = WT_PTRDIFF(r->first_free, dsk);
switch (r->bnd_state) {
case SPLIT_BOUNDARY:
/*
@@ -2475,7 +2508,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
WT_COMPRESSOR *compressor;
WT_DECL_RET;
WT_ITEM *dst, *write_ref;
- WT_PAGE_HEADER *dsk, *dsk_dst;
+ WT_PAGE_HEADER *dsk, *dsk_dst, *disk_image;
WT_SESSION *wt_session;
size_t corrected_page_size, extra_skip, len, result_len;
uint64_t recno;
@@ -2592,11 +2625,9 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
/*
* Don't create an image so large that any future update will
- * cause a split in memory. Use half of the maximum size so
- * we split very compressible pages that have reached the
- * maximum size in memory into two equal blocks.
+ * cause a split in memory.
*/
- if (len > (size_t)btree->maxmempage / 2)
+ if (max_image_slot == 0 && len > (size_t)r->max_raw_page_size)
max_image_slot = slots;
}
@@ -2658,7 +2689,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
r->page_size_orig, btree->split_pct,
WT_BLOCK_COMPRESS_SKIP + extra_skip,
(uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, r->raw_offsets,
- no_more_rows || max_image_slot == 0 ? slots : max_image_slot,
+ max_image_slot == 0 ? slots : max_image_slot,
(uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
result_len,
no_more_rows || max_image_slot != 0,
@@ -2761,7 +2792,8 @@ no_slots:
if (result_slots != 0) {
/*
- * We have a block, finalize the header information.
+ * We have a block, finalize the compressed disk image's header
+ * information.
*/
dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
dsk_dst = dst->mem;
@@ -2771,6 +2803,26 @@ no_slots:
dsk_dst->u.entries = r->raw_entries[result_slots - 1];
/*
+ * Optionally keep the disk image in cache. Update the initial
+ * page-header fields to reflect the actual data being written.
+ *
+ * If updates are saved and need to be restored, we have to keep
+ * a copy of the disk image. Unfortunately, we don't yet know if
+ * there are updates to restore for the key range covered by the
+ * disk image just created. If there are any saved updates, take
+ * a copy of the disk image, it's freed later if not needed.
+ */
+ if (F_ISSET(r, WT_EVICT_SCRUB) ||
+ (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && r->supd_next > 0)) {
+ WT_RET(__wt_strndup(session, dsk,
+ dsk_dst->mem_size, &last->disk_image));
+ disk_image = last->disk_image;
+ disk_image->recno = last->recno;
+ disk_image->mem_size = dsk_dst->mem_size;
+ disk_image->u.entries = dsk_dst->u.entries;
+ }
+
+ /*
* There is likely a remnant in the working buffer that didn't
* get compressed; copy it down to the start of the buffer and
* update the starting record number, free space and so on.
@@ -2884,48 +2936,6 @@ split_grow: /*
}
/*
- * __rec_raw_decompress --
- * Decompress a raw-compressed image.
- */
-static int
-__rec_raw_decompress(
- WT_SESSION_IMPL *session, const void *image, size_t size, void *retp)
-{
- WT_BTREE *btree;
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_PAGE_HEADER const *dsk;
- size_t result_len;
-
- btree = S2BT(session);
- dsk = image;
-
- /*
- * We skipped an update and we can't write a block, but unfortunately,
- * the block has already been compressed. Decompress the block so we
- * can subsequently re-instantiate it in memory.
- */
- WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp));
- memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP);
- WT_ERR(btree->compressor->decompress(btree->compressor,
- &session->iface,
- (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP,
- size - WT_BLOCK_COMPRESS_SKIP,
- (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
- dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
- &result_len));
- if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
- WT_ERR(__wt_illegal_value(session, btree->dhandle->name));
-
- WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp));
- WT_ASSERT(session, __wt_verify_dsk_image(session,
- "[raw evict split]", tmp->data, dsk->mem_size, false) == 0);
-
-err: __wt_scr_free(session, &tmp);
- return (ret);
-}
-
-/*
* __rec_split_raw --
* Raw compression split routine.
*/
@@ -3032,7 +3042,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (r->raw_compression && r->entries != 0) {
while (r->entries != 0) {
data_size =
- WT_PTRDIFF32(r->first_free, r->disk_image.mem);
+ WT_PTRDIFF(r->first_free, r->disk_image.mem);
if (data_size <= btree->allocsize)
break;
WT_RET(__rec_split_raw_worker(session, r, 0, true));
@@ -3155,14 +3165,13 @@ __rec_split_write(WT_SESSION_IMPL *session,
uint32_t bnd_slot, i, j;
int cmp;
uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+ bool need_image;
btree = S2BT(session);
dsk = buf->mem;
page = r->page;
mod = page->modify;
- WT_RET(__wt_scr_alloc(session, 0, &key));
-
/* Set the zero-length value flag in the page header. */
if (dsk->type == WT_PAGE_ROW_LEAF) {
F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
@@ -3173,6 +3182,8 @@ __rec_split_write(WT_SESSION_IMPL *session,
F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
}
+ bnd->entries = r->entries;
+
/* Initialize the address (set the page type for the parent). */
switch (dsk->type) {
case WT_PAGE_COL_FIX:
@@ -3186,11 +3197,10 @@ __rec_split_write(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_INT:
bnd->addr.type = WT_ADDR_INT;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE(session);
}
-
bnd->size = (uint32_t)buf->size;
- bnd->cksum = 0;
+ bnd->checksum = 0;
/*
* Check if we've saved updates that belong to this block, and move
@@ -3200,6 +3210,8 @@ __rec_split_write(WT_SESSION_IMPL *session,
* This code requires a key be filled in for the next block (or the
* last block flag be set, if there's no next block).
*/
+ if (page->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &key));
for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) {
/* The last block gets all remaining saved updates. */
if (last_block) {
@@ -3264,33 +3276,11 @@ supd_check_complete:
* image, we can't actually write it. Instead, we will re-instantiate
* the page using the disk image and any list of updates we skipped.
*/
- if (F_ISSET(r, WT_EVICT_IN_MEMORY) ||
- (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)) {
-
- /* Statistics tracking that we used update/restore. */
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)
- r->cache_write_restore = true;
-
- /*
- * If the buffer is compressed (raw compression was configured),
- * we have to decompress it so we can instantiate it later. It's
- * a slow and convoluted path, but it's also a rare one and it's
- * not worth making it faster. Else, the disk image is ready,
- * copy it into place for later. It's possible the disk image
- * has no items; we have to flag that for verification, it's a
- * special case since read/writing empty pages isn't generally
- * allowed.
- */
- if (bnd->already_compressed)
- WT_ERR(__rec_raw_decompress(
- session, buf->data, buf->size, &bnd->disk_image));
- else {
- WT_ERR(__wt_strndup(
- session, buf->data, buf->size, &bnd->disk_image));
- WT_ASSERT(session, __wt_verify_dsk_image(session,
- "[evict split]", buf->data, buf->size, true) == 0);
- }
- goto done;
+ if (F_ISSET(r, WT_EVICT_IN_MEMORY))
+ goto copy_image;
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ r->cache_write_restore = true;
+ goto copy_image;
}
/*
@@ -3316,7 +3306,7 @@ supd_check_complete:
*/
dsk->write_gen = 0;
memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
- bnd->cksum = __wt_cksum(buf->data, buf->size);
+ bnd->checksum = __wt_checksum(buf->data, buf->size);
/*
* One last check: don't reuse blocks if compacting, the reason
@@ -3329,32 +3319,30 @@ supd_check_complete:
mod->mod_multi_entries > bnd_slot) {
multi = &mod->mod_multi[bnd_slot];
if (multi->size == bnd->size &&
- multi->cksum == bnd->cksum) {
+ multi->checksum == bnd->checksum) {
multi->addr.reuse = 1;
bnd->addr = multi->addr;
WT_STAT_FAST_DATA_INCR(session, rec_page_match);
- goto done;
+ goto copy_image;
}
}
}
- bnd->entries = r->entries;
-
#ifdef HAVE_VERBOSE
/* Output a verbose message if we create a page without many entries */
if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6)
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ __wt_verbose(session, WT_VERB_SPLIT,
"Reconciliation creating a page with %" PRIu32
- " entries, memory footprint %" PRIu64
+ " entries, memory footprint %" WT_SIZET_FMT
", page count %" PRIu32 ", %s, split state: %d\n",
r->entries, r->page->memory_footprint, r->bnd_next,
F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint",
- r->bnd_state));
+ r->bnd_state);
#endif
- WT_ERR(__wt_bt_write(session,
- buf, addr, &addr_size, false, bnd->already_compressed));
+ WT_ERR(__wt_bt_write(session, buf, addr, &addr_size,
+ false, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed));
WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
bnd->addr.size = (uint8_t)addr_size;
@@ -3364,9 +3352,29 @@ supd_check_complete:
* the database's lookaside store.
*/
if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL)
- ret = __rec_update_las(session, r, btree->id, bnd);
+ WT_ERR(__rec_update_las(session, r, btree->id, bnd));
+
+copy_image:
+ /*
+ * If re-instantiating this page in memory (either because eviction
+ * wants to, or because we skipped updates to build the disk image),
+ * save a copy of the disk image.
+ *
+ * Raw compression might have already saved a copy of the disk image
+ * before we could know if we skipped updates to create it, and now
+ * we know if we're going to need it.
+ *
+ * Copy the disk image if we need a copy and don't already have one,
+ * discard any already saved copy we don't need.
+ */
+ need_image = F_ISSET(r, WT_EVICT_SCRUB) ||
+ (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL);
+ if (need_image && bnd->disk_image == NULL)
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &bnd->disk_image));
+ if (!need_image)
+ __wt_free(session, bnd->disk_image);
-done:
err: __wt_scr_free(session, &key);
return (ret);
}
@@ -3403,7 +3411,7 @@ __rec_update_las(WT_SESSION_IMPL *session,
*/
__wt_las_set_written(session);
- WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+ __wt_las_cursor(session, &cursor, &session_flags);
/* Ensure enough room for a column-store key without checking. */
WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
@@ -3566,8 +3574,9 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_PAGE *parent;
WT_RECONCILE *r;
- r = cbulk->reconcile;
btree = S2BT(session);
+ if ((r = cbulk->reconcile) == NULL)
+ return (0);
switch (btree->type) {
case BTREE_COL_FIX:
@@ -5531,22 +5540,22 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r)
if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &tkey));
- WT_ERR(__wt_verbose(
- session, WT_VERB_SPLIT, "split: %" PRIu32 " pages", r->bnd_next));
+ __wt_verbose(
+ session, WT_VERB_SPLIT, "split: %" PRIu32 " pages", r->bnd_next);
for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
switch (page->type) {
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ __wt_verbose(session, WT_VERB_SPLIT,
"starting key %s",
__wt_buf_set_printable(
- session, bnd->key.data, bnd->key.size, tkey)));
+ session, bnd->key.data, bnd->key.size, tkey));
break;
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "starting recno %" PRIu64, bnd->recno));
+ __wt_verbose(session, WT_VERB_SPLIT,
+ "starting recno %" PRIu64, bnd->recno);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -5611,9 +5620,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_btree_block_free(session,
mod->mod_replace.addr, mod->mod_replace.size));
- /* Discard the replacement page's address. */
+ /* Discard the replacement page's address and disk image. */
__wt_free(session, mod->mod_replace.addr);
mod->mod_replace.size = 0;
+ __wt_free(session, mod->mod_disk_image);
break;
WT_ILLEGAL_VALUE(session);
}
@@ -5632,8 +5642,8 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
switch (r->bnd_next) {
case 0: /* Page delete */
- WT_RET(__wt_verbose(
- session, WT_VERB_RECONCILE, "page %p empty", page));
+ __wt_verbose(
+ session, WT_VERB_RECONCILE, "page %p empty", (void *)page);
WT_STAT_FAST_CONN_INCR(session, rec_page_delete);
WT_STAT_FAST_DATA_INCR(session, rec_page_delete);
@@ -5661,34 +5671,41 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = &r->bnd[0];
/*
- * If saving/restoring changes for this page and there's only
- * one block, there's nothing to write. This is an in-memory
- * configuration or a special case of forced eviction: set up
+ * If in-memory, or saving/restoring changes for this page and
+ * there's only one block, there's nothing to write. Set up
* a single block as if to split, then use that disk image to
- * rewrite the page in memory.
+ * rewrite the page in memory. This is separate from simple
+ * replacements where eviction has decided to retain the page
+ * in memory because the latter can't handle update lists and
+ * splits can.
*/
- if (bnd->disk_image != NULL)
+ if (F_ISSET(r, WT_EVICT_IN_MEMORY) ||
+ (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL))
goto split;
/*
- * If this is a root page, then we don't have an address and we
- * have to create a sync point. The address was cleared when
- * we were about to write the buffer so we know what to do here.
+ * A root page, we don't have an address and we have to create
+ * a sync point. The address was cleared when we were about to
+ * write the buffer so we know what to do here.
*/
if (bnd->addr.addr == NULL)
WT_RET(__wt_bt_write(session, &r->disk_image,
- NULL, NULL, true, bnd->already_compressed));
+ NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING),
+ bnd->already_compressed));
else {
mod->mod_replace = bnd->addr;
bnd->addr.addr = NULL;
+
+ mod->mod_disk_image = bnd->disk_image;
+ bnd->disk_image = NULL;
}
mod->rec_result = WT_PM_REC_REPLACE;
break;
default: /* Page split */
- WT_RET(__wt_verbose(session, WT_VERB_RECONCILE,
+ __wt_verbose(session, WT_VERB_RECONCILE,
"page %p reconciled into %" PRIu32 " pages",
- page, r->bnd_next));
+ (void *)page, r->bnd_next);
switch (page->type) {
case WT_PAGE_COL_INT:
@@ -5815,19 +5832,26 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_row_ikey_alloc(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
- if (bnd->disk_image == NULL) {
- multi->addr = bnd->addr;
- multi->addr.reuse = 0;
- multi->size = bnd->size;
- multi->cksum = bnd->cksum;
- bnd->addr.addr = NULL;
- } else {
+ /*
+ * Copy any disk image. Don't take saved updates without a
+ * disk image (which happens if they have been saved to the
+ * lookaside table): they should be discarded along with the
+ * original page.
+ */
+ multi->disk_image = bnd->disk_image;
+ bnd->disk_image = NULL;
+ if (multi->disk_image != NULL) {
multi->supd = bnd->supd;
multi->supd_entries = bnd->supd_next;
bnd->supd = NULL;
- multi->disk_image = bnd->disk_image;
- bnd->disk_image = NULL;
}
+
+ /* Copy any address. */
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->checksum = bnd->checksum;
+ bnd->addr.addr = NULL;
}
mod->mod_multi_entries = r->bnd_next;
@@ -5855,19 +5879,26 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
multi->key.recno = bnd->recno;
- if (bnd->disk_image == NULL) {
- multi->addr = bnd->addr;
- multi->addr.reuse = 0;
- multi->size = bnd->size;
- multi->cksum = bnd->cksum;
- bnd->addr.addr = NULL;
- } else {
+ /*
+ * Copy any disk image. Don't take saved updates without a
+ * disk image (which happens if they have been saved to the
+ * lookaside table): they should be discarded along with the
+ * original page.
+ */
+ multi->disk_image = bnd->disk_image;
+ bnd->disk_image = NULL;
+ if (multi->disk_image != NULL) {
multi->supd = bnd->supd;
multi->supd_entries = bnd->supd_next;
bnd->supd = NULL;
- multi->disk_image = bnd->disk_image;
- bnd->disk_image = NULL;
}
+
+ /* Copy any address. */
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->checksum = bnd->checksum;
+ bnd->addr.addr = NULL;
}
mod->mod_multi_entries = r->bnd_next;
@@ -6143,7 +6174,8 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session,
/* Write the buffer. */
addr = buf;
- WT_ERR(__wt_bt_write(session, tmp, addr, &size, false, false));
+ WT_ERR(__wt_bt_write(session, tmp,
+ addr, &size, false, F_ISSET(r, WT_CHECKPOINTING), false));
/*
* Track the overflow record (unless it's a bulk load, which