summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2021-10-25 14:11:04 +1100
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-25 03:40:20 +0000
commit 229b9c477c0569ab8f7b49f81dc06086c82aa1b4 (patch)
tree 7c3790c239455e08c975a7fffb1f143a6ee41a74
parent 4a4da3e8c2ecb4a1a51eff760714b1b74ce67bbf (diff)
download mongo-229b9c477c0569ab8f7b49f81dc06086c82aa1b4.tar.gz
Import wiredtiger: 70ab26de2ab263fabab39114aee583f632f4e088 from branch mongodb-5.1
ref: bfcac76ea0..70ab26de2a for: 5.1.0-rc2 WT-6001 Avoid reading the page into cache if it needs to be rewritten
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok 1
-rwxr-xr-x src/third_party/wiredtiger/dist/s_void 1
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/block/block_compact.c 154
-rw-r--r-- src/third_party/wiredtiger/src/block/block_mgr.c 39
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_compact.c 355
-rw-r--r-- src/third_party/wiredtiger/src/include/block.h 11
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 21
-rw-r--r-- src/third_party/wiredtiger/src/include/compact.h 7
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 6
-rw-r--r-- src/third_party/wiredtiger/src/include/hardware.h 4
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 28
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_compact02.py 7
13 files changed, 458 insertions, 178 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index b355d29c901..83a238b625c 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -502,6 +502,7 @@ abcdef
abcdefghijklmnopqrstuvwxyz
addl
addr
+addrs
agc
alfred
alloc
diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void
index 249c89fbbab..7ea8ce985dd 100755
--- a/src/third_party/wiredtiger/dist/s_void
+++ b/src/third_party/wiredtiger/dist/s_void
@@ -34,6 +34,7 @@ func_ok()
-e '/int __bm_stat$/d' \
-e '/int __checkpoint_presync$/d' \
-e '/int __compact_uri_analyze$/d' \
+ -e '/int __compact_walk_page_skip$/d' \
-e '/int __config_parser_close$/d' \
-e '/int __curlog_reset$/d' \
-e '/int __cursor_fix_implicit$/d' \
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 2b73f34dcde..8eb40686dfd 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.1",
- "commit": "bfcac76ea0dae325f0134818fb32bbe134eec71d"
+ "commit": "70ab26de2ab263fabab39114aee583f632f4e088"
}
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
index f6be40cfa78..28076e856b4 100644
--- a/src/third_party/wiredtiger/src/block/block_compact.c
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -24,9 +24,10 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
/* Reset the compaction state information. */
block->compact_pct_tenths = 0;
+ block->compact_blocks_moved = 0;
+ block->compact_cache_pages_dealt = 0;
block->compact_pages_reviewed = 0;
block->compact_pages_skipped = 0;
- block->compact_pages_written = 0;
return (0);
}
@@ -51,6 +52,32 @@ __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
/*
+ * __wt_block_compact_progress --
+ * Output a compact progress message (a no-op unless WT_VERB_COMPACT_PROGRESS is set); *msg_countp is the caller's count of messages logged so far and is incremented each time one is emitted.
+ */
+void
+__wt_block_compact_progress(WT_SESSION_IMPL *session, WT_BLOCK *block, u_int *msg_countp)
+{
+ struct timespec cur_time;
+ uint64_t time_diff;
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS))
+ return;
+
+ __wt_epoch(session, &cur_time);
+
+ /* Log one message per WT_PROGRESS_MSG_PERIOD seconds elapsed since session->compact->begin (twenty seconds per the original note; the constant is defined outside this diff). */
+ time_diff = WT_TIMEDIFF_SEC(cur_time, session->compact->begin);
+ if (time_diff / WT_PROGRESS_MSG_PERIOD > *msg_countp) {
+ ++*msg_countp;
+ __wt_verbose(session, WT_VERB_COMPACT_PROGRESS,
+ " compacting %s for %" PRIu64 " seconds; reviewed %" PRIu64 " pages, skipped %" PRIu64
+ " pages, cache pages evicted %" PRIu64 ", on-disk pages moved %" PRIu64,
+ block->name, time_diff, block->compact_pages_reviewed, block->compact_pages_skipped,
+ block->compact_cache_pages_dealt, block->compact_blocks_moved);
+ }
+}
+/*
* __wt_block_compact_skip --
* Return if compaction will shrink the file.
*/
@@ -117,7 +144,7 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
"%s: total reviewed %" PRIu64 " pages, total skipped %" PRIu64 " pages, total wrote %" PRIu64
" pages",
block->name, block->compact_pages_reviewed, block->compact_pages_skipped,
- block->compact_pages_written);
+ block->compact_cache_pages_dealt);
__wt_verbose(session, WT_VERB_COMPACT,
"%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first 80%% of the file",
block->name, (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty);
@@ -136,27 +163,22 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
}
/*
- * __wt_block_compact_page_skip --
+ * __compact_page_skip --
* Return if writing a particular page will shrink the file.
*/
-int
-__wt_block_compact_page_skip(
- WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp)
+static void
+__compact_page_skip(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, uint32_t size, bool *skipp)
{
WT_EXT *ext;
WT_EXTLIST *el;
- wt_off_t limit, offset;
- uint32_t checksum, objectid, size;
+ wt_off_t limit;
*skipp = true; /* Return a default skip. */
- /* Crack the cookie. */
- WT_RET(__wt_block_addr_unpack(
- session, block, addr, addr_size, &objectid, &offset, &size, &checksum));
-
/*
* If this block is in the chosen percentage of the file and there's a block on the available
- * list that's appears before that percentage of the file, rewrite the block. Checking the
+ * list that appears before that percentage of the file, rewrite the block. Checking the
* available list is necessary (otherwise writing the block would extend the file), but there's
* an obvious race if the file is sufficiently busy.
*/
@@ -174,17 +196,117 @@ __wt_block_compact_page_skip(
}
}
__wt_spin_unlock(session, &block->live_lock);
+}
+
+/*
+ * __wt_block_compact_page_skip --
+ * Return if writing a particular page will shrink the file.
+ */
+int
+__wt_block_compact_page_skip(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp)
+{
+ wt_off_t offset;
+ uint32_t size, checksum, objectid;
+
+ WT_UNUSED(addr_size);
+ *skipp = true; /* Return a default skip. */
+ offset = 0;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_addr_unpack(
+ session, block, addr, addr_size, &objectid, &offset, &size, &checksum));
+
+ __compact_page_skip(session, block, offset, size, skipp);
++block->compact_pages_reviewed;
if (*skipp)
++block->compact_pages_skipped;
else
- ++block->compact_pages_written;
+ ++block->compact_cache_pages_dealt;
return (0);
}
/*
+ * __wt_block_compact_page_rewrite --
+ * Rewrite a page if it will shrink the file: copy the block to a newly allocated offset, free the original block and repack the address cookie in place in addr (new length in *addr_sizep). *skipp stays true when the block is not worth moving.
+ */
+int
+__wt_block_compact_page_rewrite(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, bool *skipp)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ wt_off_t offset, new_offset;
+ uint32_t size, checksum, objectid;
+ uint8_t *endp;
+ bool discard_block;
+
+ *skipp = true; /* Return a default skip. */
+ new_offset = 0; /* -Werror=maybe-uninitialized */
+
+ discard_block = false;
+
+ WT_ERR(__wt_block_addr_unpack(
+ session, block, addr, *addr_sizep, &objectid, &offset, &size, &checksum));
+
+ /* Check if the block is worth rewriting. */
+ __compact_page_skip(session, block, offset, size, skipp);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT) ||
+ WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS)) { /* Counters are only maintained when compact verbose output is enabled. */
+ ++block->compact_pages_reviewed;
+ if (*skipp)
+ ++block->compact_pages_skipped;
+ else
+ ++block->compact_blocks_moved;
+ }
+ if (*skipp)
+ return (0); /* Neither the scratch buffer nor a replacement block has been allocated yet. */
+
+ /* Read the block. */
+ WT_ERR(__wt_scr_alloc(session, size, &tmp));
+ WT_ERR(__wt_read(session, block->fh, offset, size, tmp->mem));
+
+ /* Allocate a replacement block. */
+ WT_ERR(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_alloc(session, block, &new_offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_ERR(ret);
+ discard_block = true; /* From here on, the error path must release the new block. */
+
+ /* Write the block. */
+ WT_ERR(__wt_write(session, block->fh, new_offset, size, tmp->mem));
+
+ /* Free the original block. */
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_off_free(session, block, objectid, offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_ERR(ret);
+
+ /* Build the returned address cookie, overwriting the caller's addr buffer in place. */
+ endp = addr;
+ WT_ERR(__wt_block_addr_pack(block, &endp, objectid, new_offset, size, checksum));
+ *addr_sizep = WT_PTRDIFF(endp, addr);
+
+ WT_STAT_CONN_INCR(session, block_write);
+ WT_STAT_CONN_INCRV(session, block_byte_write, size);
+
+ discard_block = false; /* Success: keep the new block. */
+
+err:
+ if (discard_block) {
+ __wt_spin_lock(session, &block->live_lock);
+ WT_TRET(__wt_block_off_free(session, block, objectid, new_offset, (wt_off_t)size));
+ __wt_spin_unlock(session, &block->live_lock);
+ }
+ __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
* __block_dump_bucket_stat --
* Dump out the information about available and used blocks in the given bucket (part of the
* file).
@@ -237,8 +359,10 @@ __block_dump_file_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start)
session, WT_VERB_COMPACT, "pages reviewed: %" PRIu64, block->compact_pages_reviewed);
__wt_verbose(
session, WT_VERB_COMPACT, "pages skipped: %" PRIu64, block->compact_pages_skipped);
+ __wt_verbose(session, WT_VERB_COMPACT,
+ "cache pages read/flushed out of the cache: %" PRIu64, block->compact_cache_pages_dealt);
__wt_verbose(
- session, WT_VERB_COMPACT, "pages written: %" PRIu64, block->compact_pages_written);
+ session, WT_VERB_COMPACT, "blocks moved : %" PRIu64, block->compact_blocks_moved);
}
__wt_verbose(session, WT_VERB_COMPACT,
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
index 433d7342295..983de3d7c52 100644
--- a/src/third_party/wiredtiger/src/block/block_mgr.c
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -221,6 +221,32 @@ __bm_compact_end_readonly(WT_BM *bm, WT_SESSION_IMPL *session)
}
/*
+ * __bm_compact_page_rewrite --
+ * Rewrite a page for compaction by forwarding to __wt_block_compact_page_rewrite on bm->block. NOTE: writtenp is passed as that function's skipp argument, so despite its name it is true when the page was NOT rewritten.
+ */
+static int
+__bm_compact_page_rewrite(
+ WT_BM *bm, WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, bool *writtenp)
+{
+ return (__wt_block_compact_page_rewrite(session, bm->block, addr, addr_sizep, writtenp));
+}
+
+/*
+ * __bm_compact_page_rewrite_readonly --
+ * Rewrite a page for compaction; readonly version: ignores addr, addr_sizep and writtenp and returns whatever __bm_readonly returns.
+ */
+static int
+__bm_compact_page_rewrite_readonly(
+ WT_BM *bm, WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, bool *writtenp)
+{
+ WT_UNUSED(addr);
+ WT_UNUSED(addr_sizep);
+ WT_UNUSED(writtenp);
+
+ return (__bm_readonly(bm, session));
+}
+
+/*
* __bm_compact_page_skip --
* Return if a page is useful for compaction.
*/
@@ -247,6 +273,16 @@ __bm_compact_page_skip_readonly(
}
/*
+ * __bm_compact_progress --
+ * Output compact progress message by forwarding to __wt_block_compact_progress with this block manager's block; msg_countp is passed through unchanged.
+ */
+static void
+__bm_compact_progress(WT_BM *bm, WT_SESSION_IMPL *session, u_int *msg_countp)
+{
+ __wt_block_compact_progress(session, bm->block, msg_countp);
+}
+
+/*
* __bm_compact_skip --
* Return if a file can be compacted.
*/
@@ -584,7 +620,9 @@ __bm_method_set(WT_BM *bm, bool readonly)
bm->checkpoint_unload = __bm_checkpoint_unload;
bm->close = __bm_close;
bm->compact_end = __bm_compact_end;
+ bm->compact_page_rewrite = __bm_compact_page_rewrite;
bm->compact_page_skip = __bm_compact_page_skip;
+ bm->compact_progress = __bm_compact_progress;
bm->compact_skip = __bm_compact_skip;
bm->compact_start = __bm_compact_start;
bm->corrupt = __wt_bm_corrupt;
@@ -612,6 +650,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
bm->checkpoint_resolve = __bm_checkpoint_resolve_readonly;
bm->checkpoint_start = __bm_checkpoint_start_readonly;
bm->compact_end = __bm_compact_end_readonly;
+ bm->compact_page_rewrite = __bm_compact_page_rewrite_readonly;
bm->compact_page_skip = __bm_compact_page_skip_readonly;
bm->compact_skip = __bm_compact_skip_readonly;
bm->compact_start = __bm_compact_start_readonly;
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index 29e83c10beb..7680f2cbc79 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -9,11 +9,11 @@
#include "wt_internal.h"
/*
- * __compact_rewrite --
- * Return if a modified page needs to be re-written.
+ * __compact_page_inmem_check_addrs --
+ * Return if a clean, in-memory page needs to be re-written.
*/
static int
-__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
+__compact_page_inmem_check_addrs(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
WT_ADDR_COPY addr;
WT_BM *bm;
@@ -25,7 +25,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
bm = S2BT(session)->bm;
- /* If the page is clean, test the original addresses. */
+ /* If the page is currently clean, test the original addresses. */
if (__wt_page_evict_clean(ref->page))
return (__wt_ref_addr_copy(session, ref, &addr) ?
bm->compact_page_skip(bm, session, addr.addr, addr.size, skipp) :
@@ -53,63 +53,240 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
/*
- * __compact_rewrite_lock --
- * Return if a page needs to be re-written.
+ * __compact_page_inmem --
+ * Return if an in-memory page needs to be re-written.
*/
static int
-__compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
+__compact_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
- WT_BTREE *btree;
- WT_DECL_RET;
+ *skipp = true; /* Default skip. */
+
+ /*
+ * Ignore dirty pages, checkpoint will likely write them. There are cases where checkpoint can
+ * skip dirty pages: to avoid that, we could alter the transactional information of the page,
+ * which is what checkpoint reviews to decide if a page can be skipped. Not doing that for now,
+ * the repeated checkpoints that compaction requires are more than likely to pick up all dirty
+ * pages at some point.
+ *
+ * Check clean page addresses, and mark page and tree dirty if the page needs to be rewritten.
+ */
+ if (__wt_page_is_modified(ref->page))
+ *skipp = false;
+ else {
+ WT_RET(__compact_page_inmem_check_addrs(session, ref, skipp));
+
+ if (!*skipp) {
+ WT_RET(__wt_page_modify_init(session, ref->page));
+ __wt_page_modify_set(session, ref->page);
+ }
+ }
+
+ /* If rewriting the page, have reconciliation write new blocks. */
+ if (!*skipp)
+ F_SET_ATOMIC(ref->page, WT_PAGE_COMPACTION_WRITE);
- btree = S2BT(session);
+ return (0);
+}
+
+/*
+ * __compact_page_replace_addr --
+ * Replace a page's WT_ADDR.
+ */
+static int
+__compact_page_replace_addr(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK_ADDR unpack;
+ WT_DECL_RET;
/*
- * Reviewing in-memory pages requires looking at page reconciliation results, because we care
- * about where the page is stored now, not where the page was stored when we first read it into
- * the cache. We need to ensure we don't race with page reconciliation as it's writing the page
- * modify information. There are two ways we call reconciliation: checkpoints and eviction. We
- * are holding a hazard pointer that blocks eviction, but there's nothing blocking a checkpoint.
- * Get the tree's flush lock which blocks threads writing pages for checkpoints. If checkpoint
- * is holding the lock, quit working this file, we'll visit it again in our next pass.
+ * If the address is still an on-page cell (it has never been instantiated), unpack it and allocate
+ * a new WT_ADDR structure; otherwise the address has already been instantiated, replace the cookie.
*/
- WT_RET(__wt_spin_trylock(session, &btree->flush_lock));
+ addr = ref->addr;
+ WT_ASSERT(session, addr != NULL);
+
+ if (__wt_off_page(ref->home, addr))
+ __wt_free(session, addr->addr);
+ else {
+ __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &unpack);
+
+ WT_RET(__wt_calloc_one(session, &addr));
+ addr->ta.newest_start_durable_ts = unpack.ta.newest_start_durable_ts;
+ addr->ta.newest_stop_durable_ts = unpack.ta.newest_stop_durable_ts;
+ addr->ta.oldest_start_ts = unpack.ta.oldest_start_ts;
+ addr->ta.newest_txn = unpack.ta.newest_txn;
+ addr->ta.newest_stop_ts = unpack.ta.newest_stop_ts;
+ addr->ta.newest_stop_txn = unpack.ta.newest_stop_txn;
+ switch (unpack.raw) {
+ case WT_CELL_ADDR_INT:
+ addr->type = WT_ADDR_INT;
+ break;
+ case WT_CELL_ADDR_LEAF:
+ addr->type = WT_ADDR_LEAF;
+ break;
+ case WT_CELL_ADDR_LEAF_NO:
+ addr->type = WT_ADDR_LEAF_NO;
+ break;
+ }
+ }
- ret = __compact_rewrite(session, ref, skipp);
+ WT_ERR(__wt_strndup(session, copy->addr, copy->size, &addr->addr));
+ addr->size = copy->size;
- /* Unblock threads writing leaf pages. */
- __wt_spin_unlock(session, &btree->flush_lock);
+ ref->addr = addr;
+ return (0);
+err:
+ if (addr != ref->addr)
+ __wt_free(session, addr);
return (ret);
}
/*
- * __compact_progress --
- * Output a compact progress message.
+ * __compact_page --
+ * Compaction for a single page.
*/
-static void
-__compact_progress(WT_SESSION_IMPL *session)
+static int
+__compact_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
- struct timespec cur_time;
+ WT_ADDR_COPY copy;
WT_BM *bm;
- uint64_t time_diff;
+ WT_DECL_RET;
+ size_t addr_size;
+ uint8_t previous_state;
- if (!WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS))
- return;
+ *skipp = true; /* Default skip. */
- bm = S2BT(session)->bm;
- __wt_epoch(session, &cur_time);
-
- /* Log one progress message every twenty seconds. */
- time_diff = WT_TIMEDIFF_SEC(cur_time, session->compact->begin);
- if (time_diff / WT_PROGRESS_MSG_PERIOD > session->compact->prog_msg_count) {
- __wt_verbose(session, WT_VERB_COMPACT_PROGRESS,
- "Compact running for %" PRIu64 " seconds; reviewed %" PRIu64 " pages, skipped %" PRIu64
- " pages, wrote %" PRIu64 " pages",
- time_diff, bm->block->compact_pages_reviewed, bm->block->compact_pages_skipped,
- bm->block->compact_pages_written);
- session->compact->prog_msg_count++;
+ /* Lock the WT_REF. */
+ WT_REF_LOCK(session, ref, &previous_state);
+
+ /*
+ * Skip deleted pages but consider them progress (the on-disk block is discarded by the next
+ * checkpoint).
+ */
+ if (previous_state == WT_REF_DELETED)
+ *skipp = false;
+
+ /*
+ * If it's on-disk, get a copy of the address and ask the block manager to rewrite the block if
+ * it's useful. This is safe because we're holding the WT_REF locked, so nobody can read the
+ * page giving eviction a chance to modify the address.
+ *
+ * In this path, we are holding the WT_REF lock across two OS buffer cache I/Os (the read of the
+ * original block and the write of the new block), plus whatever overhead that entails. It's not
+ * ideal, we could release the lock, but then we'd have to deal with the block having been read
+ * into memory while we were moving it.
+ */
+ if (previous_state == WT_REF_DISK && __wt_ref_addr_copy(session, ref, &copy)) {
+ bm = S2BT(session)->bm;
+ addr_size = copy.size;
+ WT_ERR(bm->compact_page_rewrite(bm, session, copy.addr, &addr_size, skipp));
+ if (!*skipp) {
+ copy.size = (uint8_t)addr_size;
+ WT_ERR(__compact_page_replace_addr(session, ref, &copy));
+ WT_STAT_DATA_INCR(session, btree_compact_pages_rewritten);
+ }
}
+
+ /*
+ * Ignore pages that aren't in-memory for some reason other than they're on-disk, for example,
+ * they might have split or been deleted while we were locking the WT_REF. This includes the
+ * case where we found an on-disk page and either rewrote the block successfully or failed to
+ * get a copy of the address (which shouldn't ever happen, but if that goes wrong, it's not our
+ * problem to solve).
+ *
+ * In this path, we are holding the WT_REF lock across some in-memory checks and possibly one or
+ * more calls to the underlying block manager which is going to search the list of extents to
+ * figure out if the block is worth rewriting. It's not ideal because we're blocking the
+ * application's worker threads: we could release the lock, but then we'd have to acquire a
+ * hazard pointer to ensure eviction didn't select the page.
+ */
+ if (previous_state == WT_REF_MEM) {
+ WT_ERR(__compact_page_inmem(session, ref, skipp));
+ if (!*skipp)
+ WT_STAT_DATA_INCR(session, btree_compact_pages_rewritten);
+ }
+
+err:
+ WT_REF_UNLOCK(ref, previous_state);
+
+ return (ret);
+}
+
+/*
+ * __compact_walk_internal --
+ * Walk an internal page for compaction: review each leaf child and, if none made progress, the internal page itself (root excluded), all while holding the tree's flush lock.
+ */
+static int
+__compact_walk_internal(WT_SESSION_IMPL *session, WT_REF *parent)
+{
+ WT_DECL_RET;
+ WT_REF *ref;
+ bool overall_progress, skipp;
+
+ ref = NULL; /* [-Wconditional-uninitialized] */
+
+ /*
+ * We could corrupt a checkpoint if we moved a block that's part of the checkpoint, that is, if
+ * we race with checkpoint's review of the tree. Get the tree's flush lock which blocks threads
+ * writing pages for checkpoints, and hold it long enough to review a single internal page. Quit
+ * working the file if checkpoint is holding the lock, checkpoint holds the lock for relatively
+ * long periods.
+ */
+ WT_RET(__wt_spin_trylock(session, &S2BT(session)->flush_lock));
+
+ /*
+ * Walk the internal page and check any leaf pages it references; skip internal pages, we'll
+ * visit them individually.
+ */
+ overall_progress = false;
+ WT_INTL_FOREACH_BEGIN (session, parent->page, ref) {
+ if (F_ISSET(ref, WT_REF_FLAG_LEAF)) {
+ WT_ERR(__compact_page(session, ref, &skipp));
+ if (!skipp)
+ overall_progress = true;
+ }
+ }
+ WT_INTL_FOREACH_END;
+
+ /*
+ * If we moved a leaf page, we'll write the parent. If we didn't move a leaf page, check pages
+ * other than the root to see if we want to move the internal page itself. (Skip the root as a
+ * forced checkpoint will always rewrite it, and you can't just "move" a root page.)
+ */
+ if (!overall_progress && !__wt_ref_is_root(parent)) {
+ WT_ERR(__compact_page(session, parent, &skipp));
+ if (!skipp)
+ overall_progress = true;
+ }
+
+ /* If we found a page to compact, mark the parent and tree dirty and report success. */
+ if (overall_progress) {
+ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* NOTE(review): passes ref, the loop variable left over from the walk above, not parent -- confirm this is intended. */
+ session->compact_state = WT_COMPACT_SUCCESS;
+ }
+
+err:
+ /* Unblock checkpoint threads. */
+ __wt_spin_unlock(session, &S2BT(session)->flush_lock);
+
+ return (ret);
+}
+
+/*
+ * __compact_walk_page_skip --
+ * Skip leaf pages, all we want are internal pages. Passed by __wt_compact to __wt_tree_walk_custom_skip as its skip callback; session and context are unused and the return value is always 0.
+ */
+static int
+__compact_walk_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
+{
+ WT_UNUSED(context);
+ WT_UNUSED(session);
+
+ /* All we want are the internal pages: *skipp is true exactly when ref carries WT_REF_FLAG_LEAF. */
+ *skipp = F_ISSET(ref, WT_REF_FLAG_LEAF) ? true : false;
+ return (0);
}
/*
@@ -122,7 +299,7 @@ __wt_compact(WT_SESSION_IMPL *session)
WT_BM *bm;
WT_DECL_RET;
WT_REF *ref;
- u_int i;
+ u_int i, msg_count;
bool skip;
bm = S2BT(session)->bm;
@@ -131,8 +308,8 @@ __wt_compact(WT_SESSION_IMPL *session)
WT_STAT_DATA_INCR(session, session_compact);
/*
- * Check if compaction might be useful -- the API layer will quit trying to compact the data
- * source if we make no progress, set a flag if the block layer thinks compaction is possible.
+ * Check if compaction might be useful (the API layer will quit trying to compact the data
+ * source if we make no progress).
*/
WT_RET(bm->compact_skip(bm, session, &skip));
if (skip) {
@@ -148,14 +325,14 @@ __wt_compact(WT_SESSION_IMPL *session)
WT_STAT_DATA_SET(session, btree_compact_pages_reviewed, bm->block->compact_pages_reviewed);
WT_STAT_DATA_SET(session, btree_compact_pages_skipped, bm->block->compact_pages_skipped);
WT_STAT_DATA_SET(
- session, btree_compact_pages_write_selected, bm->block->compact_pages_written);
+ session, btree_compact_pages_write_selected, bm->block->compact_cache_pages_dealt);
/*
* Periodically check if we've timed out or eviction is stuck. Quit if eviction is stuck,
* we're making the problem worse.
*/
if (++i > 100) {
- __compact_progress(session);
+ bm->compact_progress(bm, session, &msg_count);
WT_ERR(__wt_session_compact_check_timeout(session));
if (__wt_cache_stuck(session))
@@ -176,94 +353,16 @@ __wt_compact(WT_SESSION_IMPL *session)
* evicted quickly.
*/
WT_ERR(__wt_tree_walk_custom_skip(
- session, &ref, __wt_compact_page_skip, NULL, WT_READ_NO_GEN | WT_READ_WONT_NEED));
+ session, &ref, __compact_walk_page_skip, NULL, WT_READ_NO_GEN | WT_READ_WONT_NEED));
if (ref == NULL)
break;
- /*
- * Cheap checks that don't require locking.
- *
- * Ignore the root: it may not have a replacement address, and besides, if anything else
- * gets written, so will it.
- *
- * Ignore dirty pages, checkpoint will likely write them. There are cases where checkpoint
- * can skip dirty pages: to avoid that, we could alter the transactional information of the
- * page, which is what checkpoint reviews to decide if a page can be skipped. Not doing that
- * for now, the repeated checkpoints that compaction requires are more than likely to pick
- * up all dirty pages at some point.
- */
- if (__wt_ref_is_root(ref))
- continue;
- if (__wt_page_is_modified(ref->page))
- continue;
-
- WT_ERR(__compact_rewrite_lock(session, ref, &skip));
- if (skip)
- continue;
-
- /* Rewrite the page: mark the page and tree dirty. */
- WT_ERR(__wt_page_modify_init(session, ref->page));
- __wt_page_modify_set(session, ref->page);
-
- session->compact_state = WT_COMPACT_SUCCESS;
- WT_STAT_DATA_INCR(session, btree_compact_pages_rewritten);
+ WT_WITH_PAGE_INDEX(session, ret = __compact_walk_internal(session, ref));
+ WT_ERR(ret);
}
err:
- if (ref != NULL)
- WT_TRET(__wt_page_release(session, ref, 0));
+ WT_TRET(__wt_page_release(session, ref, 0));
return (ret);
}
-
-/*
- * __wt_compact_page_skip --
- * Return if compaction requires we read this page.
- */
-int
-__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
-{
- WT_ADDR_COPY addr;
- WT_BM *bm;
- uint8_t previous_state;
- bool diskaddr;
-
- WT_UNUSED(context);
-
- *skipp = false; /* Default to reading */
-
- /* Internal pages must be read to walk the tree. */
- if (F_ISSET(ref, WT_REF_FLAG_INTERNAL))
- return (0);
-
- /*
- * Skip deleted pages, rewriting them doesn't seem useful; in a better world we'd write the
- * parent to delete the page.
- */
- if (ref->state == WT_REF_DELETED) {
- *skipp = true;
- return (0);
- }
-
- /*
- * If the page is in-memory, we want to look at it (it may have been modified and written, and
- * the current location is the interesting one in terms of compaction, not the original).
- */
- if (ref->state != WT_REF_DISK)
- return (0);
-
- /*
- * Lock the WT_REF and if it's still on-disk, get a copy of the address. This is safe because
- * it's an on-disk page and we're holding the WT_REF locked, so nobody can read the page giving
- * either checkpoint or eviction a chance to modify the address.
- */
- WT_REF_LOCK(session, ref, &previous_state);
- diskaddr = previous_state == WT_REF_DISK && __wt_ref_addr_copy(session, ref, &addr);
- WT_REF_UNLOCK(ref, previous_state);
- if (!diskaddr)
- return (0);
-
- /* Ask the block-manager if it's useful to rewrite the page. */
- bm = S2BT(session)->bm;
- return (bm->compact_page_skip(bm, session, addr.addr, addr.size, skipp));
-}
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
index 7b4fa732027..6f44c2a0ac3 100644
--- a/src/third_party/wiredtiger/src/include/block.h
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -181,8 +181,10 @@ struct __wt_bm {
int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
int (*close)(WT_BM *, WT_SESSION_IMPL *);
int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_page_rewrite)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, bool *);
int (*compact_page_skip)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, bool *);
int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
+ void (*compact_progress)(WT_BM *, WT_SESSION_IMPL *, u_int *);
int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
int (*corrupt)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
@@ -276,10 +278,11 @@ struct __wt_block {
WT_CKPT *final_ckpt; /* Final live checkpoint write */
/* Compaction support */
- int compact_pct_tenths; /* Percent to compact */
- uint64_t compact_pages_reviewed; /* Pages reviewed */
- uint64_t compact_pages_skipped; /* Pages skipped */
- uint64_t compact_pages_written; /* Pages rewritten */
+ int compact_pct_tenths; /* Percent to compact */
+ uint64_t compact_blocks_moved; /* Pages moved by the block manager */
+ uint64_t compact_cache_pages_dealt; /* Pages dealt with in cache */
+ uint64_t compact_pages_reviewed; /* Pages reviewed */
+ uint64_t compact_pages_skipped; /* Pages skipped */
/* Salvage support */
wt_off_t slvg_off; /* Salvage file offset */
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 6382a1d5f59..ac13ed388ae 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -660,16 +660,17 @@ struct __wt_page {
uint8_t type; /* Page type */
/* AUTOMATIC FLAG VALUE GENERATION START 0 */
-#define WT_PAGE_BUILD_KEYS 0x01u /* Keys have been built in memory */
-#define WT_PAGE_DISK_ALLOC 0x02u /* Disk image in allocated memory */
-#define WT_PAGE_DISK_MAPPED 0x04u /* Disk image in mapped memory */
-#define WT_PAGE_EVICT_LRU 0x08u /* Page is on the LRU queue */
-#define WT_PAGE_EVICT_NO_PROGRESS 0x10u /* Eviction doesn't count as progress */
-#define WT_PAGE_OVERFLOW_KEYS 0x20u /* Page has overflow keys */
-#define WT_PAGE_SPLIT_INSERT 0x40u /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x80u /* Ignore updates on page discard */
- /* AUTOMATIC FLAG VALUE GENERATION STOP 8 */
- uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
+#define WT_PAGE_BUILD_KEYS 0x001u /* Keys have been built in memory */
+#define WT_PAGE_COMPACTION_WRITE 0x002u /* Writing the page for compaction */
+#define WT_PAGE_DISK_ALLOC 0x004u /* Disk image in allocated memory */
+#define WT_PAGE_DISK_MAPPED 0x008u /* Disk image in mapped memory */
+#define WT_PAGE_EVICT_LRU 0x010u /* Page is on the LRU queue */
+#define WT_PAGE_EVICT_NO_PROGRESS 0x020u /* Eviction doesn't count as progress */
+#define WT_PAGE_OVERFLOW_KEYS 0x040u /* Page has overflow keys */
+#define WT_PAGE_SPLIT_INSERT 0x080u /* A leaf page was split for append */
+#define WT_PAGE_UPDATE_IGNORE 0x100u /* Ignore updates on page discard */
+ /* AUTOMATIC FLAG VALUE GENERATION STOP 9 */
+ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h
index 637f4a60e51..bf1a31b2167 100644
--- a/src/third_party/wiredtiger/src/include/compact.h
+++ b/src/third_party/wiredtiger/src/include/compact.h
@@ -7,10 +7,9 @@
*/
struct __wt_compact_state {
- uint32_t lsm_count; /* Number of LSM trees seen */
- uint32_t file_count; /* Number of files seen */
- uint64_t max_time; /* Configured timeout */
- uint64_t prog_msg_count; /* Progress message count */
+ uint32_t lsm_count; /* Number of LSM trees seen */
+ uint32_t file_count; /* Number of files seen */
+ uint64_t max_time; /* Configured timeout */
struct timespec begin; /* Starting time */
};
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 0f4f3d84a67..acbb7a89646 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -129,6 +129,8 @@ extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_block_compact_page_rewrite(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr,
+ size_t *addr_sizep, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_compact_page_skip(
WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -397,8 +399,6 @@ extern int __wt_collator_config(WT_SESSION_IMPL *session, const char *uri, WT_CO
WT_CONFIG_ITEM *metadata, WT_COLLATOR **collatorp, int *ownp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_compact(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_compressor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
WT_COMPRESSOR **compressorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min,
@@ -1686,6 +1686,8 @@ extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((nor
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_backup_destroy(WT_SESSION_IMPL *session);
extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci);
+extern void __wt_block_compact_progress(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, u_int *msg_countp);
extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on);
extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext);
extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
index 762613f0373..a313fbb816b 100644
--- a/src/third_party/wiredtiger/src/include/hardware.h
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -33,6 +33,8 @@
#define F_SET_ATOMIC(p, mask) \
do { \
uint8_t __orig; \
+ if (F_ISSET_ATOMIC(p, mask)) \
+ break; \
do { \
__orig = (p)->flags_atomic; \
} while (!__wt_atomic_cas8(&(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
@@ -41,6 +43,8 @@
#define F_CLR_ATOMIC(p, mask) \
do { \
uint8_t __orig; \
+ if (!F_ISSET_ATOMIC(p, mask)) \
+ break; \
do { \
__orig = (p)->flags_atomic; \
} while (!__wt_atomic_cas8(&(p)->flags_atomic, __orig, __orig & ~(uint8_t)(mask))); \
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index a4f006b82c3..20a66b93c39 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -73,11 +73,9 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage
F_SET(session, WT_SESSION_NO_RECONCILE);
/*
- * Reconciliation locks the page for three reasons:
+ * Reconciliation locks the page for two reasons:
* Reconciliation reads the lists of page updates, obsolete updates
* cannot be discarded while reconciliation is in progress;
- * The compaction process reads page modification information, which
- * reconciliation modifies;
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
@@ -98,6 +96,9 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage
*/
ret = __reconcile(session, ref, salvage, flags, &page_locked);
+ /* If writing a page in service of compaction, we're done, clear the flag. */
+ F_CLR_ATOMIC(ref->page, WT_PAGE_COMPACTION_WRITE);
+
err:
if (page_locked)
WT_PAGE_UNLOCK(session, page);
@@ -392,14 +393,18 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
/*
* If a single root page was written (either an empty page or there was a 1-for-1 page swap),
- * we've written root and checkpoint, we're done. If the root page split, write the resulting
- * WT_REF array. We already have an infrastructure for writing pages, create a fake root page
- * and write it instead of adding code to write blocks based on the list of blocks resulting
- * from a multiblock reconciliation.
+ * we've written root and checkpoint, we're done. Clear the result of the reconciliation: a root
+ * page never has the structures that would normally be associated with (at least) the
+ * replaced-object flag. If the root page split, write the resulting WT_REF array. We already
+ * have an infrastructure for writing pages, create a fake root page and write it instead of
+ * adding code to write blocks based on the list of blocks resulting from a multiblock
+ * reconciliation.
+ *
*/
switch (mod->rec_result) {
case WT_PM_REC_EMPTY: /* Page is empty */
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ mod->rec_result = 0;
return (0);
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
break;
@@ -1635,12 +1640,11 @@ __rec_split_write_reuse(
multi->checksum = __wt_checksum(image->data, image->size);
/*
- * Don't check for a block match when writing blocks during compaction, the whole idea is to
- * move those blocks. Check after calculating the checksum, we don't distinguish between pages
- * written solely as part of the compaction and pages written at around the same time, and so
- * there's a possibility the calculated checksum will be useful in the future.
+ * Don't check for a block match when writing a page for compaction, the whole idea is to move
+ * those blocks. Check after calculating the checksum, there's a possibility the calculated
+ * checksum will be useful in the future.
*/
- if (session->compact_state != WT_COMPACT_NONE)
+ if (F_ISSET_ATOMIC(r->page, WT_PAGE_COMPACTION_WRITE))
return (false);
/*
diff --git a/src/third_party/wiredtiger/test/suite/test_compact02.py b/src/third_party/wiredtiger/test/suite/test_compact02.py
index 81d636eee6b..a0b162dbaf2 100755
--- a/src/third_party/wiredtiger/test/suite/test_compact02.py
+++ b/src/third_party/wiredtiger/test/suite/test_compact02.py
@@ -110,6 +110,8 @@ class test_compact02(wttest.WiredTigerTestCase):
# Create a table, add keys with both big and small values.
def test_compact02(self):
+ mb = 1024 * 1024
+
# FIXME-WT-7187
# This test is temporarily disabled for OS/X, it fails, but not consistently.
import platform
@@ -118,8 +120,9 @@ class test_compact02(wttest.WiredTigerTestCase):
self.ConnectionOpen(self.cacheSize)
- mb = 1024 * 1024
- params = 'key_format=i,value_format=S,' + self.fileConfig
+ # Set the leaf_value_max to ensure we never create overflow items.
+ # FIXME: WT-2298
+ params = 'key_format=i,value_format=S,leaf_value_max=10MB,' + self.fileConfig
# 1. Create a table with the data, alternating record size.
self.session.create(self.uri, params)