diff options
author | Etienne Petrel <etienne.petrel@mongodb.com> | 2021-10-19 06:19:36 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-10-19 07:35:46 +0000 |
commit | 7fc844336b4650cea78c19990a14bb453b6f7b14 (patch) | |
tree | df51ff34c1d838bcc942f84b9f46c8749908734a /src/third_party/wiredtiger | |
parent | 1da364de3bee98e82fe180dad1e0983d4b12f9a2 (diff) | |
download | mongo-7fc844336b4650cea78c19990a14bb453b6f7b14.tar.gz |
Import wiredtiger: 70ab26de2ab263fabab39114aee583f632f4e088 from branch mongodb-master
ref: bfcac76ea0..70ab26de2a
for: 5.2.0
WT-6001 Avoid reading the page into cache if it needs to be rewritten
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r-- | src/third_party/wiredtiger/dist/s_string.ok | 1 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/dist/s_void | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_compact.c | 154 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_mgr.c | 39 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_compact.c | 355 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/block.h | 11 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btmem.h | 21 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/compact.h | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/extern.h | 6 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/hardware.h | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 28 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/suite/test_compact02.py | 7 |
13 files changed, 458 insertions, 178 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index b355d29c901..83a238b625c 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -502,6 +502,7 @@ abcdef abcdefghijklmnopqrstuvwxyz addl addr +addrs agc alfred alloc diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 249c89fbbab..7ea8ce985dd 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -34,6 +34,7 @@ func_ok() -e '/int __bm_stat$/d' \ -e '/int __checkpoint_presync$/d' \ -e '/int __compact_uri_analyze$/d' \ + -e '/int __compact_walk_page_skip$/d' \ -e '/int __config_parser_close$/d' \ -e '/int __curlog_reset$/d' \ -e '/int __cursor_fix_implicit$/d' \ diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index ad1d4cce211..3226c4916c3 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "bfcac76ea0dae325f0134818fb32bbe134eec71d" + "commit": "70ab26de2ab263fabab39114aee583f632f4e088" } diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c index f6be40cfa78..28076e856b4 100644 --- a/src/third_party/wiredtiger/src/block/block_compact.c +++ b/src/third_party/wiredtiger/src/block/block_compact.c @@ -24,9 +24,10 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) /* Reset the compaction state information. */ block->compact_pct_tenths = 0; + block->compact_blocks_moved = 0; + block->compact_cache_pages_dealt = 0; block->compact_pages_reviewed = 0; block->compact_pages_skipped = 0; - block->compact_pages_written = 0; return (0); } @@ -51,6 +52,32 @@ __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) } /* + * __wt_block_compact_progress -- + * Output compact progress message. + */ +void +__wt_block_compact_progress(WT_SESSION_IMPL *session, WT_BLOCK *block, u_int *msg_countp) +{ + struct timespec cur_time; + uint64_t time_diff; + + if (!WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS)) + return; + + __wt_epoch(session, &cur_time); + + /* Log one progress message every twenty seconds. */ + time_diff = WT_TIMEDIFF_SEC(cur_time, session->compact->begin); + if (time_diff / WT_PROGRESS_MSG_PERIOD > *msg_countp) { + ++*msg_countp; + __wt_verbose(session, WT_VERB_COMPACT_PROGRESS, + " compacting %s for %" PRIu64 " seconds; reviewed %" PRIu64 " pages, skipped %" PRIu64 + " pages, cache pages evicted %" PRIu64 ", on-disk pages moved %" PRIu64, + block->name, time_diff, block->compact_pages_reviewed, block->compact_pages_skipped, + block->compact_cache_pages_dealt, block->compact_blocks_moved); + } +} +/* * __wt_block_compact_skip -- * Return if compaction will shrink the file. */ @@ -117,7 +144,7 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) "%s: total reviewed %" PRIu64 " pages, total skipped %" PRIu64 " pages, total wrote %" PRIu64 " pages", block->name, block->compact_pages_reviewed, block->compact_pages_skipped, - block->compact_pages_written); + block->compact_cache_pages_dealt); __wt_verbose(session, WT_VERB_COMPACT, "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first 80%% of the file", block->name, (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty); @@ -136,27 +163,22 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) } /* - * __wt_block_compact_page_skip -- + * __compact_page_skip -- * Return if writing a particular page will shrink the file. */ -int -__wt_block_compact_page_skip( - WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp) +static void +__compact_page_skip( + WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, uint32_t size, bool *skipp) { WT_EXT *ext; WT_EXTLIST *el; - wt_off_t limit, offset; - uint32_t checksum, objectid, size; + wt_off_t limit; *skipp = true; /* Return a default skip. */ - /* Crack the cookie. */ - WT_RET(__wt_block_addr_unpack( - session, block, addr, addr_size, &objectid, &offset, &size, &checksum)); - /* * If this block is in the chosen percentage of the file and there's a block on the available - * list that's appears before that percentage of the file, rewrite the block. Checking the + * list that appears before that percentage of the file, rewrite the block. Checking the * available list is necessary (otherwise writing the block would extend the file), but there's * an obvious race if the file is sufficiently busy. */ @@ -174,17 +196,117 @@ __wt_block_compact_page_skip( } } __wt_spin_unlock(session, &block->live_lock); +} + +/* + * __wt_block_compact_page_skip -- + * Return if writing a particular page will shrink the file. + */ +int +__wt_block_compact_page_skip( + WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp) +{ + wt_off_t offset; + uint32_t size, checksum, objectid; + + WT_UNUSED(addr_size); + *skipp = true; /* Return a default skip. */ + offset = 0; + + /* Crack the cookie. */ + WT_RET(__wt_block_addr_unpack( + session, block, addr, addr_size, &objectid, &offset, &size, &checksum)); + + __compact_page_skip(session, block, offset, size, skipp); ++block->compact_pages_reviewed; if (*skipp) ++block->compact_pages_skipped; else - ++block->compact_pages_written; + ++block->compact_cache_pages_dealt; return (0); } /* + * __wt_block_compact_page_rewrite -- + * Rewrite a page if it will shrink the file. + */ +int +__wt_block_compact_page_rewrite( + WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, bool *skipp) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + wt_off_t offset, new_offset; + uint32_t size, checksum, objectid; + uint8_t *endp; + bool discard_block; + + *skipp = true; /* Return a default skip. */ + new_offset = 0; /* -Werror=maybe-uninitialized */ + + discard_block = false; + + WT_ERR(__wt_block_addr_unpack( + session, block, addr, *addr_sizep, &objectid, &offset, &size, &checksum)); + + /* Check if the block is worth rewriting. */ + __compact_page_skip(session, block, offset, size, skipp); + + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT) || + WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS)) { + ++block->compact_pages_reviewed; + if (*skipp) + ++block->compact_pages_skipped; + else + ++block->compact_blocks_moved; + } + if (*skipp) + return (0); + + /* Read the block. */ + WT_ERR(__wt_scr_alloc(session, size, &tmp)); + WT_ERR(__wt_read(session, block->fh, offset, size, tmp->mem)); + + /* Allocate a replacement block. */ + WT_ERR(__wt_block_ext_prealloc(session, 5)); + __wt_spin_lock(session, &block->live_lock); + ret = __wt_block_alloc(session, block, &new_offset, (wt_off_t)size); + __wt_spin_unlock(session, &block->live_lock); + WT_ERR(ret); + discard_block = true; + + /* Write the block. */ + WT_ERR(__wt_write(session, block->fh, new_offset, size, tmp->mem)); + + /* Free the original block. */ + __wt_spin_lock(session, &block->live_lock); + ret = __wt_block_off_free(session, block, objectid, offset, (wt_off_t)size); + __wt_spin_unlock(session, &block->live_lock); + WT_ERR(ret); + + /* Build the returned address cookie. */ + endp = addr; + WT_ERR(__wt_block_addr_pack(block, &endp, objectid, new_offset, size, checksum)); + *addr_sizep = WT_PTRDIFF(endp, addr); + + WT_STAT_CONN_INCR(session, block_write); + WT_STAT_CONN_INCRV(session, block_byte_write, size); + + discard_block = false; + +err: + if (discard_block) { + __wt_spin_lock(session, &block->live_lock); + WT_TRET(__wt_block_off_free(session, block, objectid, new_offset, (wt_off_t)size)); + __wt_spin_unlock(session, &block->live_lock); + } + __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __block_dump_bucket_stat -- * Dump out the information about available and used blocks in the given bucket (part of the * file). @@ -237,8 +359,10 @@ __block_dump_file_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start) session, WT_VERB_COMPACT, "pages reviewed: %" PRIu64, block->compact_pages_reviewed); __wt_verbose( session, WT_VERB_COMPACT, "pages skipped: %" PRIu64, block->compact_pages_skipped); + __wt_verbose(session, WT_VERB_COMPACT, + "cache pages read/flushed out of the cache: %" PRIu64, block->compact_cache_pages_dealt); __wt_verbose( - session, WT_VERB_COMPACT, "pages written: %" PRIu64, block->compact_pages_written); + session, WT_VERB_COMPACT, "blocks moved : %" PRIu64, block->compact_blocks_moved); } __wt_verbose(session, WT_VERB_COMPACT, diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c index 433d7342295..983de3d7c52 100644 --- a/src/third_party/wiredtiger/src/block/block_mgr.c +++ b/src/third_party/wiredtiger/src/block/block_mgr.c @@ -221,6 +221,32 @@ __bm_compact_end_readonly(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_compact_page_rewrite -- + * Rewrite a page for compaction. + */ +static int +__bm_compact_page_rewrite( + WT_BM *bm, WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, bool *writtenp) +{ + return (__wt_block_compact_page_rewrite(session, bm->block, addr, addr_sizep, writtenp)); +} + +/* + * __bm_compact_page_rewrite_readonly -- + * Rewrite a page for compaction; readonly version. + */ +static int +__bm_compact_page_rewrite_readonly( + WT_BM *bm, WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, bool *writtenp) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_sizep); + WT_UNUSED(writtenp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_page_skip -- * Return if a page is useful for compaction. */ @@ -247,6 +273,16 @@ __bm_compact_page_skip_readonly( } /* + * __bm_compact_progress -- + * Output compact progress message. + */ +static void +__bm_compact_progress(WT_BM *bm, WT_SESSION_IMPL *session, u_int *msg_countp) +{ + __wt_block_compact_progress(session, bm->block, msg_countp); +} + +/* * __bm_compact_skip -- * Return if a file can be compacted. */ @@ -584,7 +620,9 @@ __bm_method_set(WT_BM *bm, bool readonly) bm->checkpoint_unload = __bm_checkpoint_unload; bm->close = __bm_close; bm->compact_end = __bm_compact_end; + bm->compact_page_rewrite = __bm_compact_page_rewrite; bm->compact_page_skip = __bm_compact_page_skip; + bm->compact_progress = __bm_compact_progress; bm->compact_skip = __bm_compact_skip; bm->compact_start = __bm_compact_start; bm->corrupt = __wt_bm_corrupt; @@ -612,6 +650,7 @@ __bm_method_set(WT_BM *bm, bool readonly) bm->checkpoint_resolve = __bm_checkpoint_resolve_readonly; bm->checkpoint_start = __bm_checkpoint_start_readonly; bm->compact_end = __bm_compact_end_readonly; + bm->compact_page_rewrite = __bm_compact_page_rewrite_readonly; bm->compact_page_skip = __bm_compact_page_skip_readonly; bm->compact_skip = __bm_compact_skip_readonly; bm->compact_start = __bm_compact_start_readonly; diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index 29e83c10beb..7680f2cbc79 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -9,11 +9,11 @@ #include "wt_internal.h" /* - * __compact_rewrite -- - * Return if a modified page needs to be re-written. + * __compact_page_inmem_check_addrs -- + * Return if a clean, in-memory page needs to be re-written. */ static int -__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) +__compact_page_inmem_check_addrs(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_ADDR_COPY addr; WT_BM *bm; @@ -25,7 +25,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) bm = S2BT(session)->bm; - /* If the page is clean, test the original addresses. */ + /* If the page is currently clean, test the original addresses. */ if (__wt_page_evict_clean(ref->page)) return (__wt_ref_addr_copy(session, ref, &addr) ? bm->compact_page_skip(bm, session, addr.addr, addr.size, skipp) : @@ -53,63 +53,240 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } /* - * __compact_rewrite_lock -- - * Return if a page needs to be re-written. + * __compact_page_inmem -- + * Return if an in-memory page needs to be re-written. */ static int -__compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) +__compact_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { - WT_BTREE *btree; - WT_DECL_RET; + *skipp = true; /* Default skip. */ + + /* + * Ignore dirty pages, checkpoint will likely write them. There are cases where checkpoint can + * skip dirty pages: to avoid that, we could alter the transactional information of the page, + * which is what checkpoint reviews to decide if a page can be skipped. Not doing that for now, + * the repeated checkpoints that compaction requires are more than likely to pick up all dirty + * pages at some point. + * + * Check clean page addresses, and mark page and tree dirty if the page needs to be rewritten. + */ + if (__wt_page_is_modified(ref->page)) + *skipp = false; + else { + WT_RET(__compact_page_inmem_check_addrs(session, ref, skipp)); + + if (!*skipp) { + WT_RET(__wt_page_modify_init(session, ref->page)); + __wt_page_modify_set(session, ref->page); + } + } + + /* If rewriting the page, have reconciliation write new blocks. */ + if (!*skipp) + F_SET_ATOMIC(ref->page, WT_PAGE_COMPACTION_WRITE); - btree = S2BT(session); + return (0); +} + +/* + * __compact_page_replace_addr -- + * Replace a page's WT_ADDR. + */ +static int +__compact_page_replace_addr(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) +{ + WT_ADDR *addr; + WT_CELL_UNPACK_ADDR unpack; + WT_DECL_RET; /* - * Reviewing in-memory pages requires looking at page reconciliation results, because we care - * about where the page is stored now, not where the page was stored when we first read it into - * the cache. We need to ensure we don't race with page reconciliation as it's writing the page - * modify information. There are two ways we call reconciliation: checkpoints and eviction. We - * are holding a hazard pointer that blocks eviction, but there's nothing blocking a checkpoint. - * Get the tree's flush lock which blocks threads writing pages for checkpoints. If checkpoint - * is holding the lock, quit working this file, we'll visit it again in our next pass. + * If there's no address at all (the page has never been written), allocate a new WT_ADDR + * structure, otherwise, the address has already been instantiated, replace the cookie. */ - WT_RET(__wt_spin_trylock(session, &btree->flush_lock)); + addr = ref->addr; + WT_ASSERT(session, addr != NULL); + + if (__wt_off_page(ref->home, addr)) + __wt_free(session, addr->addr); + else { + __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &unpack); + + WT_RET(__wt_calloc_one(session, &addr)); + addr->ta.newest_start_durable_ts = unpack.ta.newest_start_durable_ts; + addr->ta.newest_stop_durable_ts = unpack.ta.newest_stop_durable_ts; + addr->ta.oldest_start_ts = unpack.ta.oldest_start_ts; + addr->ta.newest_txn = unpack.ta.newest_txn; + addr->ta.newest_stop_ts = unpack.ta.newest_stop_ts; + addr->ta.newest_stop_txn = unpack.ta.newest_stop_txn; + switch (unpack.raw) { + case WT_CELL_ADDR_INT: + addr->type = WT_ADDR_INT; + break; + case WT_CELL_ADDR_LEAF: + addr->type = WT_ADDR_LEAF; + break; + case WT_CELL_ADDR_LEAF_NO: + addr->type = WT_ADDR_LEAF_NO; + break; + } + } - ret = __compact_rewrite(session, ref, skipp); + WT_ERR(__wt_strndup(session, copy->addr, copy->size, &addr->addr)); + addr->size = copy->size; - /* Unblock threads writing leaf pages. */ - __wt_spin_unlock(session, &btree->flush_lock); + ref->addr = addr; + return (0); +err: + if (addr != ref->addr) + __wt_free(session, addr); return (ret); } /* - * __compact_progress -- - * Output a compact progress message. + * __compact_page -- + * Compaction for a single page. */ -static void -__compact_progress(WT_SESSION_IMPL *session) +static int +__compact_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { - struct timespec cur_time; + WT_ADDR_COPY copy; WT_BM *bm; - uint64_t time_diff; + WT_DECL_RET; + size_t addr_size; + uint8_t previous_state; - if (!WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS)) - return; + *skipp = true; /* Default skip. */ - bm = S2BT(session)->bm; - __wt_epoch(session, &cur_time); - - /* Log one progress message every twenty seconds. */ - time_diff = WT_TIMEDIFF_SEC(cur_time, session->compact->begin); - if (time_diff / WT_PROGRESS_MSG_PERIOD > session->compact->prog_msg_count) { - __wt_verbose(session, WT_VERB_COMPACT_PROGRESS, - "Compact running for %" PRIu64 " seconds; reviewed %" PRIu64 " pages, skipped %" PRIu64 - " pages, wrote %" PRIu64 " pages", - time_diff, bm->block->compact_pages_reviewed, bm->block->compact_pages_skipped, - bm->block->compact_pages_written); - session->compact->prog_msg_count++; + /* Lock the WT_REF. */ + WT_REF_LOCK(session, ref, &previous_state); + + /* + * Skip deleted pages but consider them progress (the on-disk block is discarded by the next + * checkpoint). + */ + if (previous_state == WT_REF_DELETED) + *skipp = false; + + /* + * If it's on-disk, get a copy of the address and ask the block manager to rewrite the block if + * it's useful. This is safe because we're holding the WT_REF locked, so nobody can read the + * page giving eviction a chance to modify the address. + * + * In this path, we are holding the WT_REF lock across two OS buffer cache I/Os (the read of the + * original block and the write of the new block), plus whatever overhead that entails. It's not + * ideal, we could release the lock, but then we'd have to deal with the block having been read + * into memory while we were moving it. + */ + if (previous_state == WT_REF_DISK && __wt_ref_addr_copy(session, ref, ©)) { + bm = S2BT(session)->bm; + addr_size = copy.size; + WT_ERR(bm->compact_page_rewrite(bm, session, copy.addr, &addr_size, skipp)); + if (!*skipp) { + copy.size = (uint8_t)addr_size; + WT_ERR(__compact_page_replace_addr(session, ref, ©)); + WT_STAT_DATA_INCR(session, btree_compact_pages_rewritten); + } } + + /* + * Ignore pages that aren't in-memory for some reason other than they're on-disk, for example, + * they might have split or been deleted while we were locking the WT_REF. This includes the + * case where we found an on-disk page and either rewrite the block successfully or failed to + * get a copy of the address (which shouldn't ever happen, but if that goes wrong, it's not our + * problem to solve). + * + * In this path, we are holding the WT_REF lock across some in-memory checks and possibly one or + * more calls to the underlying block manager which is going to search the list of extents to + * figure out if the block is worth rewriting. It's not ideal because we're blocking the + * application's worker threads: we could release the lock, but then we'd have to acquire a + * hazard pointer to ensure eviction didn't select the page. + */ + if (previous_state == WT_REF_MEM) { + WT_ERR(__compact_page_inmem(session, ref, skipp)); + if (!*skipp) + WT_STAT_DATA_INCR(session, btree_compact_pages_rewritten); + } + +err: + WT_REF_UNLOCK(ref, previous_state); + + return (ret); +} + +/* + * __compact_walk_internal -- + * Walk an internal page for compaction. + */ +static int +__compact_walk_internal(WT_SESSION_IMPL *session, WT_REF *parent) +{ + WT_DECL_RET; + WT_REF *ref; + bool overall_progress, skipp; + + ref = NULL; /* [-Wconditional-uninitialized] */ + + /* + * We could corrupt a checkpoint if we moved a block that's part of the checkpoint, that is, if + * we race with checkpoint's review of the tree. Get the tree's flush lock which blocks threads + * writing pages for checkpoints, and hold it long enough to review a single internal page. Quit + * working the file if checkpoint is holding the lock, checkpoint holds the lock for relatively + * long periods. + */ + WT_RET(__wt_spin_trylock(session, &S2BT(session)->flush_lock)); + + /* + * Walk the internal page and check any leaf pages it references; skip internal pages, we'll + * visit them individually. + */ + overall_progress = false; + WT_INTL_FOREACH_BEGIN (session, parent->page, ref) { + if (F_ISSET(ref, WT_REF_FLAG_LEAF)) { + WT_ERR(__compact_page(session, ref, &skipp)); + if (!skipp) + overall_progress = true; + } + } + WT_INTL_FOREACH_END; + + /* + * If we moved a leaf page, we'll write the parent. If we didn't move a leaf page, check pages + * other than the root to see if we want to move the internal page itself. (Skip the root as a + * forced checkpoint will always rewrite it, and you can't just "move" a root page.) + */ + if (!overall_progress && !__wt_ref_is_root(parent)) { + WT_ERR(__compact_page(session, parent, &skipp)); + if (!skipp) + overall_progress = true; + } + + /* If we found a page to compact, mark the parent and tree dirty and report success. */ + if (overall_progress) { + WT_ERR(__wt_page_parent_modify_set(session, ref, false)); + session->compact_state = WT_COMPACT_SUCCESS; + } + +err: + /* Unblock checkpoint threads. */ + __wt_spin_unlock(session, &S2BT(session)->flush_lock); + + return (ret); +} + +/* + * __compact_walk_page_skip -- + * Skip leaf pages, all we want are internal pages. + */ +static int +__compact_walk_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) +{ + WT_UNUSED(context); + WT_UNUSED(session); + + /* All we want are the internal pages. */ + *skipp = F_ISSET(ref, WT_REF_FLAG_LEAF) ? true : false; + return (0); } /* @@ -122,7 +299,7 @@ __wt_compact(WT_SESSION_IMPL *session) WT_BM *bm; WT_DECL_RET; WT_REF *ref; - u_int i; + u_int i, msg_count; bool skip; bm = S2BT(session)->bm; @@ -131,8 +308,8 @@ __wt_compact(WT_SESSION_IMPL *session) WT_STAT_DATA_INCR(session, session_compact); /* - * Check if compaction might be useful -- the API layer will quit trying to compact the data - * source if we make no progress, set a flag if the block layer thinks compaction is possible. + * Check if compaction might be useful (the API layer will quit trying to compact the data + * source if we make no progress). */ WT_RET(bm->compact_skip(bm, session, &skip)); if (skip) { @@ -148,14 +325,14 @@ __wt_compact(WT_SESSION_IMPL *session) WT_STAT_DATA_SET(session, btree_compact_pages_reviewed, bm->block->compact_pages_reviewed); WT_STAT_DATA_SET(session, btree_compact_pages_skipped, bm->block->compact_pages_skipped); WT_STAT_DATA_SET( - session, btree_compact_pages_write_selected, bm->block->compact_pages_written); + session, btree_compact_pages_write_selected, bm->block->compact_cache_pages_dealt); /* * Periodically check if we've timed out or eviction is stuck. Quit if eviction is stuck, * we're making the problem worse. */ if (++i > 100) { - __compact_progress(session); + bm->compact_progress(bm, session, &msg_count); WT_ERR(__wt_session_compact_check_timeout(session)); if (__wt_cache_stuck(session)) @@ -176,94 +353,16 @@ __wt_compact(WT_SESSION_IMPL *session) * evicted quickly. */ WT_ERR(__wt_tree_walk_custom_skip( - session, &ref, __wt_compact_page_skip, NULL, WT_READ_NO_GEN | WT_READ_WONT_NEED)); + session, &ref, __compact_walk_page_skip, NULL, WT_READ_NO_GEN | WT_READ_WONT_NEED)); if (ref == NULL) break; - /* - * Cheap checks that don't require locking. - * - * Ignore the root: it may not have a replacement address, and besides, if anything else - * gets written, so will it. - * - * Ignore dirty pages, checkpoint will likely write them. There are cases where checkpoint - * can skip dirty pages: to avoid that, we could alter the transactional information of the - * page, which is what checkpoint reviews to decide if a page can be skipped. Not doing that - * for now, the repeated checkpoints that compaction requires are more than likely to pick - * up all dirty pages at some point. - */ - if (__wt_ref_is_root(ref)) - continue; - if (__wt_page_is_modified(ref->page)) - continue; - - WT_ERR(__compact_rewrite_lock(session, ref, &skip)); - if (skip) - continue; - - /* Rewrite the page: mark the page and tree dirty. */ - WT_ERR(__wt_page_modify_init(session, ref->page)); - __wt_page_modify_set(session, ref->page); - - session->compact_state = WT_COMPACT_SUCCESS; - WT_STAT_DATA_INCR(session, btree_compact_pages_rewritten); + WT_WITH_PAGE_INDEX(session, ret = __compact_walk_internal(session, ref)); + WT_ERR(ret); } err: - if (ref != NULL) - WT_TRET(__wt_page_release(session, ref, 0)); + WT_TRET(__wt_page_release(session, ref, 0)); return (ret); } - -/* - * __wt_compact_page_skip -- - * Return if compaction requires we read this page. - */ -int -__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) -{ - WT_ADDR_COPY addr; - WT_BM *bm; - uint8_t previous_state; - bool diskaddr; - - WT_UNUSED(context); - - *skipp = false; /* Default to reading */ - - /* Internal pages must be read to walk the tree. */ - if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) - return (0); - - /* - * Skip deleted pages, rewriting them doesn't seem useful; in a better world we'd write the - * parent to delete the page. - */ - if (ref->state == WT_REF_DELETED) { - *skipp = true; - return (0); - } - - /* - * If the page is in-memory, we want to look at it (it may have been modified and written, and - * the current location is the interesting one in terms of compaction, not the original). - */ - if (ref->state != WT_REF_DISK) - return (0); - - /* - * Lock the WT_REF and if it's still on-disk, get a copy of the address. This is safe because - * it's an on-disk page and we're holding the WT_REF locked, so nobody can read the page giving - * either checkpoint or eviction a chance to modify the address. - */ - WT_REF_LOCK(session, ref, &previous_state); - diskaddr = previous_state == WT_REF_DISK && __wt_ref_addr_copy(session, ref, &addr); - WT_REF_UNLOCK(ref, previous_state); - if (!diskaddr) - return (0); - - /* Ask the block-manager if it's useful to rewrite the page. */ - bm = S2BT(session)->bm; - return (bm->compact_page_skip(bm, session, addr.addr, addr.size, skipp)); -} diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index 7b4fa732027..6f44c2a0ac3 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -181,8 +181,10 @@ struct __wt_bm { int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *); int (*close)(WT_BM *, WT_SESSION_IMPL *); int (*compact_end)(WT_BM *, WT_SESSION_IMPL *); + int (*compact_page_rewrite)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, bool *); int (*compact_page_skip)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, bool *); int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *); + void (*compact_progress)(WT_BM *, WT_SESSION_IMPL *, u_int *); int (*compact_start)(WT_BM *, WT_SESSION_IMPL *); int (*corrupt)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); @@ -276,10 +278,11 @@ struct __wt_block { WT_CKPT *final_ckpt; /* Final live checkpoint write */ /* Compaction support */ - int compact_pct_tenths; /* Percent to compact */ - uint64_t compact_pages_reviewed; /* Pages reviewed */ - uint64_t compact_pages_skipped; /* Pages skipped */ - uint64_t compact_pages_written; /* Pages rewritten */ + int compact_pct_tenths; /* Percent to compact */ + uint64_t compact_blocks_moved; /* Pages moved by the block manager */ + uint64_t compact_cache_pages_dealt; /* Pages dealt with in cache */ + uint64_t compact_pages_reviewed; /* Pages reviewed */ + uint64_t compact_pages_skipped; /* Pages skipped */ /* Salvage support */ wt_off_t slvg_off; /* Salvage file offset */ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 6382a1d5f59..ac13ed388ae 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -660,16 +660,17 @@ struct __wt_page { uint8_t type; /* Page type */ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ -#define WT_PAGE_BUILD_KEYS 0x01u /* Keys have been built in memory */ -#define WT_PAGE_DISK_ALLOC 0x02u /* Disk image in allocated memory */ -#define WT_PAGE_DISK_MAPPED 0x04u /* Disk image in mapped memory */ -#define WT_PAGE_EVICT_LRU 0x08u /* Page is on the LRU queue */ -#define WT_PAGE_EVICT_NO_PROGRESS 0x10u /* Eviction doesn't count as progress */ -#define WT_PAGE_OVERFLOW_KEYS 0x20u /* Page has overflow keys */ -#define WT_PAGE_SPLIT_INSERT 0x40u /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x80u /* Ignore updates on page discard */ - /* AUTOMATIC FLAG VALUE GENERATION STOP 8 */ - uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ +#define WT_PAGE_BUILD_KEYS 0x001u /* Keys have been built in memory */ +#define WT_PAGE_COMPACTION_WRITE 0x002u /* Writing the page for compaction */ +#define WT_PAGE_DISK_ALLOC 0x004u /* Disk image in allocated memory */ +#define WT_PAGE_DISK_MAPPED 0x008u /* Disk image in mapped memory */ +#define WT_PAGE_EVICT_LRU 0x010u /* Page is on the LRU queue */ +#define WT_PAGE_EVICT_NO_PROGRESS 0x020u /* Eviction doesn't count as progress */ +#define WT_PAGE_OVERFLOW_KEYS 0x040u /* Page has overflow keys */ +#define WT_PAGE_SPLIT_INSERT 0x080u /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x100u /* Ignore updates on page discard */ + /* AUTOMATIC FLAG VALUE GENERATION STOP 9 */ + uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h index 637f4a60e51..bf1a31b2167 100644 --- a/src/third_party/wiredtiger/src/include/compact.h +++ b/src/third_party/wiredtiger/src/include/compact.h @@ -7,10 +7,9 @@ */ struct __wt_compact_state { - uint32_t lsm_count; /* Number of LSM trees seen */ - uint32_t file_count; /* Number of files seen */ - uint64_t max_time; /* Configured timeout */ - uint64_t prog_msg_count; /* Progress message count */ + uint32_t lsm_count; /* Number of LSM trees seen */ + uint32_t file_count; /* Number of files seen */ + uint64_t max_time; /* Configured timeout */ struct timespec begin; /* Starting time */ }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 0f4f3d84a67..acbb7a89646 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -129,6 +129,8 @@ extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_block_compact_page_rewrite(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, + size_t *addr_sizep, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_compact_page_skip( WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -397,8 +399,6 @@ extern int __wt_collator_config(WT_SESSION_IMPL *session, const char *uri, WT_CO WT_CONFIG_ITEM *metadata, WT_COLLATOR **collatorp, int *ownp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_compact(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_compressor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval, WT_COMPRESSOR **compressorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, @@ -1686,6 +1686,8 @@ extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((nor WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_backup_destroy(WT_SESSION_IMPL *session); extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci); +extern void __wt_block_compact_progress( + WT_SESSION_IMPL *session, WT_BLOCK *block, u_int *msg_countp); extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on); extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext); extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h index 762613f0373..a313fbb816b 100644 --- a/src/third_party/wiredtiger/src/include/hardware.h +++ b/src/third_party/wiredtiger/src/include/hardware.h @@ -33,6 +33,8 @@ #define F_SET_ATOMIC(p, mask) \ do { \ uint8_t __orig; \ + if (F_ISSET_ATOMIC(p, mask)) \ + break; \ do { \ __orig = (p)->flags_atomic; \ } while (!__wt_atomic_cas8(&(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ @@ -41,6 +43,8 @@ #define F_CLR_ATOMIC(p, mask) \ do { \ uint8_t __orig; \ + if (!F_ISSET_ATOMIC(p, mask)) \ + break; \ do { \ __orig = (p)->flags_atomic; \ } while (!__wt_atomic_cas8(&(p)->flags_atomic, __orig, __orig & ~(uint8_t)(mask))); \ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index a4f006b82c3..20a66b93c39 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -73,11 +73,9 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage F_SET(session, WT_SESSION_NO_RECONCILE); /* - * Reconciliation locks the page for three reasons: + * Reconciliation locks the page for two reasons: * Reconciliation reads the lists of page updates, obsolete updates * cannot be discarded while reconciliation is in progress; - * The compaction process reads page modification information, which - * reconciliation modifies; * In-memory splits: reconciliation of an internal page cannot handle * a child page splitting during the reconciliation. */ @@ -98,6 +96,9 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage */ ret = __reconcile(session, ref, salvage, flags, &page_locked); + /* If writing a page in service of compaction, we're done, clear the flag. */ + F_CLR_ATOMIC(ref->page, WT_PAGE_COMPACTION_WRITE); + err: if (page_locked) WT_PAGE_UNLOCK(session, page); @@ -392,14 +393,18 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) /* * If a single root page was written (either an empty page or there was a 1-for-1 page swap), - * we've written root and checkpoint, we're done. If the root page split, write the resulting - * WT_REF array. We already have an infrastructure for writing pages, create a fake root page - * and write it instead of adding code to write blocks based on the list of blocks resulting - * from a multiblock reconciliation. + * we've written root and checkpoint, we're done. Clear the result of the reconciliation, a root + * page never has the structures that would normally be associated with (at least), the + * replaced-object flag. If the root page split, write the resulting WT_REF array. We already + * have an infrastructure for writing pages, create a fake root page and write it instead of + * adding code to write blocks based on the list of blocks resulting from a multiblock + * reconciliation. + * */ switch (mod->rec_result) { case WT_PM_REC_EMPTY: /* Page is empty */ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + mod->rec_result = 0; return (0); case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ break; @@ -1635,12 +1640,11 @@ __rec_split_write_reuse( multi->checksum = __wt_checksum(image->data, image->size); /* - * Don't check for a block match when writing blocks during compaction, the whole idea is to - * move those blocks. Check after calculating the checksum, we don't distinguish between pages - * written solely as part of the compaction and pages written at around the same time, and so - * there's a possibility the calculated checksum will be useful in the future. + * Don't check for a block match when writing a page for compaction, the whole idea is to move + * those blocks. Check after calculating the checksum, there's a possibility the calculated + * checksum will be useful in the future. */ - if (session->compact_state != WT_COMPACT_NONE) + if (F_ISSET_ATOMIC(r->page, WT_PAGE_COMPACTION_WRITE)) return (false); /* diff --git a/src/third_party/wiredtiger/test/suite/test_compact02.py b/src/third_party/wiredtiger/test/suite/test_compact02.py index 81d636eee6b..a0b162dbaf2 100755 --- a/src/third_party/wiredtiger/test/suite/test_compact02.py +++ b/src/third_party/wiredtiger/test/suite/test_compact02.py @@ -110,6 +110,8 @@ class test_compact02(wttest.WiredTigerTestCase): # Create a table, add keys with both big and small values. def test_compact02(self): + mb = 1024 * 1024 + # FIXME-WT-7187 # This test is temporarily disabled for OS/X, it fails, but not consistently. import platform @@ -118,8 +120,9 @@ class test_compact02(wttest.WiredTigerTestCase): self.ConnectionOpen(self.cacheSize) - mb = 1024 * 1024 - params = 'key_format=i,value_format=S,' + self.fileConfig + # Set the leaf_value_max to ensure we never create overflow items. + # FIXME: WT-2298 + params = 'key_format=i,value_format=S,leaf_value_max=10MB,' + self.fileConfig # 1. Create a table with the data, alternating record size. self.session.create(self.uri, params) |