diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_compact.c')
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_compact.c | 404 |
1 files changed, 193 insertions, 211 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c index 85fc7303bc2..ca805d15d02 100644 --- a/src/third_party/wiredtiger/src/block/block_compact.c +++ b/src/third_party/wiredtiger/src/block/block_compact.c @@ -12,256 +12,238 @@ static void __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool); /* * __wt_block_compact_start -- - * Start compaction of a file. + * Start compaction of a file. */ int __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) { - WT_UNUSED(session); + WT_UNUSED(session); - /* Switch to first-fit allocation. */ - __wt_block_configure_first_fit(block, true); + /* Switch to first-fit allocation. */ + __wt_block_configure_first_fit(block, true); - /* Reset the compaction state information. */ - block->compact_pct_tenths = 0; - block->compact_pages_reviewed = 0; - block->compact_pages_skipped = 0; - block->compact_pages_written = 0; + /* Reset the compaction state information. */ + block->compact_pct_tenths = 0; + block->compact_pages_reviewed = 0; + block->compact_pages_skipped = 0; + block->compact_pages_written = 0; - return (0); + return (0); } /* * __wt_block_compact_end -- - * End compaction of a file. + * End compaction of a file. */ int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) { - /* Restore the original allocation plan. */ - __wt_block_configure_first_fit(block, false); - - /* Dump the results of the compaction pass. */ - if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { - __wt_spin_lock(session, &block->live_lock); - __block_dump_avail(session, block, false); - __wt_spin_unlock(session, &block->live_lock); - } - return (0); + /* Restore the original allocation plan. */ + __wt_block_configure_first_fit(block, false); + + /* Dump the results of the compaction pass. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + __wt_spin_lock(session, &block->live_lock); + __block_dump_avail(session, block, false); + __wt_spin_unlock(session, &block->live_lock); + } + return (0); } /* * __wt_block_compact_skip -- - * Return if compaction will shrink the file. + * Return if compaction will shrink the file. */ int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) { - WT_EXT *ext; - WT_EXTLIST *el; - wt_off_t avail_eighty, avail_ninety, eighty, ninety; - - *skipp = true; /* Return a default skip. */ - - /* - * We do compaction by copying blocks from the end of the file to the - * beginning of the file, and we need some metrics to decide if it's - * worth doing. Ignore small files, and files where we are unlikely - * to recover 10% of the file. - */ - if (block->size <= WT_MEGABYTE) - return (0); - - __wt_spin_lock(session, &block->live_lock); - - /* Dump the current state of the file. */ - if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) - __block_dump_avail(session, block, true); - - /* Sum the available bytes in the initial 80% and 90% of the file. */ - avail_eighty = avail_ninety = 0; - ninety = block->size - block->size / 10; - eighty = block->size - ((block->size / 10) * 2); - - el = &block->live.avail; - WT_EXT_FOREACH(ext, el->off) - if (ext->off < ninety) { - avail_ninety += ext->size; - if (ext->off < eighty) - avail_eighty += ext->size; - } - - /* - * Skip files where we can't recover at least 1MB. - * - * If at least 20% of the total file is available and in the first 80% - * of the file, we'll try compaction on the last 20% of the file; else, - * if at least 10% of the total file is available and in the first 90% - * of the file, we'll try compaction on the last 10% of the file. - * - * We could push this further, but there's diminishing returns, a mostly - * empty file can be processed quickly, so more aggressive compaction is - * less useful. - */ - if (avail_eighty > WT_MEGABYTE && - avail_eighty >= ((block->size / 10) * 2)) { - *skipp = false; - block->compact_pct_tenths = 2; - } else if (avail_ninety > WT_MEGABYTE && - avail_ninety >= block->size / 10) { - *skipp = false; - block->compact_pct_tenths = 1; - } - - __wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "80%% of the file", - block->name, - (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty); - __wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "90%% of the file", - block->name, - (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety); - __wt_verbose(session, WT_VERB_COMPACT, - "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " - "90%% of the file to perform compaction, compaction %s", - block->name, - (uintmax_t)(block->size / 10) / WT_MEGABYTE, - (uintmax_t)block->size / 10, - *skipp ? "skipped" : "proceeding"); - - __wt_spin_unlock(session, &block->live_lock); - - return (0); + WT_EXT *ext; + WT_EXTLIST *el; + wt_off_t avail_eighty, avail_ninety, eighty, ninety; + + *skipp = true; /* Return a default skip. */ + + /* + * We do compaction by copying blocks from the end of the file to the beginning of the file, and + * we need some metrics to decide if it's worth doing. Ignore small files, and files where we + * are unlikely to recover 10% of the file. + */ + if (block->size <= WT_MEGABYTE) + return (0); + + __wt_spin_lock(session, &block->live_lock); + + /* Dump the current state of the file. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) + __block_dump_avail(session, block, true); + + /* Sum the available bytes in the initial 80% and 90% of the file. */ + avail_eighty = avail_ninety = 0; + ninety = block->size - block->size / 10; + eighty = block->size - ((block->size / 10) * 2); + + el = &block->live.avail; + WT_EXT_FOREACH (ext, el->off) + if (ext->off < ninety) { + avail_ninety += ext->size; + if (ext->off < eighty) + avail_eighty += ext->size; + } + + /* + * Skip files where we can't recover at least 1MB. + * + * If at least 20% of the total file is available and in the first 80% + * of the file, we'll try compaction on the last 20% of the file; else, + * if at least 10% of the total file is available and in the first 90% + * of the file, we'll try compaction on the last 10% of the file. + * + * We could push this further, but there's diminishing returns, a mostly + * empty file can be processed quickly, so more aggressive compaction is + * less useful. + */ + if (avail_eighty > WT_MEGABYTE && avail_eighty >= ((block->size / 10) * 2)) { + *skipp = false; + block->compact_pct_tenths = 2; + } else if (avail_ninety > WT_MEGABYTE && avail_ninety >= block->size / 10) { + *skipp = false; + block->compact_pct_tenths = 1; + } + + __wt_verbose(session, WT_VERB_COMPACT, "%s: %" PRIuMAX "MB (%" PRIuMAX + ") available space in the first " + "80%% of the file", + block->name, (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty); + __wt_verbose(session, WT_VERB_COMPACT, "%s: %" PRIuMAX "MB (%" PRIuMAX + ") available space in the first " + "90%% of the file", + block->name, (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety); + __wt_verbose(session, WT_VERB_COMPACT, "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX + ") in the first " + "90%% of the file to perform compaction, compaction %s", + block->name, (uintmax_t)(block->size / 10) / WT_MEGABYTE, (uintmax_t)block->size / 10, + *skipp ? "skipped" : "proceeding"); + + __wt_spin_unlock(session, &block->live_lock); + + return (0); } /* * __wt_block_compact_page_skip -- - * Return if writing a particular page will shrink the file. + * Return if writing a particular page will shrink the file. */ int -__wt_block_compact_page_skip(WT_SESSION_IMPL *session, - WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp) +__wt_block_compact_page_skip( + WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool *skipp) { - WT_EXT *ext; - WT_EXTLIST *el; - wt_off_t limit, offset; - uint32_t size, checksum; - - WT_UNUSED(addr_size); - *skipp = true; /* Return a default skip. */ - - /* Crack the cookie. */ - WT_RET( - __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); - - /* - * If this block is in the chosen percentage of the file and there's a - * block on the available list that's appears before that percentage of - * the file, rewrite the block. Checking the available list is - * necessary (otherwise writing the block would extend the file), but - * there's an obvious race if the file is sufficiently busy. - */ - __wt_spin_lock(session, &block->live_lock); - limit = block->size - ((block->size / 10) * block->compact_pct_tenths); - if (offset > limit) { - el = &block->live.avail; - WT_EXT_FOREACH(ext, el->off) { - if (ext->off >= limit) - break; - if (ext->size >= size) { - *skipp = false; - break; - } - } - } - __wt_spin_unlock(session, &block->live_lock); - - if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { - ++block->compact_pages_reviewed; - if (*skipp) - ++block->compact_pages_skipped; - else - ++block->compact_pages_written; - } - - return (0); + WT_EXT *ext; + WT_EXTLIST *el; + wt_off_t limit, offset; + uint32_t size, checksum; + + WT_UNUSED(addr_size); + *skipp = true; /* Return a default skip. */ + + /* Crack the cookie. */ + WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + + /* + * If this block is in the chosen percentage of the file and there's a block on the available + * list that's appears before that percentage of the file, rewrite the block. Checking the + * available list is necessary (otherwise writing the block would extend the file), but there's + * an obvious race if the file is sufficiently busy. + */ + __wt_spin_lock(session, &block->live_lock); + limit = block->size - ((block->size / 10) * block->compact_pct_tenths); + if (offset > limit) { + el = &block->live.avail; + WT_EXT_FOREACH (ext, el->off) { + if (ext->off >= limit) + break; + if (ext->size >= size) { + *skipp = false; + break; + } + } + } + __wt_spin_unlock(session, &block->live_lock); + + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + ++block->compact_pages_reviewed; + if (*skipp) + ++block->compact_pages_skipped; + else + ++block->compact_pages_written; + } + + return (0); } /* * __block_dump_avail -- - * Dump out the avail list so we can see what compaction will look like. + * Dump out the avail list so we can see what compaction will look like. */ static void __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start) { - WT_EXT *ext; - WT_EXTLIST *el; - wt_off_t decile[10], percentile[100], size, v; - u_int i; - - el = &block->live.avail; - size = block->size; - - __wt_verbose(session, WT_VERB_COMPACT, - "============ %s", - start ? "testing for compaction" : "ending compaction pass"); - - if (!start) { - __wt_verbose(session, WT_VERB_COMPACT, - "pages reviewed: %" PRIu64, block->compact_pages_reviewed); - __wt_verbose(session, WT_VERB_COMPACT, - "pages skipped: %" PRIu64, block->compact_pages_skipped); - __wt_verbose(session, WT_VERB_COMPACT, - "pages written: %" PRIu64, block->compact_pages_written); - } - - __wt_verbose(session, WT_VERB_COMPACT, - "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX - "%% space available %" PRIuMAX "MB (%" PRIuMAX ")", - (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size, - ((uintmax_t)el->bytes * 100) / (uintmax_t)size, - (uintmax_t)el->bytes / WT_MEGABYTE, (uintmax_t)el->bytes); - - if (el->entries == 0) - return; - - /* - * Bucket the available memory into file deciles/percentiles. Large - * pieces of memory will cross over multiple buckets, assign to the - * decile/percentile in 512B chunks. - */ - memset(decile, 0, sizeof(decile)); - memset(percentile, 0, sizeof(percentile)); - WT_EXT_FOREACH(ext, el->off) - for (i = 0; i < ext->size / 512; ++i) { - ++decile[ - ((ext->off + (wt_off_t)i * 512) * 10) / size]; - ++percentile[ - ((ext->off + (wt_off_t)i * 512) * 100) / size]; - } + WT_EXT *ext; + WT_EXTLIST *el; + wt_off_t decile[10], percentile[100], size, v; + u_int i; + + el = &block->live.avail; + size = block->size; + + __wt_verbose(session, WT_VERB_COMPACT, "============ %s", + start ? "testing for compaction" : "ending compaction pass"); + + if (!start) { + __wt_verbose( + session, WT_VERB_COMPACT, "pages reviewed: %" PRIu64, block->compact_pages_reviewed); + __wt_verbose( + session, WT_VERB_COMPACT, "pages skipped: %" PRIu64, block->compact_pages_skipped); + __wt_verbose( + session, WT_VERB_COMPACT, "pages written: %" PRIu64, block->compact_pages_written); + } + + __wt_verbose(session, WT_VERB_COMPACT, "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX + "%% space available %" PRIuMAX "MB (%" PRIuMAX ")", + (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size, + ((uintmax_t)el->bytes * 100) / (uintmax_t)size, (uintmax_t)el->bytes / WT_MEGABYTE, + (uintmax_t)el->bytes); + + if (el->entries == 0) + return; + + /* + * Bucket the available memory into file deciles/percentiles. Large pieces of memory will cross + * over multiple buckets, assign to the decile/percentile in 512B chunks. + */ + memset(decile, 0, sizeof(decile)); + memset(percentile, 0, sizeof(percentile)); + WT_EXT_FOREACH (ext, el->off) + for (i = 0; i < ext->size / 512; ++i) { + ++decile[((ext->off + (wt_off_t)i * 512) * 10) / size]; + ++percentile[((ext->off + (wt_off_t)i * 512) * 100) / size]; + } #ifdef __VERBOSE_OUTPUT_PERCENTILE - /* - * The verbose output always displays 10% buckets, running this code - * as well also displays 1% buckets. - */ - for (i = 0; i < WT_ELEMENTS(percentile); ++i) { - v = percentile[i] * 512; - __wt_verbose(session, WT_VERB_COMPACT, - "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %" - PRIuMAX "%%)", - i, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v, - (uintmax_t)((v * 100) / (wt_off_t)el->bytes)); - } + /* + * The verbose output always displays 10% buckets, running this code as well also displays 1% + * buckets. + */ + for (i = 0; i < WT_ELEMENTS(percentile); ++i) { + v = percentile[i] * 512; + __wt_verbose(session, WT_VERB_COMPACT, + "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %" PRIuMAX "%%)", i, (uintmax_t)v / WT_MEGABYTE, + (uintmax_t)v, (uintmax_t)((v * 100) / (wt_off_t)el->bytes)); + } #endif - for (i = 0; i < WT_ELEMENTS(decile); ++i) { - v = decile[i] * 512; - __wt_verbose(session, WT_VERB_COMPACT, - "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %" - PRIuMAX "%%)", - i * 10, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v, - (uintmax_t)((v * 100) / (wt_off_t)el->bytes)); - } + for (i = 0; i < WT_ELEMENTS(decile); ++i) { + v = decile[i] * 512; + __wt_verbose(session, WT_VERB_COMPACT, + "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %" PRIuMAX "%%)", i * 10, + (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v, (uintmax_t)((v * 100) / (wt_off_t)el->bytes)); + } } |