diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-07-26 14:46:53 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2019-07-26 14:46:53 +1000 |
commit | 1cfa8b2ad24612dbf958eb19103652bec8ecc679 (patch) | |
tree | 2500de03f28c4a5d17f15d00080b04ca498812e6 | |
parent | 61f5c52e235e1269d6111d05e864dd99a46b20d6 (diff) | |
download | mongo-1cfa8b2ad24612dbf958eb19103652bec8ecc679.tar.gz |
Import wiredtiger: b61a3e2a88220f2c84073ff1cbc4d6883aa26c5b from branch mongodb-4.0
ref: 4a3194b043..b61a3e2a88
for: 4.0.12
WT-4869 Stop adding cache pressure when eviction is falling behind
WT-4878 Disable random dhandle selection and fine tune eviction target calculations
WT-4881 Soften the restrictions on re-entering reconciliation
WT-4893 Fix a race between internal page child-page eviction checks and cursors in the tree
WT-4913 Fix the Windows CRC32 on blocks that aren't 8B aligned and/or multiples of 8B
22 files changed, 407 insertions, 127 deletions
diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index 2646d51378e..4c522a94ef6 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -533,6 +533,14 @@ t = env.Program("wtperf", [ LIBS=[wtlib, shim, testutil] + wtlibs) Default(t) +t = env.Program('wt2695_checksum', ['test/csuite/wt2695_checksum/main.c'], + LIBS=[wtlib, shim, testutil] + wtlibs) +Default(t) + +t = env.Program('wt4117_checksum', ['test/csuite/wt4117_checksum/main.c'], + LIBS=[wtlib, shim, testutil] + wtlibs) +Default(t) + #Build the Examples for ex in examples: exp = env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib, shim, testutil] + wtlibs) diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 036b1a8b1a9..ecdb75497b1 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -53,6 +53,7 @@ src/checksum/power8/crc32.sx POWERPC_HOST src/checksum/power8/crc32_wrapper.c POWERPC_HOST src/checksum/software/checksum.c src/checksum/x86/crc32-x86.c X86_HOST +src/checksum/x86/crc32-x86-alt.c X86_HOST src/checksum/zseries/crc32-s390x.c ZSERIES_HOST src/checksum/zseries/crc32le-vx.sx ZSERIES_HOST src/config/config.c diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 4ed32778cbb..89a44ecf70b 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -46,6 +46,7 @@ WT_PACKED_STRUCT_END WT_PADDING_CHECK WT_PREPARE_INIT WT_READ_BARRIER +WT_REF_SAVE_STATE_MAX WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT WT_SESSION_LOCKED_TABLE_READ diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index a6cf0bc879e..d7f8111c161 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "4a3194b043b8cffb5339c12e1554d0bd42ed1b1f", 
+ "commit": "b61a3e2a88220f2c84073ff1cbc4d6883aa26c5b", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.0" diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 0d60e751c29..39061fff7b1 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -319,7 +319,8 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_BLOCK_DESC *desc; WT_DECL_ITEM(buf); WT_DECL_RET; - uint32_t checksum_calculate, checksum_tmp; + uint32_t checksum_saved, checksum_tmp; + bool checksum_matched; /* If in-memory, we don't read or write the descriptor structure. */ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) @@ -340,10 +341,14 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) * a calculated checksum that should match the checksum in the header. */ desc = buf->mem; - checksum_tmp = desc->checksum; + checksum_saved = checksum_tmp = desc->checksum; +#ifdef WORDS_BIGENDIAN + checksum_tmp = __wt_bswap32(checksum_tmp); +#endif desc->checksum = 0; - checksum_calculate = __wt_checksum(desc, block->allocsize); - desc->checksum = checksum_tmp; + checksum_matched = + __wt_checksum_match(desc, block->allocsize, checksum_tmp); + desc->checksum = checksum_saved; __wt_block_desc_byteswap(desc); /* @@ -355,8 +360,7 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) * may have entered the wrong file name, and is now frantically pounding * their interrupt key. 
*/ - if (desc->magic != WT_BLOCK_MAGIC || - desc->checksum != checksum_calculate) + if (desc->magic != WT_BLOCK_MAGIC || !checksum_matched) WT_ERR_MSG(session, WT_ERROR, "%s does not appear to be a WiredTiger file", block->name); diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index e190161fbd3..bcf6817e2da 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -224,7 +224,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, { WT_BLOCK_HEADER *blk, swap; size_t bufsize; - uint32_t page_checksum; __wt_verbose(session, WT_VERB_READ, "off %" PRIuMAX ", size %" PRIu32 ", checksum %" PRIu32, @@ -261,10 +260,9 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, __wt_block_header_byteswap_copy(blk, &swap); if (swap.checksum == checksum) { blk->checksum = 0; - page_checksum = __wt_checksum(buf->mem, + if (__wt_checksum_match(buf->mem, F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ? - size : WT_BLOCK_COMPRESS_SKIP); - if (page_checksum == checksum) { + size : WT_BLOCK_COMPRESS_SKIP, checksum)) { /* * Swap the page-header as needed; this doesn't belong * here, but it's the best place to catch all callers. 
@@ -277,10 +275,8 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, __wt_errx(session, "%s: read checksum error for %" PRIu32 "B block at " "offset %" PRIuMAX ": calculated block checksum " - "of %" PRIu32 " doesn't match expected checksum " - "of %" PRIu32, - block->name, - size, (uintmax_t)offset, page_checksum, checksum); + " doesn't match expected checksum", + block->name, size, (uintmax_t)offset); } else if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) __wt_errx(session, diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index e7ad4b9143a..4f584b087ff 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -289,7 +289,7 @@ __wt_free_ref( __wt_free(session, ref->page_del); } - __wt_overwrite_and_free(session, ref); + __wt_overwrite_and_free_len(session, ref, WT_REF_CLEAR_SIZE); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 87f47f20aeb..9a23a26ea56 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -689,7 +689,8 @@ read: /* * we "acquire" it. */ wont_need = LF_ISSET(WT_READ_WONT_NEED) || - F_ISSET(session, WT_SESSION_READ_WONT_NEED); + F_ISSET(session, WT_SESSION_READ_WONT_NEED) || + F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) diff --git a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c new file mode 100644 index 00000000000..49b578ccc66 --- /dev/null +++ b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c @@ -0,0 +1,121 @@ +/*- + * Public Domain 2014-2019 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <wiredtiger_config.h> +/* + * Here be dragons! + * The intrin.h include file is in a different place for the 4.0 release of + * MongoDB than it is for the 4.2 release. There was a bug in the 4.0 release + * which caused checksums to be invalid if they weren't 8 byte aligned and a + * multiple of 8 bytes long. That bug is fixed by including the intrin.h header + * file. + * + * We want to keep writing the old format when running in MongoDB 4.0 so + * users can downgrade and do point release upgrades without concern about data + * format changes. We also want users to be able to downgrade from 4.2 to 4.0 + * without pain. We achieve that by including this fixed version of the checksum + * implementation. It works because when a checksum fails we automatically try + * the alternative checksum. 
In the case of 4.0 the alternative is the correct + * checksum. In the case of 4.2 the alternative is the old style checksum - i.e. + * it's important that intrin.h is included in this file in 4.0 and in the + * non-alternate checksum implementation in 4.2. + */ +#if defined(_M_AMD64) +#include <intrin.h> +#endif +#include <inttypes.h> +#include <stdbool.h> +#include <stddef.h> + +/* + * The hardware-accelerated checksum code that originally shipped on Windows + * did not correctly handle memory that wasn't 8B aligned and a multiple of 8B. + * It's likely that calculations were always 8B aligned, but there's some risk. + * + * What we do is always write the correct checksum, and if a checksum test + * fails, check it against the alternate version we have before failing. + */ + +#if defined(_M_AMD64) && !defined(HAVE_NO_CRC32_HARDWARE) +/* + * __checksum_alt -- + * Return a checksum for a chunk of memory, computed in hardware + * using 8 byte steps. + */ +static uint32_t +__checksum_alt(const void *chunk, size_t len) +{ + uint32_t crc; + size_t nqwords; + const uint8_t *p; + const uint64_t *p64; + + crc = 0xffffffff; + + /* Checksum one byte at a time to the first 4B boundary. */ + for (p = chunk; + ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 && + len > 0; ++p, --len) { + crc = _mm_crc32_u8(crc, *p); + } + + p64 = (const uint64_t *)p; + /* Checksum in 8B chunks. */ + for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) { + crc = (uint32_t)_mm_crc32_u64(crc, *p64); + p64++; + } + + /* Checksum trailing bytes one byte at a time. */ + p = (const uint8_t *)p64; + for (len &= 0x7; len > 0; ++p, len--) { + crc = _mm_crc32_u8(crc, *p); + } + + return (~crc); +} + +/* + * __wt_checksum_alt_match -- + * Return if a checksum matches the alternate calculation. 
+ */ +bool +__wt_checksum_alt_match(const void *chunk, size_t len, uint32_t v) +{ + int cpuInfo[4]; + + __cpuid(cpuInfo, 1); + + #define CPUID_ECX_HAS_SSE42 (1 << 20) + if (cpuInfo[2] & CPUID_ECX_HAS_SSE42) + return (__checksum_alt(chunk, len) == v); + + return (false); +} + +#endif diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index f40ed758a19..8930ae19944 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -578,6 +578,7 @@ __evict_update_work(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; double dirty_target, dirty_trigger, target, trigger; uint64_t bytes_inuse, bytes_max, dirty_inuse; + uint32_t flags; conn = S2C(session); cache = conn->cache; @@ -587,14 +588,16 @@ __evict_update_work(WT_SESSION_IMPL *session) target = cache->eviction_target; trigger = cache->eviction_trigger; - /* Clear previous state. */ - cache->flags = 0; + /* Build up the new state. 
*/ + flags = 0; - if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) { + cache->flags = 0; return (false); + } if (!__evict_queue_empty(cache->evict_urgent_queue, false)) - F_SET(cache, WT_CACHE_EVICT_URGENT); + LF_SET(WT_CACHE_EVICT_URGENT); if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) { WT_ASSERT(session, @@ -613,32 +616,38 @@ __evict_update_work(WT_SESSION_IMPL *session) bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); if (__wt_eviction_clean_needed(session, NULL)) - F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); + LF_SET(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); else if (bytes_inuse > (target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_CLEAN); + LF_SET(WT_CACHE_EVICT_CLEAN); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); if (__wt_eviction_dirty_needed(session, NULL)) - F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); + LF_SET(WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); else if (dirty_inuse > (uint64_t)(dirty_target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_DIRTY); + LF_SET(WT_CACHE_EVICT_DIRTY); /* * If application threads are blocked by the total volume of data in * cache, try dirty pages as well. */ if (__wt_cache_aggressive(session) && - F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) - F_SET(cache, WT_CACHE_EVICT_DIRTY); + LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD)) + LF_SET(WT_CACHE_EVICT_DIRTY); + + /* When we stop looking for dirty pages, reduce the lookaside score. */ + if (!LF_ISSET(WT_CACHE_EVICT_DIRTY)) + __wt_cache_update_lookaside_score(session, 1, 0); /* * Scrub dirty pages and keep them in cache if we are less than half * way to the clean or dirty trigger. 
*/ - if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200 && - dirty_inuse < - (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200) - F_SET(cache, WT_CACHE_EVICT_SCRUB); + if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200) { + if (dirty_inuse < (uint64_t) + ((dirty_target + dirty_trigger) * bytes_max) / 200) + LF_SET(WT_CACHE_EVICT_SCRUB); + } else + LF_SET(WT_CACHE_EVICT_NOKEEP); /* * Try lookaside evict when: @@ -651,20 +660,23 @@ __evict_update_work(WT_SESSION_IMPL *session) (__wt_cache_lookaside_score(cache) > 80 && dirty_inuse > (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200)) - F_SET(cache, WT_CACHE_EVICT_LOOKASIDE); + LF_SET(WT_CACHE_EVICT_LOOKASIDE); /* * With an in-memory cache, we only do dirty eviction in order to scrub * pages. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { - if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) - F_SET(cache, WT_CACHE_EVICT_DIRTY); - if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) - F_SET(cache, WT_CACHE_EVICT_DIRTY_HARD); - F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); + if (LF_ISSET(WT_CACHE_EVICT_CLEAN)) + LF_SET(WT_CACHE_EVICT_DIRTY); + if (LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD)) + LF_SET(WT_CACHE_EVICT_DIRTY_HARD); + LF_CLR(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); } + /* Update the global eviction state. */ + cache->flags = flags; + return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT)); } @@ -1623,22 +1635,14 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Calculate how many pages to queue for a given tree. 
*/ static uint32_t -__evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) +__evict_walk_target(WT_SESSION_IMPL *session) { WT_CACHE *cache; uint64_t btree_inuse, bytes_per_slot, cache_inuse; uint32_t target_pages_clean, target_pages_dirty, target_pages; - uint32_t total_slots; cache = S2C(session)->cache; target_pages_clean = target_pages_dirty = 0; - total_slots = max_entries; - - /* - * The number of times we should fill the queue by the end of - * considering all trees. - */ -#define QUEUE_FILLS_PER_PASS 10 /* * The minimum number of pages we should consider per tree. @@ -1654,7 +1658,7 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); - bytes_per_slot = 1 + cache_inuse / total_slots; + bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } @@ -1662,20 +1666,12 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); cache_inuse = __wt_cache_dirty_leaf_inuse(cache); - bytes_per_slot = 1 + cache_inuse / total_slots; + bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } - /* - * Weight the number of target pages by the number of times we want to - * fill the cache per pass through all the trees. Note that we don't - * build this into the calculation above because we don't want to favor - * small trees, so round to a whole number of slots (zero for small - * trees) before multiplying. 
- */ - target_pages = WT_MAX(target_pages_clean, target_pages_dirty) * - QUEUE_FILLS_PER_PASS; + target_pages = WT_MAX(target_pages_clean, target_pages_dirty); /* * Walk trees with a small fraction of the cache in case there are so @@ -1739,12 +1735,10 @@ __evict_walk_tree(WT_SESSION_IMPL *session, start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; if (btree->evict_walk_progress >= btree->evict_walk_target) { - btree->evict_walk_target = - __evict_walk_target(session, max_entries); + btree->evict_walk_target = __evict_walk_target(session); btree->evict_walk_progress = 0; } - target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS, - btree->evict_walk_target - btree->evict_walk_progress); + target_pages = btree->evict_walk_target - btree->evict_walk_progress; if (target_pages > remaining_slots) target_pages = remaining_slots; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 2510815401f..b8c0a88a966 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -470,11 +470,46 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) WT_REF *child; bool active; + /* + * There may be cursors in the tree walking the list of child pages. + * The parent is locked, so all we care about is cursors already in the + * child pages, no thread can enter them. Any cursor moving through the + * child pages must be hazard pointer coupling between pages, where the + * page on which it currently has a hazard pointer must be in a state + * other than on-disk. Walk the child list forward, then backward, to + * ensure we don't race with a cursor walking in the opposite direction + * from our check. 
+ */ + WT_INTL_FOREACH_BEGIN(session, parent->page, child) { + switch (child->state) { + case WT_REF_DISK: /* On-disk */ + case WT_REF_DELETED: /* On-disk, deleted */ + case WT_REF_LOOKASIDE: /* On-disk, lookaside */ + break; + default: + return (__wt_set_return(session, EBUSY)); + } + } WT_INTL_FOREACH_END; + WT_INTL_FOREACH_REVERSE_BEGIN(session, parent->page, child) { + switch (child->state) { + case WT_REF_DISK: /* On-disk */ + case WT_REF_DELETED: /* On-disk, deleted */ + case WT_REF_LOOKASIDE: /* On-disk, lookaside */ + break; + default: + return (__wt_set_return(session, EBUSY)); + } + } WT_INTL_FOREACH_END; + + /* + * The fast check is done and there are no cursors in the child pages. + * Make sure the child WT_REF structures pages can be discarded. + */ WT_INTL_FOREACH_BEGIN(session, parent->page, child) { switch (child->state) { case WT_REF_DISK: /* On-disk */ break; - case WT_REF_DELETED: /* Deleted */ + case WT_REF_DELETED: /* On-disk, deleted */ /* * If the child page was part of a truncate, * transaction rollback might switch this page into its @@ -498,7 +533,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) if (active) return (__wt_set_return(session, EBUSY)); break; - case WT_REF_LOOKASIDE: + case WT_REF_LOOKASIDE: /* On-disk, lookaside */ /* * If the lookaside history is obsolete, the reference * can be ignored. @@ -520,9 +555,8 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) * for conditions that would block its eviction. 
*/ static int -__evict_review( - WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, - bool *inmem_splitp) +__evict_review(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t evict_flags, bool *inmem_splitp) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 9859b3b607a..994633e0879 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -578,6 +578,14 @@ struct __wt_page { for (__refp = __pindex->index, \ __entries = __pindex->entries; __entries > 0; --__entries) {\ (ref) = *__refp++; +#define WT_INTL_FOREACH_REVERSE_BEGIN(session, page, ref) do { \ + WT_PAGE_INDEX *__pindex; \ + WT_REF **__refp; \ + uint32_t __entries; \ + WT_INTL_INDEX_GET(session, page, __pindex); \ + for (__refp = __pindex->index + __pindex->entries, \ + __entries = __pindex->entries; __entries > 0; --__entries) {\ + (ref) = *--__refp; #define WT_INTL_FOREACH_END \ } \ } while (0) @@ -880,43 +888,51 @@ struct __wt_ref { WT_PAGE_DELETED *page_del; /* Deleted page information */ WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */ -/* A macro wrapper allowing us to remember the callers code location */ -#define WT_REF_CAS_STATE(session, ref, old_state, new_state) \ - __wt_ref_cas_state_int((session), (ref), (old_state), (new_state),\ - __FILE__, __LINE__) + /* + * In DIAGNOSTIC mode we overwrite the WT_REF on free to force failures. + * Don't clear the history in that case. + */ +#define WT_REF_CLEAR_SIZE (offsetof(WT_REF, hist)) + +#define WT_REF_SAVE_STATE_MAX 3 #ifdef HAVE_DIAGNOSTIC /* Capture history of ref state changes. 
*/ struct __wt_ref_hist { WT_SESSION_IMPL *session; const char *name; - const char *file; - int line; - uint32_t state; - } hist[3]; + const char *func; + uint16_t line; + uint16_t state; + } hist[WT_REF_SAVE_STATE_MAX]; uint64_t histoff; #define WT_REF_SAVE_STATE(ref, s, f, l) do { \ (ref)->hist[(ref)->histoff].session = session; \ (ref)->hist[(ref)->histoff].name = session->name; \ - (ref)->hist[(ref)->histoff].file = (f); \ - (ref)->hist[(ref)->histoff].line = (l); \ - (ref)->hist[(ref)->histoff].state = s; \ + (ref)->hist[(ref)->histoff].func = (f); \ + (ref)->hist[(ref)->histoff].line = (uint16_t)(l); \ + (ref)->hist[(ref)->histoff].state = (uint16_t)(s); \ (ref)->histoff = \ ((ref)->histoff + 1) % WT_ELEMENTS((ref)->hist); \ } while (0) #define WT_REF_SET_STATE(ref, s) do { \ - WT_REF_SAVE_STATE(ref, s, __FILE__, __LINE__); \ + WT_REF_SAVE_STATE(ref, s, __func__, __LINE__); \ WT_PUBLISH((ref)->state, s); \ } while (0) #else #define WT_REF_SET_STATE(ref, s) WT_PUBLISH((ref)->state, s) #endif + +/* A macro wrapper allowing us to remember the callers code location */ +#define WT_REF_CAS_STATE(session, ref, old_state, new_state) \ + __wt_ref_cas_state_int( \ + session, ref, old_state, new_state, __func__, __LINE__) }; /* * WT_REF_SIZE is the expected structure size -- we verify the build to ensure * the compiler hasn't inserted padding which would break the world. 
*/ #ifdef HAVE_DIAGNOSTIC -#define WT_REF_SIZE (56 + 3 * sizeof(WT_REF_HIST) + 8) +#define WT_REF_SIZE (56 + WT_REF_SAVE_STATE_MAX * sizeof(WT_REF_HIST) + 8) #else #define WT_REF_SIZE 56 #endif diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index c4c0ee5d5d4..a093022efa6 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -250,14 +250,15 @@ struct __wt_cache { uint32_t pool_flags; /* Cache pool flags */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */ -#define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */ -#define WT_CACHE_EVICT_DEBUG_MODE 0x04u /* Aggressive debugging mode */ -#define WT_CACHE_EVICT_DIRTY 0x08u /* Evict dirty pages */ -#define WT_CACHE_EVICT_DIRTY_HARD 0x10u /* Dirty % blocking app threads */ -#define WT_CACHE_EVICT_LOOKASIDE 0x20u /* Try lookaside eviction */ -#define WT_CACHE_EVICT_SCRUB 0x40u /* Scrub dirty pages */ -#define WT_CACHE_EVICT_URGENT 0x80u /* Pages are in the urgent queue */ +#define WT_CACHE_EVICT_CLEAN 0x001u /* Evict clean pages */ +#define WT_CACHE_EVICT_CLEAN_HARD 0x002u /* Clean % blocking app threads */ +#define WT_CACHE_EVICT_DEBUG_MODE 0x004u /* Aggressive debugging mode */ +#define WT_CACHE_EVICT_DIRTY 0x008u /* Evict dirty pages */ +#define WT_CACHE_EVICT_DIRTY_HARD 0x010u /* Dirty % blocking app threads */ +#define WT_CACHE_EVICT_LOOKASIDE 0x020u /* Try lookaside eviction */ +#define WT_CACHE_EVICT_NOKEEP 0x040u /* Don't add read pages to cache */ +#define WT_CACHE_EVICT_SCRUB 0x080u /* Scrub dirty pages */ +#define WT_CACHE_EVICT_URGENT 0x100u /* Pages are in the urgent queue */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY) uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/connection.h 
b/src/third_party/wiredtiger/src/include/connection.h index 73ac6c85522..eb2534f6cb3 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -23,7 +23,7 @@ struct __wt_process { double tsc_nsec_ratio; /* rdtsc ticks to nanoseconds */ bool use_epochtime; /* use expensive time */ - /* Checksum function */ + /* Checksum functions */ #define __wt_checksum(chunk, len) __wt_process.checksum(chunk, len) uint32_t (*checksum)(const void *, size_t); }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 1ecfaf6eef6..71660ab00ab 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -219,6 +219,7 @@ extern void __wt_las_remove_dropped(WT_SESSION_IMPL *session); extern int __wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len); +extern bool __wt_checksum_alt_match(const void *chunk, size_t len, uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_config_initn(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); extern void __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str); extern void __wt_config_subinit(WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item); diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index ab24e87f34a..4cab5bad9e1 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -278,3 +278,37 @@ __wt_timing_stress(WT_SESSION_IMPL *session, u_int flag) /* The default maximum delay is 1/10th of a second. 
*/ __wt_sleep(0, i * (WT_TIMING_STRESS_MAX_DELAY / 10)); } + +/* + * The hardware-accelerated checksum code that originally shipped on Windows + * did not correctly handle memory that wasn't 8B aligned and a multiple of 8B. + * It's likely that calculations were always 8B aligned, but there's some risk. + * + * What we do is always write the correct checksum, and if a checksum test + * fails, check it against the alternate version we have before failing. + */ + +#if defined(_M_AMD64) && !defined(HAVE_NO_CRC32_HARDWARE) +/* + * __wt_checksum_match -- + * Return if a checksum matches either the primary or alternate values. + */ +static inline bool +__wt_checksum_match(const void *chunk, size_t len, uint32_t v) +{ + return (__wt_checksum(chunk, len) == v || + __wt_checksum_alt_match(chunk, len, v)); +} + +#else + +/* + * __wt_checksum_match -- + * Return if a checksum matches. + */ +static inline bool +__wt_checksum_match(const void *chunk, size_t len, uint32_t v) +{ + return (__wt_checksum(chunk, len) == v); +} +#endif diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index de10e8c44b9..3fd7b6cfc59 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -21,13 +21,13 @@ typedef enum { */ static inline bool __wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref, - uint32_t old_state, uint32_t new_state, const char *file, int line) + uint32_t old_state, uint32_t new_state, const char *func, int line) { bool cas_result; /* Parameters that are used in a macro for diagnostic builds */ WT_UNUSED(session); - WT_UNUSED(file); + WT_UNUSED(func); WT_UNUSED(line); cas_result = __wt_atomic_casv32(&ref->state, old_state, new_state); @@ -39,7 +39,7 @@ __wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref, * updated. 
*/ if (cas_result) - WT_REF_SAVE_STATE(ref, new_state, file, line); + WT_REF_SAVE_STATE(ref, new_state, func, line); #endif return (cas_result); } diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 1963a3770fc..de6b806ca91 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -42,21 +42,21 @@ __wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...) * Given a log record, return whether the checksum matches. */ static bool -__log_checksum_match(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t reclen) +__log_checksum_match(WT_ITEM *buf, uint32_t reclen) { WT_LOG_RECORD *logrec; - uint32_t checksum_calculate, checksum_tmp; + uint32_t checksum_saved, checksum_tmp; + bool checksum_matched; - WT_UNUSED(session); - logrec = (WT_LOG_RECORD *)buf->mem; - checksum_tmp = logrec->checksum; - logrec->checksum = 0; - checksum_calculate = __wt_checksum(logrec, reclen); + logrec = buf->mem; + checksum_saved = checksum_tmp = logrec->checksum; #ifdef WORDS_BIGENDIAN - checksum_calculate = __wt_bswap32(checksum_calculate); + checksum_tmp = __wt_bswap32(checksum_tmp); #endif - logrec->checksum = checksum_tmp; - return (logrec->checksum == checksum_calculate); + logrec->checksum = 0; + checksum_matched = __wt_checksum_match(logrec, reclen, checksum_tmp); + logrec->checksum = checksum_saved; + return (checksum_matched); } /* @@ -1093,7 +1093,7 @@ __log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, goto err; } - if (!__log_checksum_match(session, buf, allocsize)) + if (!__log_checksum_match(buf, allocsize)) WT_ERR_MSG(session, WT_ERROR, "%s: System log record checksum mismatch", fh->name); __wt_log_record_byteswap(logrec); @@ -2543,7 +2543,7 @@ advance: */ buf->size = reclen; logrec = (WT_LOG_RECORD *)buf->mem; - if (!__log_checksum_match(session, buf, reclen)) { + if (!__log_checksum_match(buf, reclen)) { /* * A checksum mismatch means we have reached the end 
of * the useful part of the log. This should be found on diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 1c873fc3d8a..f0692f5de50 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -26,6 +26,8 @@ static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __reconcile(WT_SESSION_IMPL *, + WT_REF *, WT_SALVAGE_COOKIE *, uint32_t, bool *, bool *); /* * __wt_reconcile -- @@ -35,19 +37,15 @@ int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) { - WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; - WT_RECONCILE *r; - uint64_t oldest_id; + bool no_reconcile_set, page_locked; - btree = S2BT(session); - page = ref->page; - mod = page->modify; if (lookaside_retryp != NULL) *lookaside_retryp = false; + page = ref->page; + __wt_verbose(session, WT_VERB_RECONCILE, "%p reconcile %s (%s%s%s)", (void *)ref, __wt_page_type_string(page->type), @@ -76,10 +74,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, LF_ISSET(WT_REC_VISIBLE_ALL) || F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); - /* We shouldn't get called with a clean page, that's an error. */ + /* It's an error to be called with a clean page. */ WT_ASSERT(session, __wt_page_is_modified(page)); /* + * Reconciliation acquires and releases pages, and in rare cases that + * page release triggers eviction. If the page is dirty, eviction can + * trigger reconciliation, and we re-enter this code. Reconciliation + * isn't re-entrant, so we need to ensure that doesn't happen. 
+ */ + no_reconcile_set = F_ISSET(session, WT_SESSION_NO_RECONCILE); + F_SET(session, WT_SESSION_NO_RECONCILE); + + /* * Reconciliation locks the page for three reasons: * Reconciliation reads the lists of page updates, obsolete updates * cannot be discarded while reconciliation is in progress; @@ -89,6 +96,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * a child page splitting during the reconciliation. */ WT_PAGE_LOCK(session, page); + page_locked = true; /* * Now that the page is locked, if attempting to evict it, check again @@ -96,20 +104,37 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * while we were waiting to acquire the lock (e.g., the page could have * split). */ - if (LF_ISSET(WT_REC_EVICT) && - !__wt_page_can_evict(session, ref, NULL)) { - WT_PAGE_UNLOCK(session, page); - return (__wt_set_return(session, EBUSY)); - } + if (LF_ISSET(WT_REC_EVICT) && !__wt_page_can_evict(session, ref, NULL)) + WT_ERR(__wt_set_return(session, EBUSY)); - /* Initialize the reconciliation structure for each new run. */ - if ((ret = __rec_init( - session, ref, flags, salvage, &session->reconcile)) != 0) { + /* + * Reconcile the page. The reconciliation code unlocks the page as soon + * as possible, and returns that information. + */ + ret = __reconcile(session, ref, + salvage, flags, lookaside_retryp, &page_locked); + +err: + if (page_locked) WT_PAGE_UNLOCK(session, page); - return (ret); - } - r = session->reconcile; + if (!no_reconcile_set) + F_CLR(session, WT_SESSION_NO_RECONCILE); + return (ret); +} +/* + * __reconcile_save_evict_state -- + * Save the transaction state that causes history to be pinned, whether + * reconciliation succeeds or fails. 
+ */ +static void +__reconcile_save_evict_state( + WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +{ + WT_PAGE_MODIFY *mod; + uint64_t oldest_id; + + mod = ref->page->modify; oldest_id = __wt_txn_oldest_id(session); /* @@ -135,6 +160,32 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id)); mod->last_oldest_id = oldest_id; #endif +} + +/* + * __reconcile -- + * Reconcile an in-memory page into its on-disk format, and write it. + */ +static int +__reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, + uint32_t flags, bool *lookaside_retryp, bool *page_lockedp) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_RECONCILE *r; + + btree = S2BT(session); + page = ref->page; + mod = page->modify; + + /* Save the eviction state. */ + __reconcile_save_evict_state(session, ref, flags); + + /* Initialize the reconciliation structure for each new run. */ + WT_RET(__rec_init(session, ref, flags, salvage, &session->reconcile)); + r = session->reconcile; /* Reconcile the page. */ switch (page->type) { @@ -189,6 +240,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, S2C(session)->txn_global.stable_timestamp; /* Release the reconciliation lock. */ + *page_lockedp = false; WT_PAGE_UNLOCK(session, page); /* Update statistics. */ @@ -520,7 +572,16 @@ __rec_init(WT_SESSION_IMPL *session, btree = S2BT(session); page = ref->page; - if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { + /* + * Reconciliation is not re-entrant, make sure that doesn't happen. Our + * caller sets WT_SESSION_IMPL.WT_SESSION_NO_RECONCILE to prevent it, + * but it's been a problem in the past, check to be sure. 
+ */ + r = *(WT_RECONCILE **)reconcilep; + if (r != NULL && r->ref != NULL) + WT_RET_MSG(session, WT_ERROR, "reconciliation re-entered"); + + if (r == NULL) { WT_RET(__wt_calloc_one(session, &r)); *(WT_RECONCILE **)reconcilep = r; @@ -535,9 +596,6 @@ __rec_init(WT_SESSION_IMPL *session, F_SET(&r->chunkB.image, WT_ITEM_ALIGNED); } - /* Reconciliation is not re-entrant, make sure that doesn't happen. */ - WT_ASSERT(session, r->ref == NULL); - /* Remember the configuration. */ r->ref = ref; r->page = page; diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c index 76571b03278..d9cc137200d 100644 --- a/src/third_party/wiredtiger/src/support/global.c +++ b/src/third_party/wiredtiger/src/support/global.c @@ -117,6 +117,10 @@ __wt_global_once(void) return; } + /* + * Set up the checksum functions. If there's only one, set it as the + * alternate, that way code doesn't have to check if it's set or not. + */ __wt_process.checksum = wiredtiger_crc32c_func(); __global_calibrate_ticks(); diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c index 52f77b5b232..bf77fc7c42e 100644 --- a/src/third_party/wiredtiger/src/support/hazard.c +++ b/src/third_party/wiredtiger/src/support/hazard.c @@ -330,6 +330,10 @@ __wt_hazard_check(WT_SESSION_IMPL *session, WT_SESSION_IMPL *s; uint32_t i, j, hazard_inuse, max, session_cnt, walk_cnt; + /* If a file can never be evicted, hazard pointers aren't required. 
*/ + if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) + return (NULL); + conn = S2C(session); WT_STAT_CONN_INCR(session, cache_hazard_checks); diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c index 2c8140f8f26..edf3fed1c28 100644 --- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c @@ -139,8 +139,10 @@ op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp) /* Close the cursor half the time, otherwise cache it. */ if (__wt_random(rnd) % 2 == 0) testutil_check(cursor->close(cursor)); - else + else { + testutil_check(cursor->reset(cursor)); *cpp = cursor; + } (void)__wt_atomic_add64(&worker, 1); } |