diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-10-01 16:41:41 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2019-10-01 16:41:41 +1000 |
commit | 86baddd406874c78cde915a8598999ff3f0ea70c (patch) | |
tree | 50b946d6421bce6f02f37047a255e06a8b704fb2 | |
parent | 34217d3b595b172180603e48e17421a330e04a81 (diff) | |
download | mongo-86baddd406874c78cde915a8598999ff3f0ea70c.tar.gz |
Import wiredtiger: 4c72feeb921607b30984301f4e007fc24b54e26b from branch mongodb-3.6
ref: 04447c57d5..4c72feeb92
for: 3.6.15
WT-4869 Stop adding cache pressure when eviction is falling behind
WT-4881 Soften the restrictions on re-entering reconciliation
WT-4893 Fix a race between internal page child-page eviction checks and cursors in the tree
WT-4898 Don't allow the eviction server to reconcile if it's busy
WT-4956 Handle the case where 4 billion updates are made to a page without eviction
WT-4957 Revert part of a change about when pages are queued for urgent eviction
WT-5050 Assertion failure during urgent eviction of metadata page
-rw-r--r-- | src/third_party/wiredtiger/dist/s_string.ok | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_debug.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_read.c | 3 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_sync.c | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_lru.c | 52 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_page.c | 39 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btmem.h | 25 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree.i | 68 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/cache.h | 15 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/serial.i | 32 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 145 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/support/hazard.c | 4 |
13 files changed, 266 insertions, 127 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 8b3110e87c8..b3b95a4e50a 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1302,6 +1302,7 @@ unmodify unordered unpackv unpadded +unreconciled unreferenced unregister unsized diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index b16f253e705..602318b9214 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "04447c57d565903849f0797fa391cd60f5fc7992", + "commit": "4c72feeb921607b30984301f4e007fc24b54e26b", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 47b84ad7a25..319b9357bd7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -883,7 +883,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) if (split_gen != 0) WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen)); if (mod != NULL) - WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen)); + WT_RET(ds->f(ds, ", page-state=%" PRIu32, mod->page_state)); WT_RET(ds->f(ds, ", memory-size %" WT_SIZET_FMT, page->memory_footprint)); WT_RET(ds->f(ds, "\n")); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index bbf60aabc20..69b0f95d205 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -689,7 +689,8 @@ read: /* * we "acquire" it. 
*/ wont_need = LF_ISSET(WT_READ_WONT_NEED) || - F_ISSET(session, WT_SESSION_READ_WONT_NEED); + F_ISSET(session, WT_SESSION_READ_WONT_NEED) || + F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index c5c08faa701..6a973335541 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -324,9 +324,14 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * cache clean but with history that cannot be * discarded), that is not wasted effort because * checkpoint doesn't need to write the page again. + * + * Once the transaction has given up its snapshot it + * is no longer safe to reconcile pages. That happens + * prior to the final metadata checkpoint. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && + F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT) && !tried_eviction) { WT_ERR_BUSY_OK( __wt_page_release_evict(session, walk)); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index ff3772533ae..bb310369d82 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -559,6 +559,7 @@ __evict_update_work(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; double dirty_target, dirty_trigger, target, trigger; uint64_t bytes_inuse, bytes_max, dirty_inuse; + uint32_t flags; conn = S2C(session); cache = conn->cache; @@ -568,14 +569,16 @@ __evict_update_work(WT_SESSION_IMPL *session) target = cache->eviction_target; trigger = cache->eviction_trigger; - /* Clear previous state. */ - cache->flags = 0; + /* Build up the new state. 
*/ + flags = 0; - if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) { + cache->flags = 0; return (false); + } if (!__evict_queue_empty(cache->evict_urgent_queue, false)) - F_SET(cache, WT_CACHE_EVICT_URGENT); + LF_SET(WT_CACHE_EVICT_URGENT); if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) { WT_ASSERT(session, @@ -594,32 +597,38 @@ __evict_update_work(WT_SESSION_IMPL *session) bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); if (__wt_eviction_clean_needed(session, NULL)) - F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); + LF_SET(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); else if (bytes_inuse > (target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_CLEAN); + LF_SET(WT_CACHE_EVICT_CLEAN); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); if (__wt_eviction_dirty_needed(session, NULL)) - F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); + LF_SET(WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); else if (dirty_inuse > (uint64_t)(dirty_target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_DIRTY); + LF_SET(WT_CACHE_EVICT_DIRTY); /* * If application threads are blocked by the total volume of data in * cache, try dirty pages as well. */ if (__wt_cache_aggressive(session) && - F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) - F_SET(cache, WT_CACHE_EVICT_DIRTY); + LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD)) + LF_SET(WT_CACHE_EVICT_DIRTY); + + /* When we stop looking for dirty pages, reduce the lookaside score. */ + if (!LF_ISSET(WT_CACHE_EVICT_DIRTY)) + __wt_cache_update_lookaside_score(session, 1, 0); /* * Scrub dirty pages and keep them in cache if we are less than half * way to the clean or dirty trigger. 
*/ - if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200 && - dirty_inuse < - (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200) - F_SET(cache, WT_CACHE_EVICT_SCRUB); + if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200) { + if (dirty_inuse < (uint64_t) + ((dirty_target + dirty_trigger) * bytes_max) / 200) + LF_SET(WT_CACHE_EVICT_SCRUB); + } else + LF_SET(WT_CACHE_EVICT_NOKEEP); /* * Try lookaside evict when: @@ -632,20 +641,23 @@ __evict_update_work(WT_SESSION_IMPL *session) (__wt_cache_lookaside_score(cache) > 80 && dirty_inuse > (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200)) - F_SET(cache, WT_CACHE_EVICT_LOOKASIDE); + LF_SET(WT_CACHE_EVICT_LOOKASIDE); /* * With an in-memory cache, we only do dirty eviction in order to scrub * pages. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { - if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) - F_SET(cache, WT_CACHE_EVICT_DIRTY); - if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) - F_SET(cache, WT_CACHE_EVICT_DIRTY_HARD); - F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); + if (LF_ISSET(WT_CACHE_EVICT_CLEAN)) + LF_SET(WT_CACHE_EVICT_DIRTY); + if (LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD)) + LF_SET(WT_CACHE_EVICT_DIRTY_HARD); + LF_CLR(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); } + /* Update the global eviction state. */ + cache->flags = flags; + return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT)); } diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 44c3bbb8f78..a3fbe66b4a9 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -461,11 +461,46 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) WT_REF *child; bool active; + /* + * There may be cursors in the tree walking the list of child pages. 
+ * The parent is locked, so all we care about is cursors already in the + * child pages, no thread can enter them. Any cursor moving through the + * child pages must be hazard pointer coupling between pages, where the + * page on which it currently has a hazard pointer must be in a state + * other than on-disk. Walk the child list forward, then backward, to + * ensure we don't race with a cursor walking in the opposite direction + * from our check. + */ + WT_INTL_FOREACH_BEGIN(session, parent->page, child) { + switch (child->state) { + case WT_REF_DISK: /* On-disk */ + case WT_REF_DELETED: /* On-disk, deleted */ + case WT_REF_LOOKASIDE: /* On-disk, lookaside */ + break; + default: + return (__wt_set_return(session, EBUSY)); + } + } WT_INTL_FOREACH_END; + WT_INTL_FOREACH_REVERSE_BEGIN(session, parent->page, child) { + switch (child->state) { + case WT_REF_DISK: /* On-disk */ + case WT_REF_DELETED: /* On-disk, deleted */ + case WT_REF_LOOKASIDE: /* On-disk, lookaside */ + break; + default: + return (__wt_set_return(session, EBUSY)); + } + } WT_INTL_FOREACH_END; + + /* + * The fast check is done and there are no cursors in the child pages. + * Make sure the child WT_REF structures pages can be discarded. + */ WT_INTL_FOREACH_BEGIN(session, parent->page, child) { switch (child->state) { case WT_REF_DISK: /* On-disk */ break; - case WT_REF_DELETED: /* Deleted */ + case WT_REF_DELETED: /* On-disk, deleted */ /* * If the child page was part of a truncate, * transaction rollback might switch this page into its @@ -489,7 +524,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) if (active) return (__wt_set_return(session, EBUSY)); break; - case WT_REF_LOOKASIDE: + case WT_REF_LOOKASIDE: /* On-disk, lookaside */ /* * If the lookaside history is obsolete, the reference * can be ignored. 
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index dcfb59d2ce9..21ba145b257 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -473,10 +473,21 @@ struct __wt_page_modify { WT_SPINLOCK page_lock; /* Page's spinlock */ /* - * The write generation is incremented when a page is modified, a page - * is clean if the write generation is 0. + * The page state is incremented when a page is modified. + * + * WT_PAGE_CLEAN -- + * The page is clean. + * WT_PAGE_DIRTY_FIRST -- + * The page is in this state after the first operation that marks a + * page dirty, or when reconciliation is checking to see if it has + * done enough work to be able to mark the page clean. + * WT_PAGE_DIRTY -- + * Two or more updates have been added to the page. */ - uint32_t write_gen; +#define WT_PAGE_CLEAN 0 +#define WT_PAGE_DIRTY_FIRST 1 +#define WT_PAGE_DIRTY 2 + uint32_t page_state; #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ @@ -577,6 +588,14 @@ struct __wt_page { for (__refp = __pindex->index, \ __entries = __pindex->entries; __entries > 0; --__entries) {\ (ref) = *__refp++; +#define WT_INTL_FOREACH_REVERSE_BEGIN(session, page, ref) do { \ + WT_PAGE_INDEX *__pindex; \ + WT_REF **__refp; \ + uint32_t __entries; \ + WT_INTL_INDEX_GET(session, page, __pindex); \ + for (__refp = __pindex->index + __pindex->entries, \ + __entries = __pindex->entries; __entries > 0; --__entries) {\ + (ref) = *--__refp; #define WT_INTL_FOREACH_END \ } \ } while (0) diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 4ca82e1ee9e..0984dc93d57 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page) static inline bool 
__wt_page_evict_clean(WT_PAGE *page) { - return (page->modify == NULL || (page->modify->write_gen == 0 && + return (page->modify == NULL || + (page->modify->page_state == WT_PAGE_CLEAN && page->modify->rec_result == 0)); } @@ -45,7 +46,8 @@ __wt_page_evict_clean(WT_PAGE *page) static inline bool __wt_page_is_modified(WT_PAGE *page) { - return (page->modify != NULL && page->modify->write_gen != 0); + return (page->modify != NULL && + page->modify->page_state != WT_PAGE_CLEAN); } /* @@ -494,19 +496,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD)); last_running = 0; - if (page->modify->write_gen == 0) + if (page->modify->page_state == WT_PAGE_CLEAN) last_running = S2C(session)->txn_global.last_running; /* - * We depend on atomic-add being a write barrier, that is, a barrier to - * ensure all changes to the page are flushed before updating the page - * write generation and/or marking the tree dirty, otherwise checkpoints + * We depend on the atomic operation being a write barrier, that is, a + * barrier to ensure all changes to the page are flushed before updating + * the page state and/or marking the tree dirty, otherwise checkpoints * and/or page reconciliation might be looking at a clean page/tree. * * Every time the page transitions from clean to dirty, update the cache * and transactional information. + * + * The page state can only ever be incremented above dirty by the number + * of concurrently running threads, so the counter will never approach + * the point where it would wrap. */ - if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) { + if (page->modify->page_state < WT_PAGE_DIRTY && + __wt_atomic_add32(&page->modify->page_state, 1) == + WT_PAGE_DIRTY_FIRST) { __wt_cache_dirty_incr(session, page); /* @@ -577,7 +585,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) * Allow the call to be made on clean pages. 
*/ if (__wt_page_is_modified(page)) { - page->modify->write_gen = 0; + /* + * The only part where ordering matters is during + * reconciliation where updates on other threads are performing + * writes to the page state that need to be visible to the + * reconciliation thread. + * + * Since clearing of the page state is not going to be happening + * during reconciliation on a separate thread, there's no write + * barrier needed here. + */ + page->modify->page_state = WT_PAGE_CLEAN; __wt_cache_dirty_decr(session, page); } } @@ -1498,26 +1516,30 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * memory_page_max setting, when we see many deleted items, and when we * are attempting to scan without trashing the cache. * - * Fast checks if eviction is disabled for this handle, operation or - * tree, then perform a general check if eviction will be possible. + * Checkpoint should not queue pages for urgent eviction if they require + * dirty eviction: there is a special exemption that allows checkpoint + * to evict dirty pages in a tree that is being checkpointed, and no + * other thread can help with that. Checkpoints don't rely on this code + * for dirty eviction: that is handled explicitly in __wt_sync_file. * - * Checkpoint should not queue pages for urgent eviction if it cannot - * evict them immediately: there is a special exemption that allows - * checkpoint to evict dirty pages in a tree that is being - * checkpointed, and no other thread can help with that. + * If the operation has disabled eviction or splitting, or the session + * is prevented from reconciling, then just queue the page for urgent + * eviction. Otherwise, attempt to release and evict it. 
*/ page = ref->page; if (WT_READGEN_EVICT_SOON(page->read_gen) && btree->evict_disabled == 0 && - __wt_page_can_evict(session, ref, &inmem_split)) { - if (!__wt_page_evict_clean(page) && - (LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split && - F_ISSET(session, WT_SESSION_NO_RECONCILE)))) { - if (!WT_SESSION_BTREE_SYNC(session)) - WT_IGNORE_RET( - __wt_page_evict_urgent(session, ref)); - } else { - WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); + __wt_page_can_evict(session, ref, &inmem_split) && + (!WT_SESSION_IS_CHECKPOINT(session) || + __wt_page_evict_clean(page))) { + if (LF_ISSET(WT_READ_NO_EVICT) || + (inmem_split ? LF_ISSET(WT_READ_NO_SPLIT) : + F_ISSET(session, WT_SESSION_NO_RECONCILE))) + WT_IGNORE_RET( + __wt_page_evict_urgent(session, ref)); + else { + WT_RET_BUSY_OK( + __wt_page_release_evict(session, ref)); return (0); } } diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 9e849bf4d7f..b0620091a23 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -249,13 +249,14 @@ struct __wt_cache { uint32_t pool_flags; /* Cache pool flags */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */ -#define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */ -#define WT_CACHE_EVICT_DIRTY 0x04u /* Evict dirty pages */ -#define WT_CACHE_EVICT_DIRTY_HARD 0x08u /* Dirty % blocking app threads */ -#define WT_CACHE_EVICT_LOOKASIDE 0x10u /* Try lookaside eviction */ -#define WT_CACHE_EVICT_SCRUB 0x20u /* Scrub dirty pages */ -#define WT_CACHE_EVICT_URGENT 0x40u /* Pages are in the urgent queue */ +#define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */ +#define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */ +#define WT_CACHE_EVICT_DIRTY 0x04u /* Evict dirty pages */ +#define WT_CACHE_EVICT_DIRTY_HARD 0x08u /* Dirty % blocking app threads */ +#define 
WT_CACHE_EVICT_LOOKASIDE 0x10u /* Try lookaside eviction */ +#define WT_CACHE_EVICT_NOKEEP 0x20u /* Don't add read pages to cache */ +#define WT_CACHE_EVICT_SCRUB 0x40u /* Scrub dirty pages */ +#define WT_CACHE_EVICT_URGENT 0x80u /* Pages are in the urgent queue */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY) uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 02f15cdb8af..19c69b71f8f 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -7,29 +7,6 @@ */ /* - * __page_write_gen_wrapped_check -- - * Confirm the page's write generation number won't wrap. - */ -static inline int -__page_write_gen_wrapped_check(WT_PAGE *page) -{ - /* - * Check to see if the page's write generation is about to wrap (wildly - * unlikely as it implies 4B updates between clean page reconciliations, - * but technically possible), and fail the update. - * - * The check is outside of the serialization mutex because the page's - * write generation is going to be a hot cache line, so technically it's - * possible for the page's write generation to wrap between the test and - * our subsequent modification of it. However, the test is (4B-1M), and - * there cannot be a million threads that have done the test but not yet - * completed their modification. - */ - return (page->modify->write_gen > - UINT32_MAX - WT_MILLION ? WT_RESTART : 0); -} - -/* * __insert_simple_func -- * Worker function to add a WT_INSERT entry to the middle of a skiplist. */ @@ -163,9 +140,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, new_ins = *new_insp; *new_insp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - /* * Acquire the page's spinlock unless we already have exclusive access. * Then call the worker function. 
@@ -215,9 +189,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, new_ins = *new_insp; *new_insp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - simple = true; for (i = 0; i < skipdepth; i++) if (new_ins->next[i] == NULL) @@ -273,9 +244,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, upd = *updp; *updp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - /* * All structure setup must be flushed before the structure is entered * into the list. We need a write barrier here, our callers depend on diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 37b0581af6b..d1aaf901534 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -26,12 +26,6 @@ typedef struct { uint32_t flags; /* Caller's configuration */ /* - * Track start/stop write generation to decide if all changes to the - * page are written. - */ - uint32_t orig_write_gen; - - /* * Track start/stop checkpoint generations to decide if lookaside table * records are correct. 
*/ @@ -340,6 +334,8 @@ static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __reconcile(WT_SESSION_IMPL *, + WT_REF *, WT_SALVAGE_COOKIE *, uint32_t, bool *, bool *); static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); @@ -355,19 +351,15 @@ int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) { - WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; - WT_RECONCILE *r; - uint64_t oldest_id; + bool no_reconcile_set, page_locked; - btree = S2BT(session); - page = ref->page; - mod = page->modify; if (lookaside_retryp != NULL) *lookaside_retryp = false; + page = ref->page; + __wt_verbose(session, WT_VERB_RECONCILE, "%p reconcile %s (%s%s%s)", (void *)ref, __wt_page_type_string(page->type), @@ -396,10 +388,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, LF_ISSET(WT_REC_VISIBLE_ALL) || F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); - /* We shouldn't get called with a clean page, that's an error. */ + /* It's an error to be called with a clean page. */ WT_ASSERT(session, __wt_page_is_modified(page)); /* + * Reconciliation acquires and releases pages, and in rare cases that + * page release triggers eviction. If the page is dirty, eviction can + * trigger reconciliation, and we re-enter this code. Reconciliation + * isn't re-entrant, so we need to ensure that doesn't happen. 
+ */ + no_reconcile_set = F_ISSET(session, WT_SESSION_NO_RECONCILE); + F_SET(session, WT_SESSION_NO_RECONCILE); + + /* * Reconciliation locks the page for three reasons: * Reconciliation reads the lists of page updates, obsolete updates * cannot be discarded while reconciliation is in progress; @@ -409,6 +410,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * a child page splitting during the reconciliation. */ WT_PAGE_LOCK(session, page); + page_locked = true; /* * Now that the page is locked, if attempting to evict it, check again @@ -416,20 +418,37 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * while we were waiting to acquire the lock (e.g., the page could have * split). */ - if (LF_ISSET(WT_REC_EVICT) && - !__wt_page_can_evict(session, ref, NULL)) { - WT_PAGE_UNLOCK(session, page); - return (__wt_set_return(session, EBUSY)); - } + if (LF_ISSET(WT_REC_EVICT) && !__wt_page_can_evict(session, ref, NULL)) + WT_ERR(__wt_set_return(session, EBUSY)); - /* Initialize the reconciliation structure for each new run. */ - if ((ret = __rec_init( - session, ref, flags, salvage, &session->reconcile)) != 0) { + /* + * Reconcile the page. The reconciliation code unlocks the page as soon + * as possible, and returns that information. + */ + ret = __reconcile(session, ref, + salvage, flags, lookaside_retryp, &page_locked); + +err: + if (page_locked) WT_PAGE_UNLOCK(session, page); - return (ret); - } - r = session->reconcile; + if (!no_reconcile_set) + F_CLR(session, WT_SESSION_NO_RECONCILE); + return (ret); +} +/* + * __reconcile_save_evict_state -- + * Save the transaction state that causes history to be pinned, whether + * reconciliation succeeds or fails. 
+ */ +static void +__reconcile_save_evict_state( + WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +{ + WT_PAGE_MODIFY *mod; + uint64_t oldest_id; + + mod = ref->page->modify; oldest_id = __wt_txn_oldest_id(session); /* @@ -457,6 +476,36 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id)); mod->last_oldest_id = oldest_id; #endif +} + +/* + * __reconcile -- + * Reconcile an in-memory page into its on-disk format, and write it. + */ +static int +__reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, + uint32_t flags, bool *lookaside_retryp, bool *page_lockedp) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; +#ifdef HAVE_TIMESTAMPS + WT_PAGE_MODIFY *mod; +#endif + WT_RECONCILE *r; + + btree = S2BT(session); + page = ref->page; + +#ifdef HAVE_TIMESTAMPS + mod = page->modify; +#endif + /* Save the eviction state. */ + __reconcile_save_evict_state(session, ref, flags); + + /* Initialize the reconciliation structure for each new run. */ + WT_RET(__rec_init(session, ref, flags, salvage, &session->reconcile)); + r = session->reconcile; /* Reconcile the page. */ switch (page->type) { @@ -515,6 +564,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, #endif /* Release the reconciliation lock. */ + *page_lockedp = false; WT_PAGE_UNLOCK(session, page); /* Update statistics. */ @@ -733,14 +783,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * The page only might be clean; if the write generation is - * unchanged since reconciliation started, it's clean. + * We set the page state to mark it as having been dirtied for + * the first time prior to reconciliation. A failed atomic cas + * indicates that an update has taken place during + * reconciliation. * - * If the write generation changed, the page has been written - * since reconciliation started and remains dirty (that can't - * happen when evicting, the page is exclusively locked). 
+ * The page only might be clean; if the page state is unchanged + * since reconciliation started, it's clean. + * + * If the page state changed, the page has been written since + * reconciliation started and remains dirty (that can't happen + * when evicting, the page is exclusively locked). */ - if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) + if (__wt_atomic_cas32( + &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN)) __wt_cache_dirty_decr(session, page); else WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); @@ -904,7 +960,16 @@ __rec_init(WT_SESSION_IMPL *session, btree = S2BT(session); page = ref->page; - if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { + /* + * Reconciliation is not re-entrant, make sure that doesn't happen. Our + * caller sets WT_SESSION_IMPL.WT_SESSION_NO_RECONCILE to prevent it, + * but it's been a problem in the past, check to be sure. + */ + r = *(WT_RECONCILE **)reconcilep; + if (r != NULL && r->ref != NULL) + WT_RET_MSG(session, WT_ERROR, "reconciliation re-entered"); + + if (r == NULL) { WT_RET(__wt_calloc_one(session, &r)); *(WT_RECONCILE **)reconcilep = r; @@ -919,21 +984,27 @@ __rec_init(WT_SESSION_IMPL *session, F_SET(&r->chunkB.image, WT_ITEM_ALIGNED); } - /* Reconciliation is not re-entrant, make sure that doesn't happen. */ - WT_ASSERT(session, r->ref == NULL); - /* Remember the configuration. */ r->ref = ref; r->page = page; /* - * Save the page's write generation before reading the page. * Save the transaction generations before reading the page. * These are all ordered reads, but we only need one. */ r->orig_btree_checkpoint_gen = btree->checkpoint_gen; r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); - WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Update the page state to indicate that all currently installed + * updates will be included in this reconciliation if it would mark the + * page clean. 
+ * + * Add a write barrier to make it more likely that a thread adding an + * update will see this state change. + */ + page->modify->page_state = WT_PAGE_DIRTY_FIRST; + WT_FULL_BARRIER(); /* * Cache the oldest running transaction ID. This is used to check diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c index eb65c00741c..1c6487ef07f 100644 --- a/src/third_party/wiredtiger/src/support/hazard.c +++ b/src/third_party/wiredtiger/src/support/hazard.c @@ -329,6 +329,10 @@ __wt_hazard_check(WT_SESSION_IMPL *session, WT_REF *ref) WT_SESSION_IMPL *s; uint32_t i, j, hazard_inuse, max, session_cnt, walk_cnt; + /* If a file can never be evicted, hazard pointers aren't required. */ + if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) + return (NULL); + conn = S2C(session); WT_STAT_CONN_INCR(session, cache_hazard_checks); |