summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2019-10-01 16:41:41 +1000
committer Luke Chen <luke.chen@mongodb.com> 2019-10-01 16:41:41 +1000
commit 86baddd406874c78cde915a8598999ff3f0ea70c (patch)
tree 50b946d6421bce6f02f37047a255e06a8b704fb2
parent 34217d3b595b172180603e48e17421a330e04a81 (diff)
download mongo-86baddd406874c78cde915a8598999ff3f0ea70c.tar.gz
Import wiredtiger: 4c72feeb921607b30984301f4e007fc24b54e26b from branch mongodb-3.6
ref: 04447c57d5..4c72feeb92
for: 3.6.15

WT-4869 Stop adding cache pressure when eviction is falling behind
WT-4881 Soften the restrictions on re-entering reconciliation
WT-4893 Fix a race between internal page child-page eviction checks and cursors in the tree
WT-4898 Don't allow the eviction server to reconcile if it's busy
WT-4956 Handle the case where 4 billion updates are made to a page without eviction
WT-4957 Revert part of a change about when pages are queued for urgent eviction
WT-5050 Assertion failure during urgent eviction of metadata page
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok | 1
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_debug.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c | 3
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_sync.c | 5
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 52
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c | 39
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h | 25
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i | 68
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.h | 15
-rw-r--r-- src/third_party/wiredtiger/src/include/serial.i | 32
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 145
-rw-r--r-- src/third_party/wiredtiger/src/support/hazard.c | 4
13 files changed, 266 insertions, 127 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 8b3110e87c8..b3b95a4e50a 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1302,6 +1302,7 @@ unmodify
unordered
unpackv
unpadded
+unreconciled
unreferenced
unregister
unsized
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index b16f253e705..602318b9214 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "04447c57d565903849f0797fa391cd60f5fc7992",
+ "commit": "4c72feeb921607b30984301f4e007fc24b54e26b",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.6"
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 47b84ad7a25..319b9357bd7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -883,7 +883,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
if (split_gen != 0)
WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen));
if (mod != NULL)
- WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen));
+ WT_RET(ds->f(ds, ", page-state=%" PRIu32, mod->page_state));
WT_RET(ds->f(ds,
", memory-size %" WT_SIZET_FMT, page->memory_footprint));
WT_RET(ds->f(ds, "\n"));
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index bbf60aabc20..69b0f95d205 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -689,7 +689,8 @@ read: /*
* we "acquire" it.
*/
wont_need = LF_ISSET(WT_READ_WONT_NEED) ||
- F_ISSET(session, WT_SESSION_READ_WONT_NEED);
+ F_ISSET(session, WT_SESSION_READ_WONT_NEED) ||
+ F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP);
continue;
case WT_REF_READING:
if (LF_ISSET(WT_READ_CACHE))
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index c5c08faa701..6a973335541 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -324,9 +324,14 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* cache clean but with history that cannot be
* discarded), that is not wasted effort because
* checkpoint doesn't need to write the page again.
+ *
+ * Once the transaction has given up its snapshot it
+ * is no longer safe to reconcile pages. That happens
+ * prior to the final metadata checkpoint.
*/
if (!WT_PAGE_IS_INTERNAL(page) &&
page->read_gen == WT_READGEN_WONT_NEED &&
+ F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT) &&
!tried_eviction) {
WT_ERR_BUSY_OK(
__wt_page_release_evict(session, walk));
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index ff3772533ae..bb310369d82 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -559,6 +559,7 @@ __evict_update_work(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
double dirty_target, dirty_trigger, target, trigger;
uint64_t bytes_inuse, bytes_max, dirty_inuse;
+ uint32_t flags;
conn = S2C(session);
cache = conn->cache;
@@ -568,14 +569,16 @@ __evict_update_work(WT_SESSION_IMPL *session)
target = cache->eviction_target;
trigger = cache->eviction_trigger;
- /* Clear previous state. */
- cache->flags = 0;
+ /* Build up the new state. */
+ flags = 0;
- if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
+ cache->flags = 0;
return (false);
+ }
if (!__evict_queue_empty(cache->evict_urgent_queue, false))
- F_SET(cache, WT_CACHE_EVICT_URGENT);
+ LF_SET(WT_CACHE_EVICT_URGENT);
if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) {
WT_ASSERT(session,
@@ -594,32 +597,38 @@ __evict_update_work(WT_SESSION_IMPL *session)
bytes_max = conn->cache_size + 1;
bytes_inuse = __wt_cache_bytes_inuse(cache);
if (__wt_eviction_clean_needed(session, NULL))
- F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
+ LF_SET(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
else if (bytes_inuse > (target * bytes_max) / 100)
- F_SET(cache, WT_CACHE_EVICT_CLEAN);
+ LF_SET(WT_CACHE_EVICT_CLEAN);
dirty_inuse = __wt_cache_dirty_leaf_inuse(cache);
if (__wt_eviction_dirty_needed(session, NULL))
- F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD);
+ LF_SET(WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD);
else if (dirty_inuse > (uint64_t)(dirty_target * bytes_max) / 100)
- F_SET(cache, WT_CACHE_EVICT_DIRTY);
+ LF_SET(WT_CACHE_EVICT_DIRTY);
/*
* If application threads are blocked by the total volume of data in
* cache, try dirty pages as well.
*/
if (__wt_cache_aggressive(session) &&
- F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD))
- F_SET(cache, WT_CACHE_EVICT_DIRTY);
+ LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD))
+ LF_SET(WT_CACHE_EVICT_DIRTY);
+
+ /* When we stop looking for dirty pages, reduce the lookaside score. */
+ if (!LF_ISSET(WT_CACHE_EVICT_DIRTY))
+ __wt_cache_update_lookaside_score(session, 1, 0);
/*
* Scrub dirty pages and keep them in cache if we are less than half
* way to the clean or dirty trigger.
*/
- if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200 &&
- dirty_inuse <
- (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200)
- F_SET(cache, WT_CACHE_EVICT_SCRUB);
+ if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200) {
+ if (dirty_inuse < (uint64_t)
+ ((dirty_target + dirty_trigger) * bytes_max) / 200)
+ LF_SET(WT_CACHE_EVICT_SCRUB);
+ } else
+ LF_SET(WT_CACHE_EVICT_NOKEEP);
/*
* Try lookaside evict when:
@@ -632,20 +641,23 @@ __evict_update_work(WT_SESSION_IMPL *session)
(__wt_cache_lookaside_score(cache) > 80 &&
dirty_inuse >
(uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200))
- F_SET(cache, WT_CACHE_EVICT_LOOKASIDE);
+ LF_SET(WT_CACHE_EVICT_LOOKASIDE);
/*
* With an in-memory cache, we only do dirty eviction in order to scrub
* pages.
*/
if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
- if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
- F_SET(cache, WT_CACHE_EVICT_DIRTY);
- if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD))
- F_SET(cache, WT_CACHE_EVICT_DIRTY_HARD);
- F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
+ if (LF_ISSET(WT_CACHE_EVICT_CLEAN))
+ LF_SET(WT_CACHE_EVICT_DIRTY);
+ if (LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD))
+ LF_SET(WT_CACHE_EVICT_DIRTY_HARD);
+ LF_CLR(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
}
+ /* Update the global eviction state. */
+ cache->flags = flags;
+
return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT));
}
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 44c3bbb8f78..a3fbe66b4a9 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -461,11 +461,46 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
WT_REF *child;
bool active;
+ /*
+ * There may be cursors in the tree walking the list of child pages.
+ * The parent is locked, so all we care about is cursors already in the
+ * child pages, no thread can enter them. Any cursor moving through the
+ * child pages must be hazard pointer coupling between pages, where the
+ * page on which it currently has a hazard pointer must be in a state
+ * other than on-disk. Walk the child list forward, then backward, to
+ * ensure we don't race with a cursor walking in the opposite direction
+ * from our check.
+ */
+ WT_INTL_FOREACH_BEGIN(session, parent->page, child) {
+ switch (child->state) {
+ case WT_REF_DISK: /* On-disk */
+ case WT_REF_DELETED: /* On-disk, deleted */
+ case WT_REF_LOOKASIDE: /* On-disk, lookaside */
+ break;
+ default:
+ return (__wt_set_return(session, EBUSY));
+ }
+ } WT_INTL_FOREACH_END;
+ WT_INTL_FOREACH_REVERSE_BEGIN(session, parent->page, child) {
+ switch (child->state) {
+ case WT_REF_DISK: /* On-disk */
+ case WT_REF_DELETED: /* On-disk, deleted */
+ case WT_REF_LOOKASIDE: /* On-disk, lookaside */
+ break;
+ default:
+ return (__wt_set_return(session, EBUSY));
+ }
+ } WT_INTL_FOREACH_END;
+
+ /*
+ * The fast check is done and there are no cursors in the child pages.
+ * Make sure the child WT_REF structures pages can be discarded.
+ */
WT_INTL_FOREACH_BEGIN(session, parent->page, child) {
switch (child->state) {
case WT_REF_DISK: /* On-disk */
break;
- case WT_REF_DELETED: /* Deleted */
+ case WT_REF_DELETED: /* On-disk, deleted */
/*
* If the child page was part of a truncate,
* transaction rollback might switch this page into its
@@ -489,7 +524,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
if (active)
return (__wt_set_return(session, EBUSY));
break;
- case WT_REF_LOOKASIDE:
+ case WT_REF_LOOKASIDE: /* On-disk, lookaside */
/*
* If the lookaside history is obsolete, the reference
* can be ignored.
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index dcfb59d2ce9..21ba145b257 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -473,10 +473,21 @@ struct __wt_page_modify {
WT_SPINLOCK page_lock; /* Page's spinlock */
/*
- * The write generation is incremented when a page is modified, a page
- * is clean if the write generation is 0.
+ * The page state is incremented when a page is modified.
+ *
+ * WT_PAGE_CLEAN --
+ * The page is clean.
+ * WT_PAGE_DIRTY_FIRST --
+ * The page is in this state after the first operation that marks a
+ * page dirty, or when reconciliation is checking to see if it has
+ * done enough work to be able to mark the page clean.
+ * WT_PAGE_DIRTY --
+ * Two or more updates have been added to the page.
*/
- uint32_t write_gen;
+#define WT_PAGE_CLEAN 0
+#define WT_PAGE_DIRTY_FIRST 1
+#define WT_PAGE_DIRTY 2
+ uint32_t page_state;
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
@@ -577,6 +588,14 @@ struct __wt_page {
for (__refp = __pindex->index, \
__entries = __pindex->entries; __entries > 0; --__entries) {\
(ref) = *__refp++;
+#define WT_INTL_FOREACH_REVERSE_BEGIN(session, page, ref) do { \
+ WT_PAGE_INDEX *__pindex; \
+ WT_REF **__refp; \
+ uint32_t __entries; \
+ WT_INTL_INDEX_GET(session, page, __pindex); \
+ for (__refp = __pindex->index + __pindex->entries, \
+ __entries = __pindex->entries; __entries > 0; --__entries) {\
+ (ref) = *--__refp;
#define WT_INTL_FOREACH_END \
} \
} while (0)
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 4ca82e1ee9e..0984dc93d57 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page)
static inline bool
__wt_page_evict_clean(WT_PAGE *page)
{
- return (page->modify == NULL || (page->modify->write_gen == 0 &&
+ return (page->modify == NULL ||
+ (page->modify->page_state == WT_PAGE_CLEAN &&
page->modify->rec_result == 0));
}
@@ -45,7 +46,8 @@ __wt_page_evict_clean(WT_PAGE *page)
static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0);
+ return (page->modify != NULL &&
+ page->modify->page_state != WT_PAGE_CLEAN);
}
/*
@@ -494,19 +496,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));
last_running = 0;
- if (page->modify->write_gen == 0)
+ if (page->modify->page_state == WT_PAGE_CLEAN)
last_running = S2C(session)->txn_global.last_running;
/*
- * We depend on atomic-add being a write barrier, that is, a barrier to
- * ensure all changes to the page are flushed before updating the page
- * write generation and/or marking the tree dirty, otherwise checkpoints
+ * We depend on the atomic operation being a write barrier, that is, a
+ * barrier to ensure all changes to the page are flushed before updating
+ * the page state and/or marking the tree dirty, otherwise checkpoints
* and/or page reconciliation might be looking at a clean page/tree.
*
* Every time the page transitions from clean to dirty, update the cache
* and transactional information.
+ *
+ * The page state can only ever be incremented above dirty by the number
+ * of concurrently running threads, so the counter will never approach
+ * the point where it would wrap.
*/
- if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
+ if (page->modify->page_state < WT_PAGE_DIRTY &&
+ __wt_atomic_add32(&page->modify->page_state, 1) ==
+ WT_PAGE_DIRTY_FIRST) {
__wt_cache_dirty_incr(session, page);
/*
@@ -577,7 +585,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
* Allow the call to be made on clean pages.
*/
if (__wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
+ /*
+ * The only part where ordering matters is during
+ * reconciliation where updates on other threads are performing
+ * writes to the page state that need to be visible to the
+ * reconciliation thread.
+ *
+ * Since clearing of the page state is not going to be happening
+ * during reconciliation on a separate thread, there's no write
+ * barrier needed here.
+ */
+ page->modify->page_state = WT_PAGE_CLEAN;
__wt_cache_dirty_decr(session, page);
}
}
@@ -1498,26 +1516,30 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
* memory_page_max setting, when we see many deleted items, and when we
* are attempting to scan without trashing the cache.
*
- * Fast checks if eviction is disabled for this handle, operation or
- * tree, then perform a general check if eviction will be possible.
+ * Checkpoint should not queue pages for urgent eviction if they require
+ * dirty eviction: there is a special exemption that allows checkpoint
+ * to evict dirty pages in a tree that is being checkpointed, and no
+ * other thread can help with that. Checkpoints don't rely on this code
+ * for dirty eviction: that is handled explicitly in __wt_sync_file.
*
- * Checkpoint should not queue pages for urgent eviction if it cannot
- * evict them immediately: there is a special exemption that allows
- * checkpoint to evict dirty pages in a tree that is being
- * checkpointed, and no other thread can help with that.
+ * If the operation has disabled eviction or splitting, or the session
+ * is prevented from reconciling, then just queue the page for urgent
+ * eviction. Otherwise, attempt to release and evict it.
*/
page = ref->page;
if (WT_READGEN_EVICT_SOON(page->read_gen) &&
btree->evict_disabled == 0 &&
- __wt_page_can_evict(session, ref, &inmem_split)) {
- if (!__wt_page_evict_clean(page) &&
- (LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split &&
- F_ISSET(session, WT_SESSION_NO_RECONCILE)))) {
- if (!WT_SESSION_BTREE_SYNC(session))
- WT_IGNORE_RET(
- __wt_page_evict_urgent(session, ref));
- } else {
- WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));
+ __wt_page_can_evict(session, ref, &inmem_split) &&
+ (!WT_SESSION_IS_CHECKPOINT(session) ||
+ __wt_page_evict_clean(page))) {
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ (inmem_split ? LF_ISSET(WT_READ_NO_SPLIT) :
+ F_ISSET(session, WT_SESSION_NO_RECONCILE)))
+ WT_IGNORE_RET(
+ __wt_page_evict_urgent(session, ref));
+ else {
+ WT_RET_BUSY_OK(
+ __wt_page_release_evict(session, ref));
return (0);
}
}
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index 9e849bf4d7f..b0620091a23 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -249,13 +249,14 @@ struct __wt_cache {
uint32_t pool_flags; /* Cache pool flags */
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */
-#define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */
-#define WT_CACHE_EVICT_DIRTY 0x04u /* Evict dirty pages */
-#define WT_CACHE_EVICT_DIRTY_HARD 0x08u /* Dirty % blocking app threads */
-#define WT_CACHE_EVICT_LOOKASIDE 0x10u /* Try lookaside eviction */
-#define WT_CACHE_EVICT_SCRUB 0x20u /* Scrub dirty pages */
-#define WT_CACHE_EVICT_URGENT 0x40u /* Pages are in the urgent queue */
+#define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */
+#define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */
+#define WT_CACHE_EVICT_DIRTY 0x04u /* Evict dirty pages */
+#define WT_CACHE_EVICT_DIRTY_HARD 0x08u /* Dirty % blocking app threads */
+#define WT_CACHE_EVICT_LOOKASIDE 0x10u /* Try lookaside eviction */
+#define WT_CACHE_EVICT_NOKEEP 0x20u /* Don't add read pages to cache */
+#define WT_CACHE_EVICT_SCRUB 0x40u /* Scrub dirty pages */
+#define WT_CACHE_EVICT_URGENT 0x80u /* Pages are in the urgent queue */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
#define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY)
uint32_t flags;
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 02f15cdb8af..19c69b71f8f 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -7,29 +7,6 @@
*/
/*
- * __page_write_gen_wrapped_check --
- * Confirm the page's write generation number won't wrap.
- */
-static inline int
-__page_write_gen_wrapped_check(WT_PAGE *page)
-{
- /*
- * Check to see if the page's write generation is about to wrap (wildly
- * unlikely as it implies 4B updates between clean page reconciliations,
- * but technically possible), and fail the update.
- *
- * The check is outside of the serialization mutex because the page's
- * write generation is going to be a hot cache line, so technically it's
- * possible for the page's write generation to wrap between the test and
- * our subsequent modification of it. However, the test is (4B-1M), and
- * there cannot be a million threads that have done the test but not yet
- * completed their modification.
- */
- return (page->modify->write_gen >
- UINT32_MAX - WT_MILLION ? WT_RESTART : 0);
-}
-
-/*
* __insert_simple_func --
* Worker function to add a WT_INSERT entry to the middle of a skiplist.
*/
@@ -163,9 +140,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
new_ins = *new_insp;
*new_insp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/*
* Acquire the page's spinlock unless we already have exclusive access.
* Then call the worker function.
@@ -215,9 +189,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
new_ins = *new_insp;
*new_insp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
simple = true;
for (i = 0; i < skipdepth; i++)
if (new_ins->next[i] == NULL)
@@ -273,9 +244,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
upd = *updp;
*updp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/*
* All structure setup must be flushed before the structure is entered
* into the list. We need a write barrier here, our callers depend on
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 37b0581af6b..d1aaf901534 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -26,12 +26,6 @@ typedef struct {
uint32_t flags; /* Caller's configuration */
/*
- * Track start/stop write generation to decide if all changes to the
- * page are written.
- */
- uint32_t orig_write_gen;
-
- /*
* Track start/stop checkpoint generations to decide if lookaside table
* records are correct.
*/
@@ -340,6 +334,8 @@ static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __reconcile(WT_SESSION_IMPL *,
+ WT_REF *, WT_SALVAGE_COOKIE *, uint32_t, bool *, bool *);
static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
@@ -355,19 +351,15 @@ int
__wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp)
{
- WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- WT_RECONCILE *r;
- uint64_t oldest_id;
+ bool no_reconcile_set, page_locked;
- btree = S2BT(session);
- page = ref->page;
- mod = page->modify;
if (lookaside_retryp != NULL)
*lookaside_retryp = false;
+ page = ref->page;
+
__wt_verbose(session, WT_VERB_RECONCILE,
"%p reconcile %s (%s%s%s)",
(void *)ref, __wt_page_type_string(page->type),
@@ -396,10 +388,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
LF_ISSET(WT_REC_VISIBLE_ALL) ||
F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
- /* We shouldn't get called with a clean page, that's an error. */
+ /* It's an error to be called with a clean page. */
WT_ASSERT(session, __wt_page_is_modified(page));
/*
+ * Reconciliation acquires and releases pages, and in rare cases that
+ * page release triggers eviction. If the page is dirty, eviction can
+ * trigger reconciliation, and we re-enter this code. Reconciliation
+ * isn't re-entrant, so we need to ensure that doesn't happen.
+ */
+ no_reconcile_set = F_ISSET(session, WT_SESSION_NO_RECONCILE);
+ F_SET(session, WT_SESSION_NO_RECONCILE);
+
+ /*
* Reconciliation locks the page for three reasons:
* Reconciliation reads the lists of page updates, obsolete updates
* cannot be discarded while reconciliation is in progress;
@@ -409,6 +410,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* a child page splitting during the reconciliation.
*/
WT_PAGE_LOCK(session, page);
+ page_locked = true;
/*
* Now that the page is locked, if attempting to evict it, check again
@@ -416,20 +418,37 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* while we were waiting to acquire the lock (e.g., the page could have
* split).
*/
- if (LF_ISSET(WT_REC_EVICT) &&
- !__wt_page_can_evict(session, ref, NULL)) {
- WT_PAGE_UNLOCK(session, page);
- return (__wt_set_return(session, EBUSY));
- }
+ if (LF_ISSET(WT_REC_EVICT) && !__wt_page_can_evict(session, ref, NULL))
+ WT_ERR(__wt_set_return(session, EBUSY));
- /* Initialize the reconciliation structure for each new run. */
- if ((ret = __rec_init(
- session, ref, flags, salvage, &session->reconcile)) != 0) {
+ /*
+ * Reconcile the page. The reconciliation code unlocks the page as soon
+ * as possible, and returns that information.
+ */
+ ret = __reconcile(session, ref,
+ salvage, flags, lookaside_retryp, &page_locked);
+
+err:
+ if (page_locked)
WT_PAGE_UNLOCK(session, page);
- return (ret);
- }
- r = session->reconcile;
+ if (!no_reconcile_set)
+ F_CLR(session, WT_SESSION_NO_RECONCILE);
+ return (ret);
+}
+/*
+ * __reconcile_save_evict_state --
+ * Save the transaction state that causes history to be pinned, whether
+ * reconciliation succeeds or fails.
+ */
+static void
+__reconcile_save_evict_state(
+ WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_PAGE_MODIFY *mod;
+ uint64_t oldest_id;
+
+ mod = ref->page->modify;
oldest_id = __wt_txn_oldest_id(session);
/*
@@ -457,6 +476,36 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
mod->last_oldest_id = oldest_id;
#endif
+}
+
+/*
+ * __reconcile --
+ * Reconcile an in-memory page into its on-disk format, and write it.
+ */
+static int
+__reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage,
+ uint32_t flags, bool *lookaside_retryp, bool *page_lockedp)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+#ifdef HAVE_TIMESTAMPS
+ WT_PAGE_MODIFY *mod;
+#endif
+ WT_RECONCILE *r;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+#ifdef HAVE_TIMESTAMPS
+ mod = page->modify;
+#endif
+ /* Save the eviction state. */
+ __reconcile_save_evict_state(session, ref, flags);
+
+ /* Initialize the reconciliation structure for each new run. */
+ WT_RET(__rec_init(session, ref, flags, salvage, &session->reconcile));
+ r = session->reconcile;
/* Reconcile the page. */
switch (page->type) {
@@ -515,6 +564,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
#endif
/* Release the reconciliation lock. */
+ *page_lockedp = false;
WT_PAGE_UNLOCK(session, page);
/* Update statistics. */
@@ -733,14 +783,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean.
+ * We set the page state to mark it as having been dirtied for
+ * the first time prior to reconciliation. A failed atomic cas
+ * indicates that an update has taken place during
+ * reconciliation.
*
- * If the write generation changed, the page has been written
- * since reconciliation started and remains dirty (that can't
- * happen when evicting, the page is exclusively locked).
+ * The page only might be clean; if the page state is unchanged
+ * since reconciliation started, it's clean.
+ *
+ * If the page state changed, the page has been written since
+ * reconciliation started and remains dirty (that can't happen
+ * when evicting, the page is exclusively locked).
*/
- if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ if (__wt_atomic_cas32(
+ &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN))
__wt_cache_dirty_decr(session, page);
else
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
@@ -904,7 +960,16 @@ __rec_init(WT_SESSION_IMPL *session,
btree = S2BT(session);
page = ref->page;
- if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+ /*
+ * Reconciliation is not re-entrant, make sure that doesn't happen. Our
+ * caller sets WT_SESSION_IMPL.WT_SESSION_NO_RECONCILE to prevent it,
+ * but it's been a problem in the past, check to be sure.
+ */
+ r = *(WT_RECONCILE **)reconcilep;
+ if (r != NULL && r->ref != NULL)
+ WT_RET_MSG(session, WT_ERROR, "reconciliation re-entered");
+
+ if (r == NULL) {
WT_RET(__wt_calloc_one(session, &r));
*(WT_RECONCILE **)reconcilep = r;
@@ -919,21 +984,27 @@ __rec_init(WT_SESSION_IMPL *session,
F_SET(&r->chunkB.image, WT_ITEM_ALIGNED);
}
- /* Reconciliation is not re-entrant, make sure that doesn't happen. */
- WT_ASSERT(session, r->ref == NULL);
-
/* Remember the configuration. */
r->ref = ref;
r->page = page;
/*
- * Save the page's write generation before reading the page.
* Save the transaction generations before reading the page.
* These are all ordered reads, but we only need one.
*/
r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Update the page state to indicate that all currently installed
+ * updates will be included in this reconciliation if it would mark the
+ * page clean.
+ *
+ * Add a write barrier to make it more likely that a thread adding an
+ * update will see this state change.
+ */
+ page->modify->page_state = WT_PAGE_DIRTY_FIRST;
+ WT_FULL_BARRIER();
/*
* Cache the oldest running transaction ID. This is used to check
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
index eb65c00741c..1c6487ef07f 100644
--- a/src/third_party/wiredtiger/src/support/hazard.c
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -329,6 +329,10 @@ __wt_hazard_check(WT_SESSION_IMPL *session, WT_REF *ref)
WT_SESSION_IMPL *s;
uint32_t i, j, hazard_inuse, max, session_cnt, walk_cnt;
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY))
+ return (NULL);
+
conn = S2C(session);
WT_STAT_CONN_INCR(session, cache_hazard_checks);