summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2022-09-09 16:04:57 +1000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-09 06:34:13 +0000
commitccc7ca852ec042b2ee948812d4980ee6197375b5 (patch)
tree060e42b79c4782051909b21c55b6fb20dbaf3f44
parentdd101c28231cda92c52047f7b61475625513c522 (diff)
downloadmongo-ccc7ca852ec042b2ee948812d4980ee6197375b5.tar.gz
Import wiredtiger: bb64c7cdcac3638b66ecea5c240df1448d409ab8 from branch mongodb-6.1
ref: cfb48c05eb..bb64c7cdca for: 6.1.0-rc2 WT-9751 Fix memory leak if we fail eviction in single page rewrite (#8214)
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c15
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c9
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c166
-rw-r--r--src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9751138
5 files changed, 251 insertions, 79 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 4c48c144ef5..9a2ab6174df 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-6.1",
- "commit": "cfb48c05ebcc12aa122d245430dbda07c5f0513a"
+ "commit": "bb64c7cdcac3638b66ecea5c240df1448d409ab8"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 6af568b3a76..1ba7ca1c43b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -165,6 +165,13 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
__wt_free(session, multi->supd);
+ /*
+ * Discard the new disk images if they are not NULL. If the new disk images are NULL,
+ * they must have been instantiated into memory. Otherwise, we have a failure in
+ * eviction after reconciliation. If the split code only successfully instantiates a
+ * subset of new pages into memory, free the instantiated pages and the new disk images
+ * of the pages not in memory. We will redo reconciliation next time we visit this page.
+ */
__wt_free(session, multi->disk_image);
__wt_free(session, multi->addr.addr);
}
@@ -174,8 +181,16 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* Discard any replacement address: this memory is usually moved into the parent's WT_REF,
* but at the root that can't happen.
+ *
+ * Discard the new disk image if it is not NULL. If the new disk image is NULL, it must have
+ * been instantiated into memory. Otherwise, we have a failure in eviction after
+ * reconciliation and later we decide to discard the old disk image without loading the new
+ * disk image into memory. Free the new disk image in this case. If a checkpoint visits this
+ * page, it would write the new disk image even it hasn't been instantiated into memory.
+ * Therefore, no need to reconcile the page again if it remains clean.
*/
__wt_free(session, mod->mod_replace.addr);
+ __wt_free(session, mod->mod_disk_image);
break;
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index a48210bd5bc..d9ce6c6a3ac 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -2127,8 +2127,8 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
__split_multi_inmem_final(session, page, &mod->mod_multi[i]);
/*
- * Pages with unresolved changes are not marked clean in reconciliation, do it now, then discard
- * the page.
+ * Page with changes not written in this reconciliation is not marked as clean, do it now, then
+ * discard the page.
*/
__wt_page_modify_clear(session, page);
__wt_page_out(session, &page);
@@ -2137,6 +2137,11 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
err:
for (i = 0; i < new_entries; ++i)
__split_multi_inmem_fail(session, page, &mod->mod_multi[i], ref_new[i]);
+ /*
+ * Mark the page dirty to ensure it is reconciled again as we free the split disk images if
+ * we fail to instantiate any of them into memory.
+ */
+ __wt_page_modify_set(session, page);
}
__wt_free(session, ref_new);
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 3917c6c5b25..13543a41af5 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -10,6 +10,7 @@
static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, uint32_t);
static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, uint32_t);
+static int __evict_reconcile(WT_SESSION_IMPL *, WT_REF *, uint32_t);
static int __evict_review(WT_SESSION_IMPL *, WT_REF *, uint32_t, bool *);
/*
@@ -168,11 +169,27 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32
WT_ERR(__evict_review(session, ref, flags, &inmem_split));
/*
- * If there was an in-memory split, the tree has been left in the state we want: there is
- * nothing more to do.
+ * If we decide to do an in-memory split. Do it now. If an in-memory split completes, the page
+ * stays in memory and the tree is left in the desired state: avoid the usual cleanup.
*/
- if (inmem_split)
+ if (inmem_split) {
+ WT_ERR(__wt_split_insert(session, ref));
goto done;
+ }
+
+ /* No need to reconcile the page if it is from a dead tree or it is clean. */
+ if (!tree_dead && __wt_page_is_modified(page))
+ WT_ERR(__evict_reconcile(session, ref, flags));
+
+ /*
+ * Fail 0.1% of the time after we have done reconciliation. We should always evict the page of a
+ * dead tree.
+ */
+ if (!closing && !tree_dead &&
+ __wt_failpoint(session, WT_TIMING_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION, 10)) {
+ ret = EBUSY;
+ goto err;
+ }
/* Check we are not evicting an accessible internal page with an active split generation. */
WT_ASSERT(session,
@@ -206,6 +223,11 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32
else
WT_ERR(__evict_page_dirty_update(session, ref, flags));
+ /*
+ * We have loaded the new disk image and updated the tree structure. We can no longer fail after
+ * this point.
+ */
+
if (time_start != 0) {
time_stop = __wt_clock(session);
if (force_evict_hs)
@@ -369,6 +391,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_
WT_MULTI multi;
WT_PAGE_MODIFY *mod;
bool closing;
+ void *tmp;
mod = ref->page->modify;
closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);
@@ -431,13 +454,20 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_
__wt_ref_out(session, ref);
WT_REF_SET_STATE(ref, WT_REF_DISK);
} else {
- /*
- * The split code works with WT_MULTI structures, build one for the disk image.
- */
+ /* The split code works with WT_MULTI structures, build one for the disk image. */
memset(&multi, 0, sizeof(multi));
multi.disk_image = mod->mod_disk_image;
-
- WT_RET(__wt_split_rewrite(session, ref, &multi));
+ /*
+ * Store the disk image to a temporary pointer in case we fail to rewrite the page and
+ * we need to link the new disk image back to the old disk image.
+ */
+ tmp = mod->mod_disk_image;
+ mod->mod_disk_image = NULL;
+ ret = __wt_split_rewrite(session, ref, &multi);
+ if (ret != 0) {
+ mod->mod_disk_image = tmp;
+ return (ret);
+ }
}
break;
@@ -575,27 +605,22 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
/*
* __evict_review --
- * Get exclusive access to the page and review the page and its subtree for conditions that
- * would block its eviction.
+ * Review the page and its subtree for conditions that would block its eviction.
*/
static int
__evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool *inmem_splitp)
{
WT_BTREE *btree;
- WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
- uint32_t flags;
bool closing, modified;
- bool is_eviction_thread, use_snapshot_for_app_thread;
*inmem_splitp = false;
btree = S2BT(session);
conn = S2C(session);
page = ref->page;
- flags = WT_REC_EVICT;
closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);
/*
@@ -640,13 +665,9 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
if (!__wt_page_can_evict(session, ref, inmem_splitp))
return (__wt_set_return(session, EBUSY));
- /*
- * Check for an append-only workload needing an in-memory split; we can't do this earlier
- * because in-memory splits require exclusive access. If an in-memory split completes, the
- * page stays in memory and the tree is left in the desired state: avoid the usual cleanup.
- */
+ /* Check for an append-only workload needing an in-memory split. */
if (*inmem_splitp)
- return (__wt_split_insert(session, ref));
+ return (0);
}
/* If the page is clean, we're done and we can evict. */
@@ -655,12 +676,12 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
/*
* If we are trying to evict a dirty page that does not belong to history store(HS) and
- * checkpoint is processing the HS file, then avoid evicting the dirty non-HS page for now if
- * the cache is already dominated by dirty HS content.
+ * checkpoint is processing the HS file, avoid evicting the dirty non-HS page for now if the
+ * cache is already dominated by dirty HS content.
*
- * Evicting a non-HS dirty page can generate even more HS content. As we can not evict HS pages
+ * Evicting an non-HS dirty page can generate even more HS content. As we cannot evict HS pages
* while checkpoint is operating on the HS file, we can end up in a situation where we exceed
- * the cache size limits.
+ * the cache size limit.
*/
if (conn->txn_global.checkpoint_running_hs && !WT_IS_HS(btree->dhandle) &&
__wt_cache_hs_dirty(session) && __wt_cache_full(session)) {
@@ -674,33 +695,28 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
if (F_ISSET(session, WT_SESSION_NO_RECONCILE))
return (__wt_set_return(session, EBUSY));
- /*
- * If the page is dirty, reconcile it to decide if we can evict it.
- *
- * If we have an exclusive lock (we're discarding the tree), assert there are no updates we
- * cannot read.
- *
- * Don't set any other flags for internal pages: there are no update lists to be saved and
- * restored, changes can't be written into the history store table, nor can we re-create
- * internal pages in memory.
- *
- * For leaf pages:
- *
- * In-memory pages are a known configuration.
- *
- * Set the update/restore flag, so reconciliation will write blocks it can write and create a
- * list of skipped updates for blocks it cannot write, along with disk images. This is how
- * eviction of active, huge pages works: we take a big page and reconcile it into blocks, some
- * of which we write and discard, the rest of which we re-create as smaller in-memory pages,
- * (restoring the updates that stopped us from writing the block), and inserting the whole mess
- * into the page's parent. Set the flag in all cases because the incremental cost of
- * update/restore in reconciliation is minimal, eviction shouldn't have picked a page where
- * update/restore is necessary, absent some cache pressure. It's possible updates occurred after
- * we selected this page for eviction, but it's unlikely and we don't try and manage that risk.
- *
- * Additionally, if we aren't trying to free space in the cache, scrub the page and keep it in
- * memory.
- */
+ return (0);
+}
+
+/*
+ * __evict_reconcile --
+ * Reconcile the page for eviction.
+ */
+static int
+__evict_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint32_t flags;
+ bool closing, is_eviction_thread, use_snapshot_for_app_thread;
+
+ btree = S2BT(session);
+ conn = S2C(session);
+ flags = WT_REC_EVICT;
+ closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);
+
cache = conn->cache;
/*
@@ -710,22 +726,36 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
if (FLD_ISSET(evict_flags, WT_EVICT_CALL_URGENT))
LF_SET(WT_REC_CALL_URGENT);
+ /*
+ * If we have an exclusive lock (we're discarding the tree), assert there are no updates we
+ * cannot read.
+ */
if (closing)
LF_SET(WT_REC_VISIBILITY_ERR);
+ /*
+ * Don't set any other flags for internal pages: there are no update lists to be saved and
+ * restored, changes can't be written into the history store table, nor can we re-create
+ * internal pages in memory.
+ *
+ * Don't set any other flags for history store table as all the content is evictable.
+ */
else if (F_ISSET(ref, WT_REF_FLAG_INTERNAL) || WT_IS_HS(btree->dhandle))
;
- else if (WT_SESSION_BTREE_SYNC(session) && !WT_IS_METADATA(btree->dhandle))
- LF_SET(WT_REC_HS);
+ /* Always do update restore for in-memory database. */
else if (F_ISSET(conn, WT_CONN_IN_MEMORY))
LF_SET(WT_REC_IN_MEMORY | WT_REC_SCRUB);
+ /* For data store leaf pages, write the history to history store except for metadata. */
else if (!WT_IS_METADATA(btree->dhandle)) {
LF_SET(WT_REC_HS);
/*
- * Scrub if we're supposed to or toss it in sometimes if we are in debugging mode.
+ * Scrub and we're supposed to or toss it in sometimes if we are in debugging mode.
+ *
+ * Note that don't scrub if checkpoint is running on the tree.
*/
- if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB) ||
- (F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && __wt_random(&session->rnd) % 3 == 0))
+ if (!WT_SESSION_BTREE_SYNC(session) &&
+ (F_ISSET(cache, WT_CACHE_EVICT_SCRUB) ||
+ (F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && __wt_random(&session->rnd) % 3 == 0)))
LF_SET(WT_REC_SCRUB);
}
@@ -764,10 +794,8 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
__wt_txn_bump_snapshot(session);
else if (use_snapshot_for_app_thread)
LF_SET(WT_REC_APP_EVICTION_SNAPSHOT);
- else {
- if (!WT_SESSION_BTREE_SYNC(session))
- LF_SET(WT_REC_VISIBLE_ALL);
- }
+ else if (!WT_SESSION_BTREE_SYNC(session))
+ LF_SET(WT_REC_VISIBLE_ALL);
WT_ASSERT(session, LF_ISSET(WT_REC_VISIBLE_ALL) || F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
@@ -778,7 +806,7 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
* Reconcile the page. Force read-committed isolation level if we are using snapshots for
* eviction workers or application threads.
*/
- if (LF_ISSET(WT_REC_APP_EVICTION_SNAPSHOT) || is_eviction_thread)
+ if (is_eviction_thread || use_snapshot_for_app_thread)
WT_WITH_TXN_ISOLATION(
session, WT_ISO_READ_COMMITTED, ret = __wt_reconcile(session, ref, NULL, flags));
else
@@ -796,22 +824,8 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
* Success: assert that the page is clean or reconciliation was configured to save updates.
*/
WT_ASSERT(session,
- !__wt_page_is_modified(page) || LF_ISSET(WT_REC_HS | WT_REC_IN_MEMORY) ||
+ !__wt_page_is_modified(ref->page) || LF_ISSET(WT_REC_HS | WT_REC_IN_MEMORY) ||
WT_IS_METADATA(btree->dhandle));
- /*
- * FIXME-WT-9751
- *
- * Disable this failpoint for now - triggering it leads to a memory leak in testing. We want to
- * fix the leak and take the time to cleanup the code, so disable the failpoint in the meantime
- * to reduce testing noise.
- */
-#if 0
- /* Fail 0.1% of the time. */
- if (!closing &&
- __wt_failpoint(session, WT_TIMING_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION, 10))
- return (EBUSY);
-#endif
-
return (0);
}
diff --git a/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9751 b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9751
new file mode 100644
index 00000000000..8ff2a91c49d
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9751
@@ -0,0 +1,138 @@
+############################################
+# RUN PARAMETERS: V3
+############################################
+assert.read_timestamp=0
+assert.write_timestamp=0
+backup=0
+backup.incremental=off
+backup.incr_granularity=8115
+block_cache=0
+block_cache.cache_on_checkpoint=0
+block_cache.cache_on_writes=0
+block_cache.size=53
+btree.compression=none
+cache=336
+cache.evict_max=2
+cache.minimum=40
+checkpoint=on
+checkpoint.log_size=79
+checkpoint.wait=40
+disk.data_extend=0
+disk.direct_io=0
+disk.encryption=none
+disk.mmap=1
+disk.mmap_all=0
+format.abort=0
+format.independent_thread_rng=0
+format.major_timeout=0
+import=0
+logging=0
+logging.compression=none
+logging.file_max=481991
+logging.prealloc=0
+logging.remove=1
+ops.alter=0
+ops.compaction=0
+ops.hs_cursor=0
+ops.prepare=0
+ops.random_cursor=1
+ops.salvage=0
+ops.verify=1
+quiet=1
+runs.in_memory=0
+runs.ops=0
+runs.rows=100000
+runs.source=table
+runs.tables=3
+runs.threads=6
+runs.timer=1
+runs.type=variable-length column-store
+runs.verify_failure_dump=0
+statistics=0
+statistics.server=1
+stress.aggressive_sweep=0
+stress.checkpoint=0
+stress.checkpoint_evict_page=0
+stress.checkpoint_reserved_txnid_delay=0
+stress.checkpoint_prepare=0
+stress.evict_reposition=0
+stress.failpoint_eviction_fail_after_reconciliation=1
+stress.failpoint_hs_delete_key_from_ts=0
+stress.hs_checkpoint_delay=0
+stress.hs_search=0
+stress.hs_sweep=0
+stress.split_1=0
+stress.split_2=0
+stress.split_3=0
+stress.split_4=0
+stress.split_5=0
+stress.split_6=0
+stress.split_7=0
+transaction.implicit=0
+transaction.timestamps=1
+wiredtiger.config=off
+wiredtiger.rwlock=0
+wiredtiger.leak_memory=0
+############################################
+# TABLE PARAMETERS: table 1
+############################################
+table1.btree.dictionary=1
+table1.btree.huffman_value=0
+table1.btree.internal_key_truncation=1
+table1.btree.internal_page_max=17
+table1.btree.leaf_page_max=9
+table1.btree.memory_page_max=8
+table1.btree.repeat_data_pct=87
+table1.btree.split_pct=60
+table1.btree.value_max=1299
+table1.btree.value_min=12
+table1.disk.checksum=on
+table1.disk.firstfit=0
+table1.ops.pct.delete=77
+table1.ops.pct.insert=4
+table1.ops.pct.modify=12
+table1.ops.pct.read=1
+table1.ops.pct.write=6
+table1.ops.truncate=1
+############################################
+# TABLE PARAMETERS: table 2
+############################################
+table2.btree.dictionary=0
+table2.btree.huffman_value=0
+table2.btree.internal_key_truncation=1
+table2.btree.internal_page_max=14
+table2.btree.leaf_page_max=11
+table2.btree.memory_page_max=7
+table2.btree.repeat_data_pct=89
+table2.btree.split_pct=67
+table2.btree.value_max=3716
+table2.btree.value_min=11
+table2.disk.checksum=unencrypted
+table2.disk.firstfit=0
+table2.ops.pct.delete=20
+table2.ops.pct.insert=1
+table2.ops.pct.modify=7
+table2.ops.pct.read=72
+table2.ops.pct.write=0
+table2.ops.truncate=1
+############################################
+# TABLE PARAMETERS: table 3
+############################################
+table3.btree.dictionary=0
+table3.btree.huffman_value=0
+table3.btree.internal_key_truncation=1
+table3.btree.internal_page_max=17
+table3.btree.leaf_page_max=12
+table3.btree.memory_page_max=9
+table3.btree.repeat_data_pct=12
+table3.btree.split_pct=54
+table3.btree.value_max=2929
+table3.btree.value_min=8
+table3.disk.checksum=on
+table3.disk.firstfit=0
+table3.ops.pct.delete=34
+table3.ops.pct.insert=45
+table3.ops.pct.modify=1
+table3.ops.pct.read=0
+table3.ops.pct.write=20
+table3.ops.truncate=1