diff options
author | Luke Chen <luke.chen@mongodb.com> | 2022-08-08 11:53:51 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-08-08 02:21:40 +0000 |
commit | b02dbf2c7011dfd8fca95d870d69b4f0f9af699a (patch) | |
tree | 2990174c9f86ac2f7f2976286061003142c5b184 | |
parent | f0febdf5b71991bc4d78547c4364bd30a53747ab (diff) | |
download | mongo-b02dbf2c7011dfd8fca95d870d69b4f0f9af699a.tar.gz |
Import wiredtiger: 399edaeb4cd9265746de06b4bc59dd3b047d8106 from branch mongodb-5.0
ref: ccff98033f..399edaeb4c
for: 5.0.11
WT-8847 Add tombstone to WT_SAVE_UPD to truncate the update list upon page restore (#7703)
5 files changed, 145 insertions, 8 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 579bc8c83b4..82cc50b973c 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "ccff98033f53ebe5dd4bd3c097a1ba386045ebe2" + "commit": "399edaeb4cd9265746de06b4bc59dd3b047d8106" } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 54fc452bd49..107f99b7285 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1357,7 +1357,7 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_SAVE_UPD *supd; - WT_UPDATE *prev_onpage, *upd; + WT_UPDATE *prev_onpage, *upd, *tmp; uint64_t recno; uint32_t i, slot; bool prepare; @@ -1440,19 +1440,39 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT if (supd->onpage_upd != NULL && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && orig->type != WT_PAGE_COL_FIX) { /* + * If there is an on-page tombstone we need to remove it as well while performing update + * restore eviction. + */ + tmp = supd->onpage_tombstone != NULL ? supd->onpage_tombstone : supd->onpage_upd; + + /* * We have decided to restore this update chain so it must have newer updates than the * onpage value on it. */ - WT_ASSERT(session, upd != supd->onpage_upd); + WT_ASSERT(session, upd != tmp); + WT_ASSERT(session, F_ISSET(tmp, WT_UPDATE_DS)); + /* * Move the pointer to the position before the onpage value and truncate all the updates * starting from the onpage value. */ - for (prev_onpage = upd; - prev_onpage->next != NULL && prev_onpage->next != supd->onpage_upd; + for (prev_onpage = upd; prev_onpage->next != NULL && prev_onpage->next != tmp; prev_onpage = prev_onpage->next) ; - WT_ASSERT(session, prev_onpage->next == supd->onpage_upd); + WT_ASSERT(session, prev_onpage->next == tmp); +#ifdef HAVE_DIAGNOSTIC + /* + * During update restore eviction we remove anything older than the on-page update, + * including the on-page update. However it is possible a tombstone is also written as + * the stop time of the on-page value. To handle this we also need to remove the + * tombstone from the update chain. + * + * This assertion checks that there aren't any unexpected updates between that tombstone + * and the subsequent value which both make up the on-page value. + */ + for (; tmp != NULL && tmp != supd->onpage_upd; tmp = tmp->next) + WT_ASSERT(session, tmp == supd->onpage_tombstone || tmp->txnid == WT_TXN_ABORTED); +#endif prev_onpage->next = NULL; } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 8b4ee968dc4..b0cea3be6ff 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -255,6 +255,7 @@ struct __wt_save_upd { WT_INSERT *ins; /* Insert list reference */ WT_ROW *rip; /* Original on-page reference */ WT_UPDATE *onpage_upd; + WT_UPDATE *onpage_tombstone; bool restore; /* Whether to restore this saved update chain */ }; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index ba947c13ac4..2deb26f587f 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -26,7 +26,7 @@ __rec_update_stable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd) */ static inline int __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, - WT_UPDATE *onpage_upd, bool supd_restore, size_t upd_memsize) + WT_UPDATE *onpage_upd, WT_UPDATE *tombstone, bool supd_restore, size_t upd_memsize) { WT_SAVE_UPD *supd; @@ -44,6 +44,7 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ supd->ins = ins; supd->rip = rip; supd->onpage_upd = onpage_upd; + supd->onpage_tombstone = tombstone; supd->restore = supd_restore; ++r->supd_next; r->supd_memsize += upd_memsize; @@ -757,7 +758,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W (has_newer_updates || F_ISSET(S2C(session), WT_CONN_IN_MEMORY) || page->type == WT_PAGE_COL_FIX); - WT_ERR(__rec_update_save(session, r, ins, rip, onpage_upd, supd_restore, upd_memsize)); + WT_ERR(__rec_update_save( + session, r, ins, rip, onpage_upd, tombstone, supd_restore, upd_memsize)); /* * Mark the selected update (and potentially the tombstone preceding it) as being destined diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py new file mode 100644 index 00000000000..850dc145fe8 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +from helper import simulate_crash_restart +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_rollback_to_stable32.py +# Test that update restore eviction correctly removes an on-disk +# tombstone. Previously it would trigger an assertion in reconciliation. +class test_rollback_to_stable32(test_rollback_to_stable_base): + + format_values = [ + ('column', dict(key_format='r', value_format='S')), + ('row_integer', dict(key_format='i', value_format='S')), + ] + + prepare_values = [ + ('no_prepare', dict(prepare=False)), + ('prepare', dict(prepare=True)) + ] + + scenarios = make_scenarios(format_values, prepare_values) + + def conn_config(self): + config = 'cache_size=100MB,statistics=(all)' + return config + + def test_rollback_to_stable_with_update_restore_evict(self): + nrows = 1000 + # Create a table. + uri = "table:rollback_to_stable32" + ds = SimpleDataSet(self, uri, 0, key_format=self.key_format, value_format=self.value_format, + config='split_pct=50') + ds.populate() + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + value_c = 99 + else: + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + value_c = "ccccc" * 100 + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) + + ',stable_timestamp=' + self.timestamp_str(10)) + + # Perform several updates. + self.large_updates(uri, value_a, ds, nrows, self.prepare, 20) + # Perform several updates. + self.large_updates(uri, value_b, ds, nrows, self.prepare, 30) + # Perform several removes. + self.large_removes(uri, ds, nrows, self.prepare, 40) + # Pin stable to timestamp 50 if prepare otherwise 40. + if self.prepare: + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(50)) + else: + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40)) + # Perform several updates and checkpoint. + self.large_updates(uri, value_c, ds, nrows, self.prepare, 60) + self.session.checkpoint() + + # Verify data is visible and correct. + # (In FLCS, the removed rows should read back as zero.) + self.check(value_a, uri, nrows, 21 if self.prepare else 20) + self.check(None, uri, 0, 41 if self.prepare else 40) + self.check(value_c, uri, nrows, 61 if self.prepare else 60) + self.evict_cursor(uri, nrows, value_c) + + self.conn.rollback_to_stable() + + self.conn.reconfigure("debug_mode=(eviction=false)") + + # Perform several updates and checkpoint. + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + for i in range(1, nrows + 1): + cursor[ds.key(i)] = value_c + cursor.close() + self.session.rollback_transaction() + self.session.breakpoint() + # Perform several updates and checkpoint. + self.large_updates(uri, value_c, ds, nrows, self.prepare, 60) + self.evict_cursor(uri, nrows, value_c) + self.check(value_b, uri, nrows, 31 if self.prepare else 30) + self.conn.close() |