From b1eb4a1ca94b3bb3cff23f2a9562aa4433cce26a Mon Sep 17 00:00:00 2001 From: Chenhao Qu Date: Wed, 27 Apr 2022 01:05:50 +0000 Subject: Import wiredtiger: 406855114b113a55f3bc3b6f9a8a09b2a08cc328 from branch mongodb-master ref: ea81029e03..406855114b for: 6.1.0-rc0 WT-9179 Timestamp verification for fast-delete information --- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | 87 +++++++++ src/third_party/wiredtiger/src/support/timestamp.c | 2 +- .../wiredtiger/test/suite/test_truncate10.py | 198 +++++++++++++++++++++ 4 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 src/third_party/wiredtiger/test/suite/test_truncate10.py (limited to 'src') diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 75d9df9d5df..6e99eb43941 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "ea81029e036649dcdf8f860d005d87a422e5e8d6" + "commit": "406855114b113a55f3bc3b6f9a8a09b2a08cc328" } diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index b9bc0a32086..7f6665884cb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -237,6 +237,89 @@ __verify_dsk_value_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK_KV *unpack, cell_num - 1, tag); } +/* + * __verify_dsk_addr_page_del -- + * Verify a deleted-page address cell's page delete information. + */ +static int +__verify_dsk_addr_page_del(WT_SESSION_IMPL *session, WT_CELL_UNPACK_ADDR *unpack, uint32_t cell_num, + WT_ADDR *addr, const char *tag) +{ + WT_DECL_RET; + WT_TIME_AGGREGATE ta_with_delete; + char time_string[WT_TIME_STRING_SIZE]; + + /* The durable timestamp in the page_delete info should not be before its commit timestamp. */ + if (unpack->page_del.durable_timestamp < unpack->page_del.timestamp) + WT_RET_VRFY(session, + "fast-delete cell %" PRIu32 " on page at %s has durable timestamp %" PRIu64 + " before its commit timestamp %" PRIu64, + cell_num - 1, tag, unpack->page_del.durable_timestamp, unpack->page_del.timestamp); + + /* + * The timestamps in the page_delete information are a global stop time for the entire page. + * This is not reflected in the aggregate, but is supposed to be reflected in the parent's + * aggregate. First check that the aggregate is consistent with being deleted at the given time. + */ + if (unpack->ta.newest_stop_durable_ts > unpack->page_del.durable_timestamp) + WT_RET_VRFY(session, + "fast-delete cell %" PRIu32 + " on page at %s has invalid newest durable stop time; should be <= %" PRIu64 + "; time aggregate %s", + cell_num - 1, tag, unpack->page_del.durable_timestamp, + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.newest_txn > unpack->page_del.txnid) + WT_RET_VRFY(session, + "fast-delete cell %" PRIu32 + " on page at %s has invalid newest transaction; should be <= %" PRIu64 + "; time aggregate %s", + cell_num - 1, tag, unpack->page_del.txnid, + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.newest_stop_ts != WT_TS_MAX && + unpack->ta.newest_stop_ts > unpack->page_del.timestamp) + WT_RET_VRFY(session, + "fast-delete cell %" PRIu32 + " on page at %s has invalid newest stop time; should be <= %" PRIu64 + "; time aggregate %s", + cell_num - 1, tag, unpack->page_del.timestamp, + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.newest_stop_txn != WT_TXN_MAX && + unpack->ta.newest_stop_txn > unpack->page_del.txnid) + WT_RET_VRFY(session, + "fast-delete cell %" PRIu32 + " on page at %s has invalid newest stop transaction; should be <= %" PRIu64 + "; time aggregate %s", + cell_num - 1, tag, unpack->page_del.txnid, + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + + /* + * Merge this information into the aggregate and verify the results, against the parent if + * possible. + */ + WT_TIME_AGGREGATE_COPY(&ta_with_delete, &unpack->ta); + ta_with_delete.newest_stop_durable_ts = unpack->page_del.durable_timestamp; + ta_with_delete.newest_txn = unpack->page_del.txnid; + ta_with_delete.newest_stop_ts = unpack->page_del.timestamp; + ta_with_delete.newest_stop_txn = unpack->page_del.txnid; + ret = __wt_time_aggregate_validate(session, &ta_with_delete, addr != NULL ? &addr->ta : NULL, + F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)); + if (ret != 0) + WT_RET_VRFY_RETVAL(session, ret, + "fast-delete cell %" PRIu32 " on page at %s failed adjusted timestamp validation", + cell_num - 1, tag); + + /* + * The other elements of the structure are not stored on disk and are set unconditionally by the + * unpack code, so just assert about them. Prepared fast-truncates are not allowed to be + * evicted. + */ + WT_ASSERT(session, unpack->page_del.prepare_state == 0); + WT_ASSERT(session, unpack->page_del.previous_ref_state == WT_REF_DISK); + WT_ASSERT(session, unpack->page_del.committed == true); + + return (0); +} + /* * __verify_row_key_order_check -- * Check key ordering for row-store pages. @@ -388,6 +471,10 @@ __verify_dsk_row_int( break; } + /* Check that any fast-delete info is consistent with the validity window. */ + if (cell_type == WT_CELL_ADDR_DEL && F_ISSET(dsk, WT_PAGE_FT_UPDATE)) + WT_ERR(__verify_dsk_addr_page_del(session, unpack, cell_num, addr, tag)); + /* * Remaining checks are for key order. If this cell isn't a key, we're done, move to the * next cell. If this cell is an overflow item, instantiate the key and compare it with the diff --git a/src/third_party/wiredtiger/src/support/timestamp.c b/src/third_party/wiredtiger/src/support/timestamp.c index 0b6a008de9c..8acdbcb56b2 100644 --- a/src/third_party/wiredtiger/src/support/timestamp.c +++ b/src/third_party/wiredtiger/src/support/timestamp.c @@ -391,7 +391,7 @@ __time_value_validate_parent( if (tw->prepare && !parent->prepare) WT_TIME_VALIDATE_RET(session, - "aggregate time window is prepared but its parent is not; time aggregate %s, parent %s", + "value time window is prepared but its parent is not; time window %s, parent %s", __wt_time_window_to_string(tw, time_string[0]), __wt_time_aggregate_to_string(parent, time_string[1])); diff --git a/src/third_party/wiredtiger/test/suite/test_truncate10.py b/src/third_party/wiredtiger/test/suite/test_truncate10.py new file mode 100644 index 00000000000..1270bd83d0a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_truncate10.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wttest +from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_ROLLBACK +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_truncate10.py +# +# Check that nothing comes unstuck if we commit a truncate with durable > commit. + +class test_truncate10(wttest.WiredTigerTestCase): + conn_config = 'statistics=(all)' + session_config = 'isolation=snapshot' + + # Hook to run using remove instead of truncate for reference. This should not alter the + # behavior... but may if things are broken. Disable the reference version by default as it's + # only useful when investigating behavior changes. This list is first in the make_scenarios + # call so the additional cases don't change the scenario numbering. + trunc_values = [ + ('truncate', dict(trunc_with_remove=False)), + #('remove', dict(trunc_with_remove=True)), + ] + + format_values = [ + ('column', dict(key_format='r', value_format='S', extraconfig='')), + ('column_fix', dict(key_format='r', value_format='8t', + extraconfig=',allocation_size=512,leaf_page_max=512')), + ('integer_row', dict(key_format='i', value_format='S', extraconfig='')), + ] + # Try various stable times. + stable_values = [ + ('10', dict(stable_timestamp=10)), + ('20', dict(stable_timestamp=20)), + ('25', dict(stable_timestamp=25)), + ('30', dict(stable_timestamp=30)), + ] + # Try both with and without an intermediate checkpoint. + checkpoint_values = [ + ('checkpoint', dict(do_checkpoint=True)), + ('no-checkpoint', dict(do_checkpoint=False)), + ] + + scenarios = make_scenarios(trunc_values, format_values, stable_values, checkpoint_values) + + def truncate(self, uri, make_key, keynum1, keynum2): + if self.trunc_with_remove: + cursor = self.session.open_cursor(uri) + err = 0 + for k in range(keynum1, keynum2 + 1): + cursor.set_key(k) + try: + err = cursor.remove() + except WiredTigerError as e: + if wiredtiger_strerror(WT_ROLLBACK) in str(e): + err = WT_ROLLBACK + else: + raise e + if err != 0: + break + cursor.close() + else: + lo_cursor = self.session.open_cursor(uri) + hi_cursor = self.session.open_cursor(uri) + lo_cursor.set_key(make_key(keynum1)) + hi_cursor.set_key(make_key(keynum2)) + try: + err = self.session.truncate(None, lo_cursor, hi_cursor, None) + except WiredTigerError as e: + if wiredtiger_strerror(WT_ROLLBACK) in str(e): + err = WT_ROLLBACK + else: + raise e + lo_cursor.close() + hi_cursor.close() + return err + + def check(self, uri, make_key, nrows, nzeros, value, ts): + cursor = self.session.open_cursor(uri) + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts)) + seen = 0 + zseen = 0 + for k, v in cursor: + if self.value_format == '8t' and v == 0: + zseen += 1 + else: + self.assertEqual(v, value) + seen += 1 + self.assertEqual(seen, nrows) + self.assertEqual(zseen, nzeros if self.value_format == '8t' else 0) + self.session.rollback_transaction() + cursor.close() + + def test_truncate10(self): + nrows = 10000 + + uri = "table:truncate10" + ds = SimpleDataSet( + self, uri, 0, key_format=self.key_format, value_format=self.value_format, + config='log=(enabled=false)' + self.extraconfig) + ds.populate() + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + else: + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + + # Pin oldest and stable timestamps to 1. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) + + ',stable_timestamp=' + self.timestamp_str(1)) + + # Write a bunch of data at time 10. + cursor = self.session.open_cursor(ds.uri) + self.session.begin_transaction() + for i in range(1, nrows + 1): + cursor[ds.key(i)] = value_a + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + + # Mark it stable. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10)) + + # Reopen the connection so nothing is in memory and we can fast-truncate. + self.reopen_conn() + + # Truncate the data at time 25, but prepare at 20 and make durable 30. + self.session.begin_transaction() + err = self.truncate(ds.uri, ds.key, nrows // 4 + 1, nrows // 4 + nrows // 2) + self.assertEqual(err, 0) + self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20)) + self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(25)) + self.session.commit_transaction('durable_timestamp=' + self.timestamp_str(30)) + + # Make sure we did at least one fast-delete. For columns, there's no fast-delete + # support (yet) so assert we didn't. + stat_cursor = self.session.open_cursor('statistics:', None, None) + fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2] + if self.key_format == 'r': + self.assertEqual(fastdelete_pages, 0) + else: + self.assertGreater(fastdelete_pages, 0) + + # Optionally advance stable. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.stable_timestamp)) + + # Optionally checkpoint. + if self.do_checkpoint: + self.session.checkpoint() + + # Validate the data. + + # At time 10 we should see all value_a. + self.check(ds.uri, ds.key, nrows, 0, value_a, 10) + + # At time 20 we should still see all value_a. + self.check(ds.uri, ds.key, nrows, 0, value_a, 20) + + # At time 25 we should still see half value_a, and for FLCS, half zeros. + # (Note that reading between commit and durable can be problematic, but for + # now at least it remains permitted.) + self.check(ds.uri, ds.key, nrows // 2, nrows // 2, value_a, 25) + + # At time 30 we should also see half value_a, and for FLCS, half zeros. + self.check(ds.uri, ds.key, nrows // 2, nrows // 2, value_a, 30) + + # Move the stable timestamp forward before exiting so we don't waste time rolling + # back the changes during shutdown. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(50)) + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1