summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Chenhao Qu <chenhao.qu@mongodb.com> 2022-04-27 01:05:50 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-04-27 01:33:53 +0000
commit: b1eb4a1ca94b3bb3cff23f2a9562aa4433cce26a (patch)
tree: f2f7d6e61ca6db31bda39b4f1d3e055a78055b25 /src
parent: a1f721cd6b388b590b7f7c0ed6f04578093cd8c9 (diff)
download: mongo-b1eb4a1ca94b3bb3cff23f2a9562aa4433cce26a.tar.gz
Import wiredtiger: 406855114b113a55f3bc3b6f9a8a09b2a08cc328 from branch mongodb-master
ref: ea81029e03..406855114b for: 6.1.0-rc0 WT-9179 Timestamp verification for fast-delete information
Diffstat (limited to 'src')
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c 87
-rw-r--r-- src/third_party/wiredtiger/src/support/timestamp.c 2
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_truncate10.py 198
4 files changed, 287 insertions, 2 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 75d9df9d5df..6e99eb43941 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "ea81029e036649dcdf8f860d005d87a422e5e8d6"
+ "commit": "406855114b113a55f3bc3b6f9a8a09b2a08cc328"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index b9bc0a32086..7f6665884cb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -238,6 +238,89 @@ __verify_dsk_value_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK_KV *unpack,
}
/*
+ * __verify_dsk_addr_page_del --
+ * Verify a deleted-page address cell's page delete information.
+ *
+ * Reads the page-delete fields (unpack->page_del) and the time aggregate (unpack->ta) of the
+ * unpacked address cell. When addr is non-NULL its time aggregate is handed to
+ * __wt_time_aggregate_validate as the parent to compare against; otherwise NULL is passed.
+ * cell_num and tag are only used to build the failure messages, which report cell_num - 1
+ * (NOTE(review): presumably because the caller has already advanced its cell counter; confirm
+ * against the caller). Returns 0 when every check passes; failed checks are reported through
+ * WT_RET_VRFY / WT_RET_VRFY_RETVAL, which are defined elsewhere.
+ */
+static int
+__verify_dsk_addr_page_del(WT_SESSION_IMPL *session, WT_CELL_UNPACK_ADDR *unpack, uint32_t cell_num,
+ WT_ADDR *addr, const char *tag)
+{
+ WT_DECL_RET;
+ WT_TIME_AGGREGATE ta_with_delete;
+ char time_string[WT_TIME_STRING_SIZE];
+
+ /* The durable timestamp in the page_delete info should not be before its commit timestamp. */
+ if (unpack->page_del.durable_timestamp < unpack->page_del.timestamp)
+ WT_RET_VRFY(session,
+ "fast-delete cell %" PRIu32 " on page at %s has durable timestamp %" PRIu64
+ " before its commit timestamp %" PRIu64,
+ cell_num - 1, tag, unpack->page_del.durable_timestamp, unpack->page_del.timestamp);
+
+ /*
+ * The timestamps in the page_delete information are a global stop time for the entire page.
+ * This is not reflected in the aggregate, but is supposed to be reflected in the parent's
+ * aggregate. First check that the aggregate is consistent with being deleted at the given time.
+ *
+ * Four fields of the aggregate are compared against the page-delete values: the newest durable
+ * stop timestamp, the newest transaction, the newest stop timestamp and the newest stop
+ * transaction. The last two comparisons are skipped when the aggregate holds WT_TS_MAX or
+ * WT_TXN_MAX respectively.
+ */
+ if (unpack->ta.newest_stop_durable_ts > unpack->page_del.durable_timestamp)
+ WT_RET_VRFY(session,
+ "fast-delete cell %" PRIu32
+ " on page at %s has invalid newest durable stop time; should be <= %" PRIu64
+ "; time aggregate %s",
+ cell_num - 1, tag, unpack->page_del.durable_timestamp,
+ __wt_time_aggregate_to_string(&unpack->ta, time_string));
+ if (unpack->ta.newest_txn > unpack->page_del.txnid)
+ WT_RET_VRFY(session,
+ "fast-delete cell %" PRIu32
+ " on page at %s has invalid newest transaction; should be <= %" PRIu64
+ "; time aggregate %s",
+ cell_num - 1, tag, unpack->page_del.txnid,
+ __wt_time_aggregate_to_string(&unpack->ta, time_string));
+ if (unpack->ta.newest_stop_ts != WT_TS_MAX &&
+ unpack->ta.newest_stop_ts > unpack->page_del.timestamp)
+ WT_RET_VRFY(session,
+ "fast-delete cell %" PRIu32
+ " on page at %s has invalid newest stop time; should be <= %" PRIu64
+ "; time aggregate %s",
+ cell_num - 1, tag, unpack->page_del.timestamp,
+ __wt_time_aggregate_to_string(&unpack->ta, time_string));
+ if (unpack->ta.newest_stop_txn != WT_TXN_MAX &&
+ unpack->ta.newest_stop_txn > unpack->page_del.txnid)
+ WT_RET_VRFY(session,
+ "fast-delete cell %" PRIu32
+ " on page at %s has invalid newest stop transaction; should be <= %" PRIu64
+ "; time aggregate %s",
+ cell_num - 1, tag, unpack->page_del.txnid,
+ __wt_time_aggregate_to_string(&unpack->ta, time_string));
+
+ /*
+ * Merge this information into the aggregate and verify the results, against the parent if
+ * possible.
+ *
+ * The merge is done on a local copy: its newest_stop_durable_ts, newest_txn, newest_stop_ts and
+ * newest_stop_txn are overwritten with the page-delete values, and unpack->ta itself is not
+ * modified.
+ */
+ WT_TIME_AGGREGATE_COPY(&ta_with_delete, &unpack->ta);
+ ta_with_delete.newest_stop_durable_ts = unpack->page_del.durable_timestamp;
+ ta_with_delete.newest_txn = unpack->page_del.txnid;
+ ta_with_delete.newest_stop_ts = unpack->page_del.timestamp;
+ ta_with_delete.newest_stop_txn = unpack->page_del.txnid;
+ ret = __wt_time_aggregate_validate(session, &ta_with_delete, addr != NULL ? &addr->ta : NULL,
+ F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE));
+ if (ret != 0)
+ WT_RET_VRFY_RETVAL(session, ret,
+ "fast-delete cell %" PRIu32 " on page at %s failed adjusted timestamp validation",
+ cell_num - 1, tag);
+
+ /*
+ * The other elements of the structure are not stored on disk and are set unconditionally by the
+ * unpack code, so just assert about them. Prepared fast-truncates are not allowed to be
+ * evicted.
+ */
+ WT_ASSERT(session, unpack->page_del.prepare_state == 0);
+ WT_ASSERT(session, unpack->page_del.previous_ref_state == WT_REF_DISK);
+ WT_ASSERT(session, unpack->page_del.committed == true);
+
+ return (0);
+}
+
+/*
* __verify_row_key_order_check --
* Check key ordering for row-store pages.
*/
@@ -388,6 +471,10 @@ __verify_dsk_row_int(
break;
}
+ /* Check that any fast-delete info is consistent with the validity window. */
+ if (cell_type == WT_CELL_ADDR_DEL && F_ISSET(dsk, WT_PAGE_FT_UPDATE))
+ WT_ERR(__verify_dsk_addr_page_del(session, unpack, cell_num, addr, tag));
+
/*
* Remaining checks are for key order. If this cell isn't a key, we're done, move to the
* next cell. If this cell is an overflow item, instantiate the key and compare it with the
diff --git a/src/third_party/wiredtiger/src/support/timestamp.c b/src/third_party/wiredtiger/src/support/timestamp.c
index 0b6a008de9c..8acdbcb56b2 100644
--- a/src/third_party/wiredtiger/src/support/timestamp.c
+++ b/src/third_party/wiredtiger/src/support/timestamp.c
@@ -391,7 +391,7 @@ __time_value_validate_parent(
if (tw->prepare && !parent->prepare)
WT_TIME_VALIDATE_RET(session,
- "aggregate time window is prepared but its parent is not; time aggregate %s, parent %s",
+ "value time window is prepared but its parent is not; time window %s, parent %s",
__wt_time_window_to_string(tw, time_string[0]),
__wt_time_aggregate_to_string(parent, time_string[1]));
diff --git a/src/third_party/wiredtiger/test/suite/test_truncate10.py b/src/third_party/wiredtiger/test/suite/test_truncate10.py
new file mode 100644
index 00000000000..1270bd83d0a
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_truncate10.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wttest
+from wiredtiger import stat, WiredTigerError, wiredtiger_strerror, WT_ROLLBACK
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_truncate10.py
+#
+# Check that nothing comes unstuck if we commit a truncate with durable > commit.
+
+class test_truncate10(wttest.WiredTigerTestCase):
+ # Statistics are enabled because the test reads the fast-delete page counter from a
+ # 'statistics:' cursor.
+ conn_config = 'statistics=(all)'
+ session_config = 'isolation=snapshot'
+
+ # Hook to run using remove instead of truncate for reference. This should not alter the
+ # behavior... but may if things are broken. Disable the reference version by default as it's
+ # only useful when investigating behavior changes. This list is first in the make_scenarios
+ # call so the additional cases don't change the scenario numbering.
+ trunc_values = [
+ ('truncate', dict(trunc_with_remove=False)),
+ #('remove', dict(trunc_with_remove=True)),
+ ]
+
+ # Table formats to exercise; each entry supplies key_format, value_format and extra
+ # table configuration that the test appends to its create config.
+ format_values = [
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('column_fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
+ ]
+ # Try various stable times.
+ stable_values = [
+ ('10', dict(stable_timestamp=10)),
+ ('20', dict(stable_timestamp=20)),
+ ('25', dict(stable_timestamp=25)),
+ ('30', dict(stable_timestamp=30)),
+ ]
+ # Try both with and without an intermediate checkpoint.
+ checkpoint_values = [
+ ('checkpoint', dict(do_checkpoint=True)),
+ ('no-checkpoint', dict(do_checkpoint=False)),
+ ]
+
+ # Combine the lists above into the scenario set (presumably their cross product; see
+ # wtscenario.make_scenarios).
+ scenarios = make_scenarios(trunc_values, format_values, stable_values, checkpoint_values)
+
+ # Delete key numbers keynum1..keynum2 (inclusive) from uri inside the caller's transaction.
+ # With trunc_with_remove set, each key is removed individually with cursor.remove, stopping
+ # at the first non-zero result; otherwise a single session.truncate call bounded by two
+ # cursors is used. Returns the result of the underlying call, or WT_ROLLBACK when a
+ # WiredTigerError carrying the rollback message is raised; any other WiredTigerError is
+ # re-raised.
+ def truncate(self, uri, make_key, keynum1, keynum2):
+ if self.trunc_with_remove:
+ cursor = self.session.open_cursor(uri)
+ err = 0
+ for k in range(keynum1, keynum2 + 1):
+ # NOTE(review): this path passes k to set_key directly, while the truncate path
+ # below maps key numbers through make_key; confirm that is intended.
+ cursor.set_key(k)
+ try:
+ err = cursor.remove()
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ if err != 0:
+ break
+ cursor.close()
+ else:
+ lo_cursor = self.session.open_cursor(uri)
+ hi_cursor = self.session.open_cursor(uri)
+ lo_cursor.set_key(make_key(keynum1))
+ hi_cursor.set_key(make_key(keynum2))
+ try:
+ err = self.session.truncate(None, lo_cursor, hi_cursor, None)
+ except WiredTigerError as e:
+ if wiredtiger_strerror(WT_ROLLBACK) in str(e):
+ err = WT_ROLLBACK
+ else:
+ raise e
+ lo_cursor.close()
+ hi_cursor.close()
+ return err
+
+ # Scan the whole table at read timestamp ts and verify its contents. For the '8t' value
+ # format a value of 0 is counted separately (zseen); every other value must equal `value`
+ # and is counted in seen. Asserts seen == nrows and zseen == nzeros (0 expected for formats
+ # other than '8t'). The read transaction is rolled back and the cursor closed afterwards.
+ # make_key is accepted but not used by this method.
+ def check(self, uri, make_key, nrows, nzeros, value, ts):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
+ seen = 0
+ zseen = 0
+ for k, v in cursor:
+ if self.value_format == '8t' and v == 0:
+ zseen += 1
+ else:
+ self.assertEqual(v, value)
+ seen += 1
+ self.assertEqual(seen, nrows)
+ self.assertEqual(zseen, nzeros if self.value_format == '8t' else 0)
+ self.session.rollback_transaction()
+ cursor.close()
+
+ # Write nrows rows at timestamp 10, reopen the connection, then truncate the middle half of
+ # the key range in a prepared transaction (prepare 20, commit 25, durable 30). After
+ # optionally advancing stable and checkpointing, read the table at timestamps 10, 20, 25 and
+ # 30 and verify what is visible at each.
+ def test_truncate10(self):
+ nrows = 10000
+
+ uri = "table:truncate10"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config='log=(enabled=false)' + self.extraconfig)
+ ds.populate()
+
+ # value_b is assigned but not used anywhere in this test.
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+
+ # Pin oldest and stable timestamps to 1.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
+ ',stable_timestamp=' + self.timestamp_str(1))
+
+ # Write a bunch of data at time 10.
+ cursor = self.session.open_cursor(ds.uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value_a
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+
+ # Mark it stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+
+ # Reopen the connection so nothing is in memory and we can fast-truncate.
+ self.reopen_conn()
+
+ # Truncate the data at time 25, but prepare at 20 and make durable 30.
+ # The truncated range is keys nrows/4 + 1 through nrows/4 + nrows/2, i.e. nrows/2 keys.
+ self.session.begin_transaction()
+ err = self.truncate(ds.uri, ds.key, nrows // 4 + 1, nrows // 4 + nrows // 2)
+ self.assertEqual(err, 0)
+ self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+ self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(25))
+ self.session.commit_transaction('durable_timestamp=' + self.timestamp_str(30))
+
+ # Make sure we did at least one fast-delete. For columns, there's no fast-delete
+ # support (yet) so assert we didn't.
+ # NOTE(review): stat_cursor is not explicitly closed anywhere in this method.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2]
+ if self.key_format == 'r':
+ self.assertEqual(fastdelete_pages, 0)
+ else:
+ self.assertGreater(fastdelete_pages, 0)
+
+ # Optionally advance stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.stable_timestamp))
+
+ # Optionally checkpoint.
+ if self.do_checkpoint:
+ self.session.checkpoint()
+
+ # Validate the data.
+
+ # At time 10 we should see all value_a.
+ self.check(ds.uri, ds.key, nrows, 0, value_a, 10)
+
+ # At time 20 we should still see all value_a.
+ self.check(ds.uri, ds.key, nrows, 0, value_a, 20)
+
+ # At time 25 the truncate is visible: we should see only half value_a, and for FLCS,
+ # half zeros.
+ # (Note that reading between commit and durable can be problematic, but for
+ # now at least it remains permitted.)
+ self.check(ds.uri, ds.key, nrows // 2, nrows // 2, value_a, 25)
+
+ # At time 30 we should also see half value_a, and for FLCS, half zeros.
+ self.check(ds.uri, ds.key, nrows // 2, nrows // 2, value_a, 30)
+
+ # Move the stable timestamp forward before exiting so we don't waste time rolling
+ # back the changes during shutdown.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(50))
+
+if __name__ == '__main__':
+ wttest.run()