summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2022-09-20 14:57:24 +1000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-09-20 05:19:14 +0000
commit b598ac1495318c0594a6607666bd33d38c5e82f5 (patch)
tree 891e89d96d67577793bdf0d51226f1f69cf9ec24
parent ed0fa50de855927d652593345cfcb9dda8544625 (diff)
download mongo-b598ac1495318c0594a6607666bd33d38c5e82f5.tar.gz
Import wiredtiger: ec742d6807b943cd6f2baf1a55853d296eb5b5c6 from branch mongodb-4.4
ref: b28742aed0..ec742d6807
for: 4.4.17

WT-9500 Fix RTS to use cell time window instead of key/value timestamps (#8073) (v4.4 Backport) (#8225)
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 72
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_rollback_to_stable37.py | 114
3 files changed, 155 insertions, 33 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index fd5b609dfb5..d98b5fcebb2 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "b28742aed07482b945164b755ebc3967d5e03851"
+ "commit": "ec742d6807b943cd6f2baf1a55853d296eb5b5c6"
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 478b07f05b3..e6ce11062dd 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -27,9 +27,7 @@ __rollback_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts)
WT_CURSOR *hs_cursor;
WT_DECL_ITEM(hs_key);
WT_DECL_RET;
- wt_timestamp_t hs_start_ts;
- uint64_t hs_counter;
- uint32_t hs_btree_id;
+ WT_TIME_WINDOW *hs_tw;
/* Open a history store table cursor. */
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
@@ -51,12 +49,14 @@ __rollback_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts)
hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
ret = __wt_curhs_search_near_before(session, hs_cursor);
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
- if (hs_start_ts < ts)
+ /* Retrieve the time window from the history cursor. */
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+ if (hs_tw->start_ts < ts)
break;
+
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
- if (hs_start_ts == ts)
+ if (hs_tw->start_ts == ts)
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
else
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
@@ -399,7 +399,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip,
* become obsolete according to the checkpoint.
*/
if (__rollback_txn_visible_id(session, hs_tw->stop_txn) &&
- hs_stop_durable_ts <= pinned_ts) {
+ hs_tw->durable_stop_ts <= pinned_ts) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store stop is obsolete with time window: %s and pinned timestamp: %s",
__wt_time_window_to_string(hs_tw, tw_string),
@@ -421,7 +421,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip,
* the written proper timestamp, so comparing against it with history store shouldn't have
* any problem.
*/
- if (hs_start_ts <= unpack->tw.start_ts || unpack->tw.prepare) {
+ if (hs_tw->start_ts <= unpack->tw.start_ts || unpack->tw.prepare) {
if (type == WT_UPDATE_MODIFY)
WT_ERR(__wt_modify_apply_item(
session, S2BT(session)->value_format, full_value, hs_value->data));
@@ -431,11 +431,9 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip,
}
} else
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "history store update more recent than on-disk update with start timestamp: %s,"
- " durable timestamp: %s, stop timestamp: %s and type: %" PRIu8,
- __wt_timestamp_to_string(hs_start_ts, ts_string[0]),
- __wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
- __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), type);
+ "history store update more recent than on-disk update with time window: %s and type: "
+ "%" PRIu8,
+ __wt_time_window_to_string(hs_tw, tw_string), type);
/*
* Verify the history store timestamps are in order. The start timestamp may be equal to the
@@ -467,32 +465,38 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip,
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_stop_older_than_newer_start);
/*
+ * Validate the timestamps in the key and the cell are the same. This must be validated only
+ * after verifying its stop time window is not globally visible. The start timestamps of
+ * the time window are cleared when they are globally visible and there will be no stop
+ * timestamp in the history store whenever a prepared update is written to the data store.
+ */
+ WT_ASSERT(session,
+ (hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == hs_start_ts) &&
+ (hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == hs_durable_ts) &&
+ ((hs_tw->durable_stop_ts == 0 && hs_stop_durable_ts == WT_TS_MAX) ||
+ hs_tw->durable_stop_ts == hs_stop_durable_ts));
+
+ /*
* Stop processing when we find a stable update according to the given timestamp and
* transaction id.
*/
if (__rollback_txn_visible_id(session, hs_tw->start_txn) &&
- hs_durable_ts <= rollback_timestamp) {
+ hs_tw->durable_start_ts <= rollback_timestamp) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "history store update valid with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s, stable timestamp: %s, txnid: %" PRIu64 " and type: %" PRIu8,
- __wt_timestamp_to_string(hs_start_ts, ts_string[0]),
- __wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
- __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn, type);
+ "history store update valid with time window: %s, type: %" PRIu8
+ " and stable timestamp: %s",
+ __wt_time_window_to_string(hs_tw, tw_string), type,
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[0]));
WT_ASSERT(session, unpack->tw.prepare || hs_tw->start_ts <= unpack->tw.start_ts);
valid_update_found = true;
break;
}
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "history store update aborted with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s, stable timestamp: %s, start txnid: %" PRIu64 ", stop txnid: %" PRIu64
- " and type: %" PRIu8,
- __wt_timestamp_to_string(hs_start_ts, ts_string[0]),
- __wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
- __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn,
- hs_tw->stop_txn, type);
+ "history store update aborted with time window: %s, type: %" PRIu8
+ " and stable timestamp: %s",
+ __wt_time_window_to_string(hs_tw, tw_string), type,
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
/*
* Start time point of the current record may be used as stop time point of the previous
@@ -549,7 +553,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip,
* timestamp and txnid, we need to restore that as well.
*/
if (__rollback_txn_visible_id(session, hs_tw->stop_txn) &&
- hs_stop_durable_ts <= rollback_timestamp) {
+ hs_tw->durable_stop_ts <= rollback_timestamp) {
/*
* The restoring tombstone timestamp must be zero or less than previous update start
* timestamp or the on-disk update is an out of order prepared.
@@ -1283,10 +1287,11 @@ __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_
WT_CURSOR *hs_cursor;
WT_DECL_ITEM(hs_key);
WT_DECL_RET;
+ WT_TIME_WINDOW *hs_tw;
wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
- char ts_string[WT_TS_INT_STRING_SIZE];
+ char tw_string[WT_TIME_STRING_SIZE];
hs_cursor = NULL;
@@ -1305,9 +1310,12 @@ __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_
/* We shouldn't cross the btree search space. */
WT_ASSERT(session, btree_id == hs_btree_id);
+ /* Retrieve the time window from the history cursor. */
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "rollback to stable history store cleanup of update with start timestamp: %s",
- __wt_timestamp_to_string(hs_start_ts, ts_string));
+ "rollback to stable history store cleanup of update with time window: %s",
+ __wt_time_window_to_string(hs_tw, tw_string));
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable37.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable37.py
new file mode 100644
index 00000000000..dc8fd321df9
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable37.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from helper import simulate_crash_restart
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+
+# test_rollback_to_stable37.py
+# Test that rollback to stable restores the proper stable update from the history store when an
+# update without a timestamp has rewritten the history store data.
+class test_rollback_to_stable37(test_rollback_to_stable_base):
+ conn_config = 'cache_size=1GB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=false)'
+
+ format_values = [
+ ('column', dict(key_format='r', value_format='S')),
+ ('row_integer', dict(key_format='i', value_format='S')),
+ ]
+
+ scenarios = make_scenarios(format_values)
+
+ def test_rollback_to_stable(self):
+ uri = 'table:test_rollback_to_stable37'
+ nrows = 1000
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ value_d = 100
+ else:
+ value_a = 'a' * 10
+ value_b = 'b' * 10
+ value_c = 'c' * 10
+ value_d = 'd' * 10
+
+ # Create our table.
+ ds = SimpleDataSet(self, uri, 0, key_format=self.key_format, value_format=self.value_format)
+ ds.populate()
+
+ # Pin oldest and stable to timestamp 1.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
+ ',stable_timestamp=' + self.timestamp_str(1))
+
+ # Insert 300 updates to the same key.
+ for i in range (20, 320):
+ if self.value_format == '8t':
+ self.large_updates(uri, value_a, ds, nrows, False, i)
+ else:
+ self.large_updates(uri, value_a + str(i), ds, nrows, False, i)
+
+ old_reader_session = self.conn.open_session()
+ old_reader_session.begin_transaction('read_timestamp=' + self.timestamp_str(10))
+
+ self.large_updates(uri, value_b, ds, nrows, False, 2000)
+ self.check(value_b, uri, nrows,2000)
+
+ self.evict_cursor(uri, nrows, value_b)
+
+ # Insert update without a timestamp.
+ self.large_updates(uri, value_c, ds, nrows, False, 0)
+ self.check(value_c, uri, nrows, 0)
+
+ self.evict_cursor(uri, nrows, value_c)
+
+ self.large_updates(uri, value_d, ds, nrows, False, 3000)
+ self.check(value_d, uri, nrows, 3000)
+
+ old_reader_session.rollback_transaction()
+ self.session.checkpoint()
+
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(2000))
+ self.session.checkpoint()
+
+ self.conn.rollback_to_stable()
+
+ self.check(value_c, uri, nrows, 1000)
+ self.check(value_c, uri, nrows, 2000)
+ self.check(value_c, uri, nrows, 3000)
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ stat_cursor.close()
+
+ self.assertEqual(keys_removed, 0)
+
+if __name__ == '__main__':
+ wttest.run()