diff options
author | Luke Chen <luke.chen@mongodb.com> | 2022-09-20 14:57:26 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-09-20 05:19:14 +0000 |
commit | 88049cab6697c2e830a62b45747018f5bd7b7d2b (patch) | |
tree | 2338afb16b62765e397cad00eb5fc6e2be39e227 /src | |
parent | b598ac1495318c0594a6607666bd33d38c5e82f5 (diff) | |
download | mongo-88049cab6697c2e830a62b45747018f5bd7b7d2b.tar.gz |
Import wiredtiger: 54336f44ebf1d6012bce4c9105b4f0dc93fb2bd0 from branch mongodb-4.4 (tag: r4.4.17-rc1)
ref: ec742d6807..54336f44eb
for: 4.4.17
WT-9870 Fix the global time window state before performing rollback to stable (#8289)
Diffstat (limited to 'src')
4 files changed, 217 insertions, 20 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index d98b5fcebb2..701ab11e416 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "ec742d6807b943cd6f2baf1a55853d296eb5b5c6" + "commit": "54336f44ebf1d6012bce4c9105b4f0dc93fb2bd0" } diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 84b7e43a579..02d7d5e7a05 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -430,7 +430,8 @@ __recovery_set_checkpoint_timestamp(WT_RECOVERY *r) /* * __recovery_set_oldest_timestamp -- - * Set the oldest timestamp as retrieved from the metadata file. + * Set the oldest timestamp as retrieved from the metadata file. Setting the oldest timestamp + * doesn't automatically set the pinned timestamp. */ static int __recovery_set_oldest_timestamp(WT_RECOVERY *r) @@ -574,6 +575,44 @@ err: } /* + * __recovery_txn_setup_initial_state -- + * Setup the transaction initial state required by rollback to stable. + */ +static int +__recovery_txn_setup_initial_state(WT_SESSION_IMPL *session, WT_RECOVERY *r) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + WT_RET(__recovery_set_checkpoint_snapshot(session)); + + /* + * Set the checkpoint timestamp and oldest timestamp retrieved from the checkpoint metadata. + * These are the stable timestamp and oldest timestamps of the last successful checkpoint. + */ + WT_RET(__recovery_set_checkpoint_timestamp(r)); + WT_RET(__recovery_set_oldest_timestamp(r)); + + /* + * Now that timestamps extracted from the checkpoint metadata have been configured, configure + * the pinned timestamp. 
+ */ + WT_RET(__wt_txn_update_pinned_timestamp(session, true)); + + WT_ASSERT(session, + conn->txn_global.has_stable_timestamp == false && + conn->txn_global.stable_timestamp == WT_TS_NONE); + + /* Set the stable timestamp from recovery timestamp. */ + conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp; + if (conn->txn_global.stable_timestamp != WT_TS_NONE) + conn->txn_global.has_stable_timestamp = true; + + return (0); +} + +/* * __recovery_setup_file -- * Set up the recovery slot for a file, track the largest file ID, and update the base write gen * based on the file's configuration. @@ -996,10 +1035,7 @@ done: "Upgrading from a WiredTiger version 10.0.0 database that was not shutdown cleanly is " "not allowed. Perform a clean shutdown on version 10.0.0 and then upgrade."); #endif - - WT_ERR(__recovery_set_checkpoint_timestamp(&r)); - WT_ERR(__recovery_set_oldest_timestamp(&r)); - WT_ERR(__recovery_set_checkpoint_snapshot(session)); + WT_ERR(__recovery_txn_setup_initial_state(session, &r)); /* * Set the history store file size as it may already exist after a restart. @@ -1022,20 +1058,6 @@ done: eviction_started = true; } - WT_ASSERT(session, - conn->txn_global.has_stable_timestamp == false && - conn->txn_global.stable_timestamp == WT_TS_NONE); - - /* - * Set the stable timestamp from recovery timestamp and process the trees for rollback to - * stable. 
- */ - conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp; - conn->txn_global.has_stable_timestamp = false; - - if (conn->txn_global.recovery_timestamp != WT_TS_NONE) - conn->txn_global.has_stable_timestamp = true; - __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RTS, "performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: " "%s", diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index e6ce11062dd..042bdadbbd2 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -1690,6 +1690,17 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt) WT_ERR(__rollback_to_stable_check(session)); /* + * Update the global time window state to have consistent view from global visibility rules for + * the rollback to stable to bring back the database into a consistent state. + * + * As part of the below function call, the oldest transaction id and pinned timestamps are + * updated. + */ + WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); + + WT_ASSERT(session, txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp); + + /* * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even * though the stable timestamp isn't supposed to be updated while rolling back, accessing it * without a lock would violate protocol. diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py new file mode 100755 index 00000000000..07df57c2966 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. 
+# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, shutil, time +from helper import copy_wiredtiger_home, simulate_crash_restart +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_rollback_to_stable40.py +# Test the rollback to stable operation performs as expected following a server crash +# and recovery. Verify that the on-disk value is replaced by the correct value from +# the history store. 
+class test_rollback_to_stable40(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + + key_format_values = [ + ('column', dict(key_format='r')), + ('integer_row', dict(key_format='i')), + ] + + scenarios = make_scenarios(key_format_values) + + def conn_config(self): + config = 'cache_size=1MB,statistics=(all),log=(enabled=true)' + return config + + def test_rollback_to_stable(self): + nrows = 3 + + # Create a table without logging. + uri = "table:rollback_to_stable40" + ds = SimpleDataSet( + self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)') + ds.populate() + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) + + ',stable_timestamp=' + self.timestamp_str(10)) + + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + value_c = "ccccc" * 100 + value_d = "ddddd" * 100 + + # Insert 3 keys with same updates. + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_a + cursor[2] = value_a + cursor[3] = value_a + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20)) + + # Update the first and last key with another value with a large timestamp. + self.session.begin_transaction() + cursor[1] = value_d + cursor[3] = value_d + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(1000)) + + # Update the middle key with lot of updates to generate more history. + for i in range(21, 499): + self.session.begin_transaction() + cursor[2] = value_b + str(i) + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(i)) + + # With this checkpoint, all the updates in the history store are persisted to disk. + self.session.checkpoint() + + self.session.begin_transaction() + cursor[2] = value_c + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(500)) + + # Pin oldest and stable to timestamp 500. 
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(500) + + ',stable_timestamp=' + self.timestamp_str(500)) + + # Evict the globally visible update to write to the disk, this will reset the time window. + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + self.session.begin_transaction("ignore_prepare=true") + evict_cursor.set_key(2) + self.assertEqual(evict_cursor[2], value_c) + evict_cursor.reset() + evict_cursor.close() + self.session.rollback_transaction() + + self.session.begin_transaction() + cursor[2] = value_d + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(501)) + + # 1. This checkpoint will move the globally visible update to the first of the key range. + # 2. The existing updates in the history store are having with a larger timestamp are + # obsolete, so they are not explicitly removed. + # 3. Any of the history store updates that are already evicted will not rewrite by the + # checkpoint. + self.session.checkpoint() + + # Verify data is visible and correct. + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(1000)) + for i in range (1, nrows + 1): + cursor.set_key(ds.key(i)) + self.assertEqual(cursor.search(), 0) + self.assertEquals(cursor.get_value(), value_d) + self.session.rollback_transaction() + cursor.close() + + # Simulate a server crash and restart. + simulate_crash_restart(self, ".", "RESTART") + + # Verify data is visible and correct. 
+ cursor = self.session.open_cursor(uri) + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(1000)) + for i in range (1, nrows + 1): + cursor.set_key(ds.key(i)) + self.assertEqual(cursor.search(), 0) + if i % 2 == 0: + self.assertEquals(cursor.get_value(), value_c) + else: + self.assertEquals(cursor.get_value(), value_a) + self.session.rollback_transaction() + + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rts][2] + hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2] + keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2] + keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2] + pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2] + upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2] + stat_cursor.close() + + self.assertEqual(calls, 0) + self.assertEqual(keys_removed, 0) + self.assertEqual(keys_restored, 0) + self.assertGreaterEqual(upd_aborted, 0) + self.assertGreater(pages_visited, 0) + self.assertGreaterEqual(hs_removed, 3) + +if __name__ == '__main__': + wttest.run() |