summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2022-09-26 15:52:51 +1000
committer: Luke Chen <luke.chen@mongodb.com> 2022-09-26 15:52:51 +1000
commit: 3c29923bb50f654335a68b719d92f4e341bed5f8 (patch)
tree: 6795e76405e82f57fae229c5757e8d13f1d14b56
parent: 59018abb95b6a344ea223589554a87185b4fae6e (diff)
download: mongo-r6.1.0-rc3.tar.gz
Import wiredtiger: 0e90362a6e0ec654480da2ffce384c6b53a06be6 from branch mongodb-6.1 (r6.1.0-rc3)
ref: 0af906ba58..0e90362a6e for: 6.1.0-rc3 WT-9870 Fix the global time window state before performing rollback to stable (#8280) (#8288)
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c | 60
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 13
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py | 164
4 files changed, 218 insertions, 21 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 919097d0505..a903a189b47 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-6.1",
- "commit": "0af906ba5866a9dd3de643e1daf8d983028f88f4"
+ "commit": "0e90362a6e0ec654480da2ffce384c6b53a06be6"
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 827d5cab9ca..ea724213713 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -408,7 +408,8 @@ __recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
/*
* __recovery_set_oldest_timestamp --
- * Set the oldest timestamp as retrieved from the metadata file.
+ * Set the oldest timestamp as retrieved from the metadata file. Setting the oldest timestamp
+ * doesn't automatically set the pinned timestamp.
*/
static int
__recovery_set_oldest_timestamp(WT_RECOVERY *r)
@@ -504,6 +505,44 @@ err:
}
/*
+ * __recovery_txn_setup_initial_state --
+ *     Establish the initial transaction state that rollback to stable relies on.
+ */
+static int
+__recovery_txn_setup_initial_state(WT_SESSION_IMPL *session, WT_RECOVERY *r)
+{
+    WT_CONNECTION_IMPL *conn;
+
+    WT_RET(__recovery_set_checkpoint_snapshot(session));
+
+    /*
+     * Restore the checkpoint and oldest timestamps from the checkpoint metadata: they are the
+     * stable and oldest timestamps of the last successful checkpoint.
+     */
+    WT_RET(__recovery_set_checkpoint_timestamp(r));
+    WT_RET(__recovery_set_oldest_timestamp(r));
+
+    /*
+     * The timestamps taken from the checkpoint metadata are now in place, so the pinned
+     * timestamp can be configured.
+     */
+    __wt_txn_update_pinned_timestamp(session, true);
+
+    /* No stable timestamp is expected to be set at this point in recovery. */
+    conn = S2C(session);
+    WT_ASSERT(session,
+      conn->txn_global.stable_timestamp == WT_TS_NONE &&
+        !conn->txn_global.has_stable_timestamp);
+
+    /* Adopt the recovery timestamp as the stable timestamp, flagging it only if it is set. */
+    conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
+    if (conn->txn_global.stable_timestamp != WT_TS_NONE)
+        conn->txn_global.has_stable_timestamp = true;
+
+    return (0);
+}
+
+/*
* __recovery_setup_file --
* Set up the recovery slot for a file, track the largest file ID, and update the base write gen
* based on the file's configuration.
@@ -925,10 +964,7 @@ done:
"Upgrading from a WiredTiger version 10.0.0 database that was not shutdown cleanly is "
"not allowed. Perform a clean shutdown on version 10.0.0 and then upgrade.");
#endif
-
- WT_ERR(__recovery_set_checkpoint_timestamp(&r));
- WT_ERR(__recovery_set_oldest_timestamp(&r));
- WT_ERR(__recovery_set_checkpoint_snapshot(session));
+ WT_ERR(__recovery_txn_setup_initial_state(session, &r));
/*
* Set the history store file size as it may already exist after a restart.
@@ -951,20 +987,6 @@ done:
eviction_started = true;
}
- WT_ASSERT(session,
- conn->txn_global.has_stable_timestamp == false &&
- conn->txn_global.stable_timestamp == WT_TS_NONE);
-
- /*
- * Set the stable timestamp from recovery timestamp and process the trees for rollback to
- * stable.
- */
- conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
- conn->txn_global.has_stable_timestamp = false;
-
- if (conn->txn_global.recovery_timestamp != WT_TS_NONE)
- conn->txn_global.has_stable_timestamp = true;
-
__wt_verbose_multi(session,
WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RTS})),
"performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index d3c2adc3596..37269ec53f1 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1921,9 +1921,20 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
WT_ERR(__rollback_to_stable_check(session));
- /* Update the oldest id to get a consistent view of global visibility. */
+ /*
+ * Update the global time window state so that rollback to stable has a consistent view of the
+ * global visibility rules while it brings the database back into a consistent state.
+ *
+ * As part of the below function call, the oldest transaction id and pinned timestamps are
+ * updated.
+ */
WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+ WT_ASSERT_ALWAYS(session,
+ (txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp),
+ "Database has no pinned timestamp but an oldest timestamp. Pinned timestamp is required to "
+ "find out the global visibility/obsolete of an update.");
+
/*
* Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
* though the stable timestamp isn't supposed to be updated while rolling back, accessing it
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py
new file mode 100755
index 00000000000..07df57c2966
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, time, wttest
+from helper import copy_wiredtiger_home, simulate_crash_restart
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable40.py
+# Test the rollback to stable operation performs as expected following a server crash
+# and recovery. Verify that the on-disk value is replaced by the correct value from
+# the history store.
+class test_rollback_to_stable40(test_rollback_to_stable_base):
+    session_config = 'isolation=snapshot'
+
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('integer_row', dict(key_format='i')),
+    ]
+
+    scenarios = make_scenarios(key_format_values)
+
+    def conn_config(self):
+        # Connection settings: 1MB cache, all statistics, and logging enabled.
+        return 'cache_size=1MB,statistics=(all),log=(enabled=true)'
+
+    def test_rollback_to_stable(self):
+        nrows = 3
+
+        # Create a table without logging.
+        uri = "table:rollback_to_stable40"
+        ds = SimpleDataSet(
+            self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)')
+        ds.populate()
+
+        # Pin oldest and stable to timestamp 10.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) +
+            ',stable_timestamp=' + self.timestamp_str(10))
+
+        value_a = "aaaaa" * 100
+        value_b = "bbbbb" * 100
+        value_c = "ccccc" * 100
+        value_d = "ddddd" * 100
+
+        # Insert three keys with the same value at timestamp 20.
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        cursor[1] = value_a
+        cursor[2] = value_a
+        cursor[3] = value_a
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20))
+
+        # Update the first and last keys with another value at a much larger timestamp (1000).
+        self.session.begin_transaction()
+        cursor[1] = value_d
+        cursor[3] = value_d
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(1000))
+
+        # Apply many updates to the middle key (timestamps 21 to 498) to generate more history.
+        for i in range(21, 499):
+            self.session.begin_transaction()
+            cursor[2] = value_b + str(i)
+            self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(i))
+
+        # With this checkpoint, all the updates in the history store are persisted to disk.
+        self.session.checkpoint()
+
+        self.session.begin_transaction()
+        cursor[2] = value_c
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(500))
+
+        # Pin oldest and stable to timestamp 500.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(500) +
+            ',stable_timestamp=' + self.timestamp_str(500))
+
+        # Evict the globally visible update so it is written to disk; this resets its time window.
+        evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+        self.session.begin_transaction("ignore_prepare=true")
+        evict_cursor.set_key(2)
+        self.assertEqual(evict_cursor[2], value_c)
+        evict_cursor.reset()
+        evict_cursor.close()
+        self.session.rollback_transaction()
+
+        self.session.begin_transaction()
+        cursor[2] = value_d
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(501))
+
+        # 1. This checkpoint will move the globally visible update to the first of the key range.
+        # 2. The existing updates in the history store that have a larger timestamp are obsolete,
+        #    so they are not explicitly removed.
+        # 3. Any history store updates that have already been evicted will not be rewritten by the
+        #    checkpoint.
+        self.session.checkpoint()
+
+        # Before the crash, every key reads as value_d at timestamp 1000.
+        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(1000))
+        for i in range(1, nrows + 1):
+            cursor.set_key(ds.key(i))
+            self.assertEqual(cursor.search(), 0)
+            self.assertEqual(cursor.get_value(), value_d)
+        self.session.rollback_transaction()
+        cursor.close()
+
+        # Simulate a server crash and restart.
+        simulate_crash_restart(self, ".", "RESTART")
+
+        # After the restart, the middle key reads as value_c and the outer keys as value_a.
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(1000))
+        for i in range(1, nrows + 1):
+            cursor.set_key(ds.key(i))
+            self.assertEqual(cursor.search(), 0)
+            if i % 2 == 0:
+                self.assertEqual(cursor.get_value(), value_c)
+            else:
+                self.assertEqual(cursor.get_value(), value_a)
+        self.session.rollback_transaction()
+
+        stat_cursor = self.session.open_cursor('statistics:', None, None)
+        calls = stat_cursor[stat.conn.txn_rts][2]
+        hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+        keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+        keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+        pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+        upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+        stat_cursor.close()
+
+        self.assertEqual(calls, 0)
+        self.assertEqual(keys_removed, 0)
+        self.assertEqual(keys_restored, 0)
+        self.assertGreaterEqual(upd_aborted, 0)
+        self.assertGreater(pages_visited, 0)
+        self.assertGreaterEqual(hs_removed, 3)
+
+if __name__ == '__main__':  # Only true when this file is executed directly as a script.
+ wttest.run()