summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2022-09-20 14:57:26 +1000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-09-20 05:19:14 +0000
commit: 88049cab6697c2e830a62b45747018f5bd7b7d2b (patch)
tree: 2338afb16b62765e397cad00eb5fc6e2be39e227
parent: b598ac1495318c0594a6607666bd33d38c5e82f5 (diff)
download: mongo-r4.4.17-rc1.tar.gz
Import wiredtiger: 54336f44ebf1d6012bce4c9105b4f0dc93fb2bd0 from branch mongodb-4.4 (tag: r4.4.17-rc1)
ref: ec742d6807..54336f44eb for: 4.4.17 WT-9870 Fix the global time window state before performing rollback to stable (#8289)
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c | 60
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 11
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py | 164
4 files changed, 217 insertions, 20 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index d98b5fcebb2..701ab11e416 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "ec742d6807b943cd6f2baf1a55853d296eb5b5c6"
+ "commit": "54336f44ebf1d6012bce4c9105b4f0dc93fb2bd0"
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 84b7e43a579..02d7d5e7a05 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -430,7 +430,8 @@ __recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
/*
* __recovery_set_oldest_timestamp --
- * Set the oldest timestamp as retrieved from the metadata file.
+ * Set the oldest timestamp as retrieved from the metadata file. Setting the oldest timestamp
+ * doesn't automatically set the pinned timestamp.
*/
static int
__recovery_set_oldest_timestamp(WT_RECOVERY *r)
@@ -574,6 +575,44 @@ err:
}
/*
+ * __recovery_txn_setup_initial_state --
+ * Set up the initial transaction state required by rollback to stable. In order, this calls the
+ * checkpoint snapshot, checkpoint timestamp and oldest timestamp setters, updates the pinned
+ * timestamp, and finally copies the recovery timestamp into the global stable timestamp.
+ * Returns zero when every step succeeds.
+ */
+static int
+__recovery_txn_setup_initial_state(WT_SESSION_IMPL *session, WT_RECOVERY *r)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* NOTE(review): WT_RET presumably returns early with the callee's error code -- confirm. */
+ WT_RET(__recovery_set_checkpoint_snapshot(session));
+
+ /*
+ * Set the checkpoint timestamp and oldest timestamp retrieved from the checkpoint metadata.
+ * These are the stable and oldest timestamps of the last successful checkpoint.
+ */
+ WT_RET(__recovery_set_checkpoint_timestamp(r));
+ WT_RET(__recovery_set_oldest_timestamp(r));
+
+ /*
+ * Now that timestamps extracted from the checkpoint metadata have been configured, configure
+ * the pinned timestamp. This is done explicitly because setting the oldest timestamp doesn't
+ * automatically set the pinned timestamp.
+ */
+ WT_RET(__wt_txn_update_pinned_timestamp(session, true));
+
+ /* The global stable timestamp is expected to still be unset at this point. */
+ WT_ASSERT(session,
+ conn->txn_global.has_stable_timestamp == false &&
+ conn->txn_global.stable_timestamp == WT_TS_NONE);
+
+ /*
+ * Set the stable timestamp from recovery timestamp. The has_stable_timestamp flag is only
+ * raised when the copied value is not WT_TS_NONE; otherwise the flag is left untouched.
+ */
+ conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
+ if (conn->txn_global.stable_timestamp != WT_TS_NONE)
+ conn->txn_global.has_stable_timestamp = true;
+
+ return (0);
+}
+
+/*
* __recovery_setup_file --
* Set up the recovery slot for a file, track the largest file ID, and update the base write gen
* based on the file's configuration.
@@ -996,10 +1035,7 @@ done:
"Upgrading from a WiredTiger version 10.0.0 database that was not shutdown cleanly is "
"not allowed. Perform a clean shutdown on version 10.0.0 and then upgrade.");
#endif
-
- WT_ERR(__recovery_set_checkpoint_timestamp(&r));
- WT_ERR(__recovery_set_oldest_timestamp(&r));
- WT_ERR(__recovery_set_checkpoint_snapshot(session));
+ WT_ERR(__recovery_txn_setup_initial_state(session, &r));
/*
* Set the history store file size as it may already exist after a restart.
@@ -1022,20 +1058,6 @@ done:
eviction_started = true;
}
- WT_ASSERT(session,
- conn->txn_global.has_stable_timestamp == false &&
- conn->txn_global.stable_timestamp == WT_TS_NONE);
-
- /*
- * Set the stable timestamp from recovery timestamp and process the trees for rollback to
- * stable.
- */
- conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
- conn->txn_global.has_stable_timestamp = false;
-
- if (conn->txn_global.recovery_timestamp != WT_TS_NONE)
- conn->txn_global.has_stable_timestamp = true;
-
__wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RTS,
"performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
"%s",
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index e6ce11062dd..042bdadbbd2 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1690,6 +1690,17 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
WT_ERR(__rollback_to_stable_check(session));
/*
+ * Update the global time window state so that the global visibility rules give rollback to
+ * stable a consistent view with which to bring the database back into a consistent state.
+ *
+ * As part of the below function call, the oldest transaction id and pinned timestamps are
+ * updated.
+ */
+ WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+
+ WT_ASSERT(session, txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp);
+
+ /*
* Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
* though the stable timestamp isn't supposed to be updated while rolling back, accessing it
* without a lock would violate protocol.
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py
new file mode 100755
index 00000000000..07df57c2966
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable40.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, time
+from helper import copy_wiredtiger_home, simulate_crash_restart
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable40.py
+# Test the rollback to stable operation performs as expected following a server crash
+# and recovery. Verify that the on-disk value is replaced by the correct value from
+# the history store.
+class test_rollback_to_stable40(test_rollback_to_stable_base):
+    # Session configuration used by the test.
+    session_config = 'isolation=snapshot'
+
+    # The test runs once per key format: 'r' (the "column" scenario) and 'i' (the "integer_row"
+    # scenario).
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('integer_row', dict(key_format='i')),
+    ]
+
+    scenarios = make_scenarios(key_format_values)
+
+    # Connection configuration: a 1MB cache, all statistics and logging enabled.
+    def conn_config(self):
+        config = 'cache_size=1MB,statistics=(all),log=(enabled=true)'
+        return config
+
+    # Build history for three keys across two checkpoints, crash and restart, then check that
+    # only the values committed at or before the stable timestamp (500) remain visible.
+    def test_rollback_to_stable(self):
+        nrows = 3
+
+        # Create a table without logging.
+        uri = "table:rollback_to_stable40"
+        ds = SimpleDataSet(
+            self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)')
+        ds.populate()
+
+        # Pin oldest and stable to timestamp 10.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) +
+            ',stable_timestamp=' + self.timestamp_str(10))
+
+        value_a = "aaaaa" * 100
+        value_b = "bbbbb" * 100
+        value_c = "ccccc" * 100
+        value_d = "ddddd" * 100
+
+        # Insert 3 keys with same updates.
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        cursor[1] = value_a
+        cursor[2] = value_a
+        cursor[3] = value_a
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20))
+
+        # Update the first and last key with another value with a large timestamp.
+        self.session.begin_transaction()
+        cursor[1] = value_d
+        cursor[3] = value_d
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(1000))
+
+        # Update the middle key with a lot of updates to generate more history.
+        for i in range(21, 499):
+            self.session.begin_transaction()
+            cursor[2] = value_b + str(i)
+            self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(i))
+
+        # With this checkpoint, all the updates in the history store are persisted to disk.
+        self.session.checkpoint()
+
+        self.session.begin_transaction()
+        cursor[2] = value_c
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(500))
+
+        # Pin oldest and stable to timestamp 500.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(500) +
+            ',stable_timestamp=' + self.timestamp_str(500))
+
+        # Evict the globally visible update to write to the disk, this will reset the time window.
+        evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+        self.session.begin_transaction("ignore_prepare=true")
+        evict_cursor.set_key(2)
+        self.assertEqual(evict_cursor[2], value_c)
+        evict_cursor.reset()
+        evict_cursor.close()
+        self.session.rollback_transaction()
+
+        self.session.begin_transaction()
+        cursor[2] = value_d
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(501))
+
+        # 1. This checkpoint will move the globally visible update to the first of the key range.
+        # 2. The existing history store updates with a larger timestamp are obsolete, so they
+        #    are not explicitly removed.
+        # 3. Any history store updates that are already evicted will not be rewritten by the
+        #    checkpoint.
+        self.session.checkpoint()
+
+        # Verify data is visible and correct: before the crash every key reads value_d at
+        # timestamp 1000.
+        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(1000))
+        for i in range(1, nrows + 1):
+            cursor.set_key(ds.key(i))
+            self.assertEqual(cursor.search(), 0)
+            self.assertEqual(cursor.get_value(), value_d)
+        self.session.rollback_transaction()
+        cursor.close()
+
+        # Simulate a server crash and restart.
+        simulate_crash_restart(self, ".", "RESTART")
+
+        # Verify data is visible and correct. The value_d updates were committed at timestamps
+        # 501 and 1000, after the stable timestamp (500), so the middle key is expected to read
+        # value_c (committed at 500) and the first and last keys value_a (committed at 20).
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(1000))
+        for i in range(1, nrows + 1):
+            cursor.set_key(ds.key(i))
+            self.assertEqual(cursor.search(), 0)
+            if i % 2 == 0:
+                self.assertEqual(cursor.get_value(), value_c)
+            else:
+                self.assertEqual(cursor.get_value(), value_a)
+        self.session.rollback_transaction()
+
+        # Read the rollback to stable statistics; index 2 of each statistics entry is used as
+        # the numeric value.
+        stat_cursor = self.session.open_cursor('statistics:', None, None)
+        calls = stat_cursor[stat.conn.txn_rts][2]
+        hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+        keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+        keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+        pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+        upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+        stat_cursor.close()
+
+        # NOTE(review): calls is expected to be 0, presumably because rollback to stable run by
+        # recovery is not counted in txn_rts -- confirm.
+        self.assertEqual(calls, 0)
+        self.assertEqual(keys_removed, 0)
+        self.assertEqual(keys_restored, 0)
+        self.assertGreaterEqual(upd_aborted, 0)
+        self.assertGreater(pages_visited, 0)
+        self.assertGreaterEqual(hs_removed, 3)
+
+if __name__ == '__main__':
+    # wttest is not imported at the top of this file, so import it here before running the
+    # test directly as a script.
+    import wttest
+    wttest.run()