author | Matthew Russotto <matthew.russotto@mongodb.com> | 2021-04-29 16:07:24 -0400
---|---|---
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-04-29 21:04:41 +0000
commit | 8a7e9a21fd0e10ddc1b41345e5bea1a82141061b (patch) |
tree | 66439d7dfb6aebd0649c66483a77fdd475596f45 /jstests/replsets/startup_recovery_for_restore.js |
parent | d9fbb80aa08741261580ebf3473e1b31ed795b7c (diff) |
download | mongo-8a7e9a21fd0e10ddc1b41345e5bea1a82141061b.tar.gz |
SERVER-55766 Introduce an optimized "for restore" startup replication recovery mechanism
Diffstat (limited to 'jstests/replsets/startup_recovery_for_restore.js')
-rw-r--r-- | jstests/replsets/startup_recovery_for_restore.js | 160
1 file changed, 160 insertions, 0 deletions
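The test below drives the new mechanism in two phases: a standalone restart with `startupRecoveryForRestore` (together with `recoverFromOplogAsStandalone` and `takeUnstableCheckpointOnShutdown`) to run the optimized recovery, then an ordinary restart to rejoin the replica set. A minimal sketch of that restart sequence, assuming an initiated `ReplSetTest` (`rst`); the setParameter names are taken from the diff, everything else is illustrative:

```js
// Sketch only: parameter names come from the test below; the ReplSetTest
// instance (rst) and choice of node are assumed.
let restoreNode = rst.getSecondaries()[0];

// Phase 1: restart standalone so startup recovery runs in the optimized
// "for restore" mode. History before the recovery point is not preserved.
restoreNode = rst.restart(restoreNode, {
    noReplSet: true,
    setParameter: {
        startupRecoveryForRestore: true,
        recoverFromOplogAsStandalone: true,
        takeUnstableCheckpointOnShutdown: true
    }
});

// Phase 2: restart as an ordinary replica-set member and let it catch up.
restoreNode = rst.restart(restoreNode, {noReplSet: false});
rst.awaitSecondaryNodes(undefined, [restoreNode]);
```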
diff --git a/jstests/replsets/startup_recovery_for_restore.js b/jstests/replsets/startup_recovery_for_restore.js
new file mode 100644
index 00000000000..bd5b5a98db7
--- /dev/null
+++ b/jstests/replsets/startup_recovery_for_restore.js
@@ -0,0 +1,160 @@
+/*
+ * Tests that we can recover from a node with a lagged stable timestamp using the special
+ * "for restore" mode, but not read from older points-in-time on the recovered node.
+ *
+ * This test only makes sense for storage engines that support recover to stable timestamp.
+ * @tags: [requires_wiredtiger, requires_persistence, requires_journaling, requires_replication,
+ * requires_majority_read_concern, uses_transactions, uses_prepare_transaction,
+ * # We don't expect to do this while upgrading.
+ * multiversion_incompatible]
+ */
+
+(function() {
+"use strict";
+load("jstests/libs/fail_point_util.js");
+
+const dbName = TestData.testName;
+
+const logLevel = tojson({storage: {recovery: 2}});
+
+const rst = new ReplSetTest({
+    nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
+    settings: {chainingAllowed: false}
+});
+
+const startParams = {
+    logComponentVerbosity: logLevel,
+    replBatchLimitOperations: 100
+};
+const nodes = rst.startSet({setParameter: startParams});
+let restoreNode = nodes[1];
+rst.initiateWithHighElectionTimeout();
+const primary = rst.getPrimary();
+const db = primary.getDB(dbName);
+const collName = "testcoll";
+const sentinelCollName = "sentinelcoll";
+const coll = db[collName];
+const paddingStr = "XXXXXXXXX";
+
+// Pre-load some documents.
+const nPreDocs = 2;
+coll.insert([{_id: "pre1"}, {_id: "pre2"}]);
+rst.awaitReplication();
+
+const holdOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;
+
+// Keep the stable timestamp from moving on the node we're going to restart in restore mode.
+assert.commandWorked(restoreNode.adminCommand({
+    configureFailPoint: 'holdStableTimestampAtSpecificTimestamp',
+    mode: 'alwaysOn',
+    data: {"timestamp": holdOpTime}
+}));
+
+// Insert a bunch of documents.
+let bulk = coll.initializeUnorderedBulkOp();
+const nDocs = 1000;
+jsTestLog("Inserting " + nDocs + " documents with snapshotting disabled on one node.");
+for (let id = 1; id <= nDocs; id++) {
+    bulk.insert({_id: id, paddingStr: paddingStr});
+}
+bulk.execute();
+rst.awaitReplication();
+
+jsTestLog("Stopping replication on secondaries to hold back majority commit point.");
+let stopReplProducer2 = configureFailPoint(nodes[2], 'stopReplProducer');
+let stopReplProducer3 = configureFailPoint(nodes[3], 'stopReplProducer');
+
+jsTestLog("Writing first sentinel document.");
+const sentinel1Timestamp =
+    assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s1"}]}))
+        .operationTime;
+
+const nExtraDocs = 50;
+jsTestLog("Inserting " + nExtraDocs + " documents with majority point held back.");
+bulk = coll.initializeUnorderedBulkOp();
+for (let id = 1; id <= nExtraDocs; id++) {
+    bulk.insert({_id: (id + nDocs), paddingStr: paddingStr});
+}
+bulk.execute();
+const lastId = nDocs + nExtraDocs;
+
+const penultimateOpTime =
+    assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;
+
+const sentinel2Timestamp =
+    assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s2"}]}))
+        .operationTime;
+
+rst.awaitReplication(undefined, undefined, [restoreNode]);
+
+jsTestLog("Restarting restore node with the --startupRecoveryForRestore flag");
+restoreNode = rst.restart(restoreNode, {
+    noReplSet: true,
+    setParameter: Object.merge(startParams, {
+        startupRecoveryForRestore: true,
+        recoverFromOplogAsStandalone: true,
+        takeUnstableCheckpointOnShutdown: true
+    })
+});
+// Make sure we can read something after standalone recovery.
+assert.eq(2, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
+
+jsTestLog("Restarting restore node again, in repl set mode");
+restoreNode = rst.restart(restoreNode, {noReplSet: false, setParameter: startParams});
+
+rst.awaitSecondaryNodes(undefined, [restoreNode]);
+jsTestLog("Finished restarting restore node");
+
+const restoreDb = restoreNode.getDB(dbName);
+
+jsTestLog("Checking restore node untimestamped read.");
+// Basic test: should see all docs with untimestamped read.
+assert.eq(nPreDocs + nDocs + nExtraDocs, coll.find().itcount());
+assert.eq(nPreDocs + nDocs + nExtraDocs, restoreDb[collName].find().itcount());
+
+// For the remaining checks we step up the restored node so we can do atClusterTime reads on it.
+// They are necessarily speculative because we are preventing majority optimes from advancing.
+
+jsTestLog("Stepping up restore node");
+rst.stepUp(restoreNode, {awaitReplicationBeforeStepUp: false});
+
+// Should also be able to read at the final sentinel optime on restore node.
+const restoreNodeSession = restoreNode.startSession({causalConsistency: false});
+restoreNodeSession.startTransaction(
+    {readConcern: {level: "snapshot", atClusterTime: sentinel2Timestamp}});
+const restoreNodeSessionDb = restoreNodeSession.getDatabase(dbName);
+jsTestLog("Checking top-of-oplog read works on restored node.");
+
+let res = assert.commandWorked(
+    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}));
+assert.eq(1, res.cursor.firstBatch.length);
+assert.docEq({_id: lastId, paddingStr: paddingStr}, res.cursor.firstBatch[0]);
+
+// Must abort because majority is not advancing.
+restoreNodeSession.abortTransaction();
+
+// Should NOT be able to read at the first sentinel optime on the restore node.
+restoreNodeSession.startTransaction(
+    {readConcern: {level: "snapshot", atClusterTime: sentinel1Timestamp}});
+jsTestLog(
+    "Checking restore node majority optime read, which should fail, because the restore node does not have that history.");
+res = assert.commandFailedWithCode(
+    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": {"$gte": nDocs}}}),
+    ErrorCodes.SnapshotTooOld);
+restoreNodeSession.abortTransaction();
+
+// Should NOT be able to read at the penultimate optime on the restore node either.
+jsTestLog(
+    "Checking restore node top-of-oplog minus 1 read, which should fail, because the restore node does not have that history.");
+restoreNodeSession.startTransaction(
+    {readConcern: {level: "snapshot", atClusterTime: penultimateOpTime}});
+res = assert.commandFailedWithCode(
+    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}),
+    ErrorCodes.SnapshotTooOld);
+restoreNodeSession.abortTransaction();
+
+// Allow set to become current and shut down with ordinary dbHash verification.
+stopReplProducer2.off();
+stopReplProducer3.off();
+rst.stopSet();
+})();
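To run the new test locally, a resmoke invocation along these lines should work (the suite name is an assumption; any replica-set suite satisfying the test's tags would do):

```sh
python buildscripts/resmoke.py run --suites=replica_sets \
    jstests/replsets/startup_recovery_for_restore.js
```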