SERVER-41386 Test that the replica set's majority commit point can move forward because of secondaries without primary durable writes

SERVER-41387 Test that oplogTruncateAfterPoint will clear oplog holes during startup recovery after primary crash

create mode 100644 jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
create mode 100644 jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
author: Dianna Hohensee <dianna.hohensee@mongodb.com> 2020-02-26 13:14:49 -0500
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-02-27 13:54:12 +0000
commit: 3a59e217a4b34234acbd6a404f98276a87435ee0 (patch)
tree: 0e0890c6e5bd0db9d962b694acf0f3ee5edb19b5 /jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
parent: 77c6d1044cff7b113881a1c97f5dca63567fbe81 (diff)
download: mongo-3a59e217a4b34234acbd6a404f98276a87435ee0.tar.gz
1 files changed, 93 insertions, 0 deletions
diff --git a/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
new file mode 100644
index 00000000000..14d50245f98
--- /dev/null
+++ b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
@@ -0,0 +1,93 @@
+/**
+ * Test that a write confirmed by a primary, but with oplog holes behind it at the time the primary
+ * crashes, will be truncated on startup recovery.
+ *
+ * There must be more than 1 voting node, otherwise the write concern behavior changes to waiting
+ * for no holes for writes with {j: true} write concern, and no confirmed writes will be truncated.
+ *
+ * @tags: [
+ *     # The primary is restarted and must retain its data.
+ *     requires_persistence,
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const rst = new ReplSetTest({name: jsTest.name(), nodes: 2});
+rst.startSet();
+// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we would
+// set the secondary node votes to 0, but that would affect the feature that is being tested.
+rst.initiateWithHighElectionTimeout();
+
+const primary = rst.getPrimary();
+const dbName = "testDB";
+const collName = jsTest.name();
+const primaryDB = primary.getDB(dbName);
+const primaryColl = primaryDB[collName];
+
+assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));
+
+const failPoint = configureFailPoint(primaryDB,
+                                     "hangAfterCollectionInserts",
+                                     {collectionNS: primaryColl.getFullName(), first_id: "b"});
+
+try {
+    // Hold back the durable timestamp by leaving an uncommitted transaction hanging.
+
+    TestData.dbName = dbName;
+    TestData.collName = collName;
+
+    startParallelShell(() => {
+        jsTestLog("Insert a document that will hang before the insert completes.");
+        // Crashing the server while this command is running may cause the parallel shell code to
+        // error and stop executing. We will therefore ignore the result of this command and
+        // parallel shell. Test correctness is guaranteed by waiting for the failpoint this command
+        // hits.
+        db.getSiblingDB(TestData.dbName)[TestData.collName].insert({_id: "b"});
+    }, primary.port);
+
+    jsTest.log("Wait for async insert to hit the failpoint.");
+    failPoint.wait();
+
+    // Execute an insert with confirmation that it made it to disk ({j: true}).
+    //
+    // The primary's durable timestamp should be pinned by the prior hanging uncommitted write. So
+    // this second write will have an oplog hole behind it and will be truncated after a crash.
+    assert.commandWorked(
+        primaryColl.insert({_id: "writeAfterHole"}, {writeConcern: {w: 1, j: true}}));
+
+    const findResult = primaryColl.findOne({_id: "writeAfterHole"});
+    assert.eq(findResult, {"_id": "writeAfterHole"});
+
+    jsTest.log("Force a checkpoint so the primary has data on startup recovery after a crash");
+    assert.commandWorked(primary.adminCommand({fsync: 1}));
+
+    // Crash and restart the primary, which should truncate the second successful write, because
+    // the first write never committed and left a hole in the oplog.
+    rst.stop(primary, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL});
+} catch (error) {
+    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
+    // shuts down or in post-test hooks.
+    failPoint.off();
+    throw error;
+}
+
+rst.start(primary);
+
+// Wait for the restarted node to complete startup recovery and start accepting user requests.
+// Note: no new primary will be elected because of the high election timeout set on the replica set.
+assert.soonNoExcept(function() {
+    const nodeState = assert.commandWorked(primary.adminCommand("replSetGetStatus")).myState;
+    return nodeState == ReplSetTest.State.SECONDARY;
+});
+
+// Confirm that the write with the oplog hole behind it is now gone (truncated) as expected.
+primary.setSlaveOk();
+const find = primary.getDB(dbName).getCollection(collName).findOne({_id: "writeAfterHole"});
+assert.eq(find, null);
+
+rst.stopSet();
+})();
author	Dianna Hohensee <dianna.hohensee@mongodb.com>	2020-02-26 13:14:49 -0500
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-02-27 13:54:12 +0000
commit	3a59e217a4b34234acbd6a404f98276a87435ee0 (patch)
tree	0e0890c6e5bd0db9d962b694acf0f3ee5edb19b5 /jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
parent	77c6d1044cff7b113881a1c97f5dca63567fbe81 (diff)
download	mongo-3a59e217a4b34234acbd6a404f98276a87435ee0.tar.gz