summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDianna Hohensee <dianna.hohensee@mongodb.com>2020-02-26 13:14:49 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-02-27 13:54:12 +0000
commit3a59e217a4b34234acbd6a404f98276a87435ee0 (patch)
tree0e0890c6e5bd0db9d962b694acf0f3ee5edb19b5
parent77c6d1044cff7b113881a1c97f5dca63567fbe81 (diff)
downloadmongo-3a59e217a4b34234acbd6a404f98276a87435ee0.tar.gz
SERVER-41386 Test that the replica set's majority commit point can move forward because of secondaries without primary durable writes
SERVER-41387 Test that oplogTruncateAfterPoint will clear oplog holes during startup recovery after primary crash create mode 100644 jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js create mode 100644 jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
-rw-r--r--jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js121
-rw-r--r--jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js93
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp6
3 files changed, 220 insertions, 0 deletions
diff --git a/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js b/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
new file mode 100644
index 00000000000..e5a1b93396c
--- /dev/null
+++ b/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
@@ -0,0 +1,121 @@
/**
 * Tests that non-durable writes on the primary can be successfully majority confirmed by the
 * secondaries.
 *
 * Skipping persistence on the primary will hold back its durable timestamp used for cross-replica
 * set write concern confirmation.
 *
 * First tests that writes can be majority confirmed without the primary by two secondaries.
 * Then tests that writes cannot be majority confirmed without the primary and only one secondary.
 *
 * @tags: [
 *   # inMemory has journaling off, so {j:true} writes are not allowed.
 *   requires_journaling,
 * ]
 */

(function() {
"use strict";

load("jstests/libs/fail_point_util.js");

const rst = new ReplSetTest({name: jsTest.name(), nodes: 3});
rst.startSet();
// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we would
// set the secondary node votes to 0, but that would affect the feature that is being tested.
rst.initiateWithHighElectionTimeout();

const primary = rst.getPrimary();
const dbName = "testDB";
const collName = jsTest.name();
const primaryDB = primary.getDB(dbName);
const primaryColl = primaryDB[collName];

assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));

// Do a write and then fetch and save the durable and majority timestamps on the primary.
// Use {w: 3, j: true} write concern to make sure the timestamps are stable: all three nodes
// have journaled the write, so durable == majority on the primary at this point.
const res = assert.commandWorked(
    primaryColl.insert({_id: "writeAllDurable"}, {writeConcern: {w: 3, j: true}}));
const primaryReplSetStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
const primaryPreFailPointDurableTs = primaryReplSetStatus.optimes.durableOpTime.ts;
const primaryPreFailPointMajorityTs = primaryReplSetStatus.optimes.readConcernMajorityOpTime.ts;
assert.neq(primaryPreFailPointDurableTs, null);
assert.neq(primaryPreFailPointMajorityTs, null);
assert.eq(primaryPreFailPointDurableTs, primaryPreFailPointMajorityTs);

// Configure the primary to stop moving the durable timestamp forward. The primary will no longer be
// able to contribute to moving the replica set's majority timestamp forward because the replica
// set's majority timestamp depends upon each member's durable timestamp.
const failPoint = configureFailPoint(primaryDB, "skipDurableTimestampUpdates");

try {
    // Perform some writes with majority write concern. The primary cannot confirm them, so success
    // means that the secondaries have the writes durably.
    jsTestLog("Writes majority confirmed by secondaries.");
    assert.commandWorked(
        primaryColl.insert({_id: "majority1"}, {writeConcern: {w: "majority", j: true}}));
    assert.commandWorked(
        primaryColl.insert({_id: "majority2"}, {writeConcern: {w: "majority", j: true}}));

    // Check that the primary's durable timestamp has not moved forward, but the majority point has.
    const primaryStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
    const primaryPostWritesDurableTs = primaryStatus.optimes.durableOpTime.ts;
    const primaryPostWritesMajorityTs = primaryStatus.optimes.readConcernMajorityOpTime.ts;
    assert.eq(primaryPostWritesDurableTs, primaryPreFailPointDurableTs);
    assert.gt(primaryPostWritesMajorityTs, primaryPreFailPointDurableTs);

    // Check that the secondaries' durable and majority timestamps have moved forward.
    rst.getSecondaries().forEach(function(secondary) {
        const secondaryStatus = assert.commandWorked(secondary.adminCommand("replSetGetStatus"));
        const secondaryDurableTs = secondaryStatus.optimes.durableOpTime.ts;
        const secondaryMajorityTs = secondaryStatus.optimes.readConcernMajorityOpTime.ts;
        assert.eq(secondaryDurableTs, secondaryMajorityTs);
        assert.gt(secondaryDurableTs, primaryPreFailPointDurableTs);
        assert.eq(secondaryMajorityTs, primaryPostWritesMajorityTs);
    });

    // Shutdown a secondary so that there is no longer a majority able to confirm the durability of
    // a write.
    jsTestLog("Stopping one of the two secondaries");
    let secondaries = rst.getSecondaries();
    assert.eq(secondaries.length, 2);
    let stoppedSecondary = secondaries[0];
    let runningSecondary = secondaries[1];
    rst.stop(stoppedSecondary);

    // Now writes cannot reach majority without the primary. We will do {w: 2, j: false} writes to
    // get the writes on both remaining nodes. Then follow up with fsync commands against the two
    // nodes to make sure the durable timestamps move forward if possible -- this will work only on
    // the secondary, the primary's durable timestamp will not move (held back by the failpoint).
    jsTestLog("Writes cannot become majority confirmed.");
    assert.commandWorked(
        primaryColl.insert({_id: "noMajority1"}, {writeConcern: {w: 2, j: false}}));
    assert.commandWorked(
        primaryColl.insert({_id: "noMajority2"}, {writeConcern: {w: 2, j: false}}));

    jsTest.log("Force checkpoints to move the durable timestamps forward");
    assert.commandWorked(primary.adminCommand({fsync: 1}));
    assert.commandWorked(runningSecondary.adminCommand({fsync: 1}));

    // Check that the primary's durable and majority timestamps have not moved forward.
    const primaryReplStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
    const primaryPostFsyncDurableTs = primaryReplStatus.optimes.durableOpTime.ts;
    const primaryPostFsyncMajorityTs = primaryReplStatus.optimes.readConcernMajorityOpTime.ts;
    assert.eq(primaryPostFsyncDurableTs, primaryPreFailPointDurableTs);
    assert.eq(primaryPostFsyncMajorityTs, primaryPostWritesMajorityTs);

    // Check that the secondary's durable timestamp has moved forward, but the majority has not
    // (one durable node out of three is not a majority).
    const secondaryStatus = assert.commandWorked(runningSecondary.adminCommand("replSetGetStatus"));
    const secondaryDurableTs = secondaryStatus.optimes.durableOpTime.ts;
    const secondaryMajorityTs = secondaryStatus.optimes.readConcernMajorityOpTime.ts;
    assert.gt(secondaryDurableTs, primaryPostFsyncMajorityTs);
    assert.eq(secondaryMajorityTs, primaryPostFsyncMajorityTs);
} finally {
    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
    // shuts down or in post-test hooks.
    failPoint.off();
}

rst.stopSet();
})();
diff --git a/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
new file mode 100644
index 00000000000..14d50245f98
--- /dev/null
+++ b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
@@ -0,0 +1,93 @@
/**
 * Test that a confirmed write against a primary with oplog holes behind it when a crash occurs will
 * be truncated on startup recovery.
 *
 * There must be more than 1 voting node, otherwise the write concern behavior changes to waiting
 * for no holes for writes with {j: true} write concern, and no confirmed writes will be truncated.
 *
 * @tags: [
 *   # The primary is restarted and must retain its data.
 *   requires_persistence,
 * ]
 */

(function() {
"use strict";

load("jstests/libs/fail_point_util.js");

const rst = new ReplSetTest({name: jsTest.name(), nodes: 2});
rst.startSet();
// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we would
// set the secondary node votes to 0, but that would affect the feature that is being tested.
rst.initiateWithHighElectionTimeout();

const primary = rst.getPrimary();
const dbName = "testDB";
const collName = jsTest.name();
const primaryDB = primary.getDB(dbName);
const primaryColl = primaryDB[collName];

assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));

// Make inserts into the test collection with _id "b" hang mid-operation, creating an oplog hole.
const failPoint = configureFailPoint(primaryDB,
                                     "hangAfterCollectionInserts",
                                     {collectionNS: primaryColl.getFullName(), first_id: "b"});

try {
    // Hold back the durable timestamp by leaving a hanging, unfinished insert (blocked on the
    // failpoint above) that creates a hole in the oplog.

    TestData.dbName = dbName;
    TestData.collName = collName;

    startParallelShell(() => {
        jsTestLog("Insert a document that will hang before the insert completes.");
        // Crashing the server while this command is running may cause the parallel shell code to
        // error and stop executing. We will therefore ignore the result of this command and
        // parallel shell. Test correctness is guaranteed by waiting for the failpoint this command
        // hits.
        db.getSiblingDB(TestData.dbName)[TestData.collName].insert({_id: "b"});
    }, primary.port);

    jsTest.log("Wait for async insert to hit the failpoint.");
    failPoint.wait();

    // Execute an insert with confirmation that it made it to disk ({j: true}).
    //
    // The primary's durable timestamp should be pinned by the prior hanging unfinished write. So
    // this second write will have an oplog hole behind it and will be truncated after a crash.
    assert.commandWorked(
        primaryColl.insert({_id: "writeAfterHole"}, {writeConcern: {w: 1, j: true}}));

    const findResult = primaryColl.findOne({_id: "writeAfterHole"});
    assert.eq(findResult, {"_id": "writeAfterHole"});

    jsTest.log("Force a checkpoint so the primary has data on startup recovery after a crash");
    assert.commandWorked(primary.adminCommand({fsync: 1}));

    // Crash and restart the primary, which should truncate the second successful write, because
    // the first write never committed and left a hole in the oplog.
    rst.stop(primary, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL});
} catch (error) {
    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
    // shuts down or in post-test hooks. (Not needed on the success path: the SIGKILL above
    // already took the server, and the failpoint with it, down.)
    failPoint.off();
    throw error;
}

rst.start(primary);

// Wait for the restarted node to complete startup recovery and start accepting user requests.
// Note: no new primary will be elected because of the high election timeout set on the replica set.
assert.soonNoExcept(function() {
    const nodeState = assert.commandWorked(primary.adminCommand("replSetGetStatus")).myState;
    return nodeState == ReplSetTest.State.SECONDARY;
});

// Confirm that the write with the oplog hole behind it is now gone (truncated) as expected.
primary.setSlaveOk();
const find = primary.getDB(dbName).getCollection(collName).findOne({_id: "writeAfterHole"});
assert.eq(find, null);

rst.stopSet();
})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index ec92dcfc2e2..f3a605dbd75 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -114,6 +114,7 @@ MONGO_FAIL_POINT_DEFINE(forceSyncSourceRetryWaitForInitialSync);
MONGO_FAIL_POINT_DEFINE(waitForIsMasterResponse);
// Will cause an isMaster request to hang as it starts waiting.
MONGO_FAIL_POINT_DEFINE(hangWhileWaitingForIsMasterResponse);
+MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates);
// Number of times we tried to go live as a secondary.
Counter64 attemptsToBecomeSecondary;
@@ -1196,6 +1197,11 @@ void ReplicationCoordinatorImpl::setMyLastAppliedOpTimeAndWallTimeForward(
void ReplicationCoordinatorImpl::setMyLastDurableOpTimeAndWallTimeForward(
const OpTimeAndWallTime& opTimeAndWallTime) {
stdx::unique_lock<Latch> lock(_mutex);
+
+ if (MONGO_unlikely(skipDurableTimestampUpdates.shouldFail())) {
+ return;
+ }
+
if (opTimeAndWallTime.opTime > _getMyLastDurableOpTime_inlock()) {
_setMyLastDurableOpTimeAndWallTime(lock, opTimeAndWallTime, false);
_reportUpstream_inlock(std::move(lock));