summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDianna Hohensee <dianna.hohensee@mongodb.com>2020-02-26 13:14:49 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-02-27 13:54:12 +0000
commit3a59e217a4b34234acbd6a404f98276a87435ee0 (patch)
tree0e0890c6e5bd0db9d962b694acf0f3ee5edb19b5
parent77c6d1044cff7b113881a1c97f5dca63567fbe81 (diff)
downloadmongo-3a59e217a4b34234acbd6a404f98276a87435ee0.tar.gz
SERVER-41386 Test that the replica set's majority commit point can move forward because of secondaries without primary durable writes
SERVER-41387 Test that oplogTruncateAfterPoint will clear oplog holes during startup recovery after primary crash create mode 100644 jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js create mode 100644 jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
-rw-r--r--jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js121
-rw-r--r--jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js93
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp6
3 files changed, 220 insertions, 0 deletions
diff --git a/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js b/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
new file mode 100644
index 00000000000..e5a1b93396c
--- /dev/null
+++ b/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
@@ -0,0 +1,121 @@
/**
 * Tests that non-durable writes on the primary can be successfully majority confirmed by the
 * secondaries.
 *
 * Skipping persistence on the primary will hold back its durable timestamp used for cross-replica
 * set write concern confirmation.
 *
 * First tests that writes can be majority confirmed without the primary by two secondaries.
 * Then tests that writes cannot be majority confirmed without the primary and only one secondary.
 *
 * @tags: [
 *   # inMemory has journaling off, so {j:true} writes are not allowed.
 *   requires_journaling,
 * ]
 */

(function() {
"use strict";

load("jstests/libs/fail_point_util.js");

const rst = new ReplSetTest({name: jsTest.name(), nodes: 3});
rst.startSet();
// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we would
// set the secondary node votes to 0, but that would affect the feature that is being tested.
rst.initiateWithHighElectionTimeout();

const primary = rst.getPrimary();
const dbName = "testDB";
const collName = jsTest.name();
const primaryDB = primary.getDB(dbName);
const primaryColl = primaryDB[collName];

assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));

// Do a write and then fetch and save the durable and majority timestamps on the primary.
// Use {w: 3, j: true} write concern to make sure the timestamps are stable: all three nodes
// have journaled the write, so durable == majority on the primary at this point.
const res = assert.commandWorked(
    primaryColl.insert({_id: "writeAllDurable"}, {writeConcern: {w: 3, j: true}}));
const primaryReplSetStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
const primaryPreFailPointDurableTs = primaryReplSetStatus.optimes.durableOpTime.ts;
const primaryPreFailPointMajorityTs = primaryReplSetStatus.optimes.readConcernMajorityOpTime.ts;
assert.neq(primaryPreFailPointDurableTs, null);
assert.neq(primaryPreFailPointMajorityTs, null);
assert.eq(primaryPreFailPointDurableTs, primaryPreFailPointMajorityTs);

// Configure the primary to stop moving the durable timestamp forward. The primary will no longer be
// able to contribute to moving the replica set's majority timestamp forward because the replica
// set's majority timestamp depends upon each member's durable timestamp.
const failPoint = configureFailPoint(primaryDB, "skipDurableTimestampUpdates");

try {
    // Perform some writes with majority write concern. The primary cannot confirm them, so success
    // means that the secondaries have the writes durably.
    jsTestLog("Writes majority confirmed by secondaries.");
    assert.commandWorked(
        primaryColl.insert({_id: "majority1"}, {writeConcern: {w: "majority", j: true}}));
    assert.commandWorked(
        primaryColl.insert({_id: "majority2"}, {writeConcern: {w: "majority", j: true}}));

    // Check that the primary's durable timestamp has not moved forward, but the majority point has.
    const primaryStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
    const primaryPostWritesDurableTs = primaryStatus.optimes.durableOpTime.ts;
    const primaryPostWritesMajorityTs = primaryStatus.optimes.readConcernMajorityOpTime.ts;
    assert.eq(primaryPostWritesDurableTs, primaryPreFailPointDurableTs);
    assert.gt(primaryPostWritesMajorityTs, primaryPreFailPointDurableTs);

    // Check that the secondaries' durable and majority timestamps have moved forward.
    rst.getSecondaries().forEach(function(secondary) {
        const secondaryStatus = assert.commandWorked(secondary.adminCommand("replSetGetStatus"));
        const secondaryDurableTs = secondaryStatus.optimes.durableOpTime.ts;
        const secondaryMajorityTs = secondaryStatus.optimes.readConcernMajorityOpTime.ts;
        assert.eq(secondaryDurableTs, secondaryMajorityTs);
        assert.gt(secondaryDurableTs, primaryPreFailPointDurableTs);
        assert.eq(secondaryMajorityTs, primaryPostWritesMajorityTs);
    });

    // Shutdown a secondary so that there is no longer a majority able to confirm the durability of
    // a write.
    jsTestLog("Stopping one of the two secondaries");
    let secondaries = rst.getSecondaries();
    assert.eq(secondaries.length, 2);
    let stoppedSecondary = secondaries[0];
    let runningSecondary = secondaries[1];
    rst.stop(stoppedSecondary);

    // Now writes cannot reach majority without the primary. We will do {w: 2, j: false} writes to
    // get the writes on both remaining nodes. Then follow up with fsync commands against the two
    // nodes to make sure the durable timestamps move forward if possible -- this will work only on
    // the secondary, the primary's durable timestamp will not move (held back by the failpoint).
    jsTestLog("Writes cannot become majority confirmed.");
    assert.commandWorked(
        primaryColl.insert({_id: "noMajority1"}, {writeConcern: {w: 2, j: false}}));
    assert.commandWorked(
        primaryColl.insert({_id: "noMajority2"}, {writeConcern: {w: 2, j: false}}));

    jsTest.log("Force checkpoints to move the durable timestamps forward");
    assert.commandWorked(primary.adminCommand({fsync: 1}));
    assert.commandWorked(runningSecondary.adminCommand({fsync: 1}));

    // Check that the primary's durable and majority timestamps have not moved forward.
    const primaryReplStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
    const primaryPostFsyncDurableTs = primaryReplStatus.optimes.durableOpTime.ts;
    const primaryPostFsyncMajorityTs = primaryReplStatus.optimes.readConcernMajorityOpTime.ts;
    assert.eq(primaryPostFsyncDurableTs, primaryPreFailPointDurableTs);
    assert.eq(primaryPostFsyncMajorityTs, primaryPostWritesMajorityTs);

    // Check that the secondary's durable timestamp has moved forward, but the majority has not
    // (one durable node out of three is not a majority).
    const secondaryStatus = assert.commandWorked(runningSecondary.adminCommand("replSetGetStatus"));
    const secondaryDurableTs = secondaryStatus.optimes.durableOpTime.ts;
    const secondaryMajorityTs = secondaryStatus.optimes.readConcernMajorityOpTime.ts;
    assert.gt(secondaryDurableTs, primaryPostFsyncMajorityTs);
    assert.eq(secondaryMajorityTs, primaryPostFsyncMajorityTs);
} finally {
    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
    // shuts down or in post-test hooks.
    failPoint.off();
}

rst.stopSet();
})();
diff --git a/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
new file mode 100644
index 00000000000..14d50245f98
--- /dev/null
+++ b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
@@ -0,0 +1,93 @@
/**
 * Test that a confirmed write against a primary with oplog holes behind it when a crash occurs will
 * be truncated on startup recovery.
 *
 * There must be more than 1 voting node, otherwise the write concern behavior changes to waiting
 * for no holes for writes with {j: true} write concern, and no confirmed writes will be truncated.
 *
 * @tags: [
 *   # The primary is restarted and must retain its data.
 *   requires_persistence,
 * ]
 */

(function() {
"use strict";

load("jstests/libs/fail_point_util.js");

const rst = new ReplSetTest({name: jsTest.name(), nodes: 2});
rst.startSet();
// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we would
// set the secondary node votes to 0, but that would affect the feature that is being tested.
rst.initiateWithHighElectionTimeout();

const primary = rst.getPrimary();
const dbName = "testDB";
const collName = jsTest.name();
const primaryDB = primary.getDB(dbName);
const primaryColl = primaryDB[collName];

assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));

// Make inserts into the test collection with _id "b" hang mid-operation, creating an oplog hole.
const failPoint = configureFailPoint(primaryDB,
                                     "hangAfterCollectionInserts",
                                     {collectionNS: primaryColl.getFullName(), first_id: "b"});

try {
    // Hold back the durable timestamp by leaving a hanging, unfinished insert (blocked on the
    // failpoint above) that creates a hole in the oplog.

    TestData.dbName = dbName;
    TestData.collName = collName;

    startParallelShell(() => {
        jsTestLog("Insert a document that will hang before the insert completes.");
        // Crashing the server while this command is running may cause the parallel shell code to
        // error and stop executing. We will therefore ignore the result of this command and
        // parallel shell. Test correctness is guaranteed by waiting for the failpoint this command
        // hits.
        db.getSiblingDB(TestData.dbName)[TestData.collName].insert({_id: "b"});
    }, primary.port);

    jsTest.log("Wait for async insert to hit the failpoint.");
    failPoint.wait();

    // Execute an insert with confirmation that it made it to disk ({j: true}).
    //
    // The primary's durable timestamp should be pinned by the prior hanging unfinished write. So
    // this second write will have an oplog hole behind it and will be truncated after a crash.
    assert.commandWorked(
        primaryColl.insert({_id: "writeAfterHole"}, {writeConcern: {w: 1, j: true}}));

    const findResult = primaryColl.findOne({_id: "writeAfterHole"});
    assert.eq(findResult, {"_id": "writeAfterHole"});

    jsTest.log("Force a checkpoint so the primary has data on startup recovery after a crash");
    assert.commandWorked(primary.adminCommand({fsync: 1}));

    // Crash and restart the primary, which should truncate the second successful write, because
    // the first write never committed and left a hole in the oplog.
    rst.stop(primary, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL});
} catch (error) {
    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
    // shuts down or in post-test hooks. (Not needed on the success path: the SIGKILL above
    // already took the server, and the failpoint with it, down.)
    failPoint.off();
    throw error;
}

rst.start(primary);

// Wait for the restarted node to complete startup recovery and start accepting user requests.
// Note: no new primary will be elected because of the high election timeout set on the replica set.
assert.soonNoExcept(function() {
    const nodeState = assert.commandWorked(primary.adminCommand("replSetGetStatus")).myState;
    return nodeState == ReplSetTest.State.SECONDARY;
});

// Confirm that the write with the oplog hole behind it is now gone (truncated) as expected.
primary.setSlaveOk();
const find = primary.getDB(dbName).getCollection(collName).findOne({_id: "writeAfterHole"});
assert.eq(find, null);

rst.stopSet();
})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index ec92dcfc2e2..f3a605dbd75 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -114,6 +114,7 @@ MONGO_FAIL_POINT_DEFINE(forceSyncSourceRetryWaitForInitialSync);
MONGO_FAIL_POINT_DEFINE(waitForIsMasterResponse);
// Will cause an isMaster request to hang as it starts waiting.
MONGO_FAIL_POINT_DEFINE(hangWhileWaitingForIsMasterResponse);
+MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates);
// Number of times we tried to go live as a secondary.
Counter64 attemptsToBecomeSecondary;
@@ -1196,6 +1197,11 @@ void ReplicationCoordinatorImpl::setMyLastAppliedOpTimeAndWallTimeForward(
void ReplicationCoordinatorImpl::setMyLastDurableOpTimeAndWallTimeForward(
const OpTimeAndWallTime& opTimeAndWallTime) {
stdx::unique_lock<Latch> lock(_mutex);
+
+ if (MONGO_unlikely(skipDurableTimestampUpdates.shouldFail())) {
+ return;
+ }
+
if (opTimeAndWallTime.opTime > _getMyLastDurableOpTime_inlock()) {
_setMyLastDurableOpTimeAndWallTime(lock, opTimeAndWallTime, false);
_reportUpstream_inlock(std::move(lock));