commit 3a59e217a4b34234acbd6a404f98276a87435ee0
tree   0e0890c6e5bd0db9d962b694acf0f3ee5edb19b5
parent 77c6d1044cff7b113881a1c97f5dca63567fbe81
author    Dianna Hohensee <dianna.hohensee@mongodb.com>  2020-02-26 13:14:49 -0500
committer Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-02-27 13:54:12 +0000
SERVER-41386 Test that the replica set's majority commit point can move forward because of secondaries without primary durable writes
SERVER-41387 Test that oplogTruncateAfterPoint will clear oplog holes during startup recovery after primary crash
 3 files changed, 220 insertions(+), 0 deletions(-)
 create mode 100644 jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
 create mode 100644 jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
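
Both new tests compare two timestamps reported by replSetGetStatus: the node-local durable optime and the replica set's majority commit point. For orientation, a minimal sketch (not part of the commit; `conn` is a placeholder for a connection to any replica set member) of the fields the tests read:

    // Sketch: read the two optimes the tests below compare. Field paths
    // match the test code; `conn` is a hypothetical member connection.
    const status = assert.commandWorked(conn.adminCommand("replSetGetStatus"));
    const durableTs = status.optimes.durableOpTime.ts;               // last journaled write on this node
    const majorityTs = status.optimes.readConcernMajorityOpTime.ts;  // majority commit point
    jsTestLog("durable: " + tojson(durableTs) + ", majority: " + tojson(majorityTs));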
diff --git a/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js b/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
new file mode 100644
index 00000000000..e5a1b93396c
--- /dev/null
+++ b/jstests/noPassthrough/non_durable_writes_on_primary_can_reach_majority.js
@@ -0,0 +1,121 @@
+/**
+ * Tests that non-durable writes on the primary can be successfully majority confirmed by the
+ * secondaries.
+ *
+ * Skipping persistence on the primary will hold back its durable timestamp used for cross-replica
+ * set write concern confirmation.
+ *
+ * First tests that writes can be majority confirmed by two secondaries without the primary.
+ * Then tests that writes cannot be majority confirmed by only one secondary without the primary.
+ *
+ * @tags: [
+ *   # inMemory has journaling off, so {j: true} writes are not allowed.
+ *   requires_journaling,
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const rst = new ReplSetTest({name: jsTest.name(), nodes: 3});
+rst.startSet();
+// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we
+// would set the secondary node votes to 0, but that would affect the feature being tested.
+rst.initiateWithHighElectionTimeout();
+
+const primary = rst.getPrimary();
+const dbName = "testDB";
+const collName = jsTest.name();
+const primaryDB = primary.getDB(dbName);
+const primaryColl = primaryDB[collName];
+
+assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));
+
+// Do a write, then fetch and save the durable and majority timestamps on the primary.
+// Use {w: 3, j: true} write concern to make sure the timestamps are stable.
+const res = assert.commandWorked(
+    primaryColl.insert({_id: "writeAllDurable"}, {writeConcern: {w: 3, j: true}}));
+const primaryReplSetStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
+const primaryPreFailPointDurableTs = primaryReplSetStatus.optimes.durableOpTime.ts;
+const primaryPreFailPointMajorityTs = primaryReplSetStatus.optimes.readConcernMajorityOpTime.ts;
+assert.neq(primaryPreFailPointDurableTs, null);
+assert.neq(primaryPreFailPointMajorityTs, null);
+assert.eq(primaryPreFailPointDurableTs, primaryPreFailPointMajorityTs);
+
+// Configure the primary to stop moving the durable timestamp forward. The primary will no longer
+// be able to contribute to moving the replica set's majority timestamp forward, because the
+// replica set's majority timestamp depends upon each member's durable timestamp.
+const failPoint = configureFailPoint(primaryDB, "skipDurableTimestampUpdates");
+
+try {
+    // Perform some writes with majority write concern. The primary cannot confirm them, so
+    // success means that the secondaries have the writes durably.
+    jsTestLog("Writes majority confirmed by secondaries.");
+    assert.commandWorked(
+        primaryColl.insert({_id: "majority1"}, {writeConcern: {w: "majority", j: true}}));
+    assert.commandWorked(
+        primaryColl.insert({_id: "majority2"}, {writeConcern: {w: "majority", j: true}}));
+
+    // Check that the primary's durable timestamp has not moved forward, but the majority point
+    // has.
+    const primaryStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
+    const primaryPostWritesDurableTs = primaryStatus.optimes.durableOpTime.ts;
+    const primaryPostWritesMajorityTs = primaryStatus.optimes.readConcernMajorityOpTime.ts;
+    assert.eq(primaryPostWritesDurableTs, primaryPreFailPointDurableTs);
+    assert.gt(primaryPostWritesMajorityTs, primaryPreFailPointDurableTs);
+
+    // Check that the secondaries' durable and majority timestamps have moved forward.
+    rst.getSecondaries().forEach(function(secondary) {
+        const secondaryStatus = assert.commandWorked(secondary.adminCommand("replSetGetStatus"));
+        const secondaryDurableTs = secondaryStatus.optimes.durableOpTime.ts;
+        const secondaryMajorityTs = secondaryStatus.optimes.readConcernMajorityOpTime.ts;
+        assert.eq(secondaryDurableTs, secondaryMajorityTs);
+        assert.gt(secondaryDurableTs, primaryPreFailPointDurableTs);
+        assert.eq(secondaryMajorityTs, primaryPostWritesMajorityTs);
+    });
+
+    // Shut down a secondary so that there is no longer a majority able to confirm the durability
+    // of a write.
+    jsTestLog("Stopping one of the two secondaries");
+    let secondaries = rst.getSecondaries();
+    assert.eq(secondaries.length, 2);
+    let stoppedSecondary = secondaries[0];
+    let runningSecondary = secondaries[1];
+    rst.stop(stoppedSecondary);
+
+    // Now writes cannot reach majority without the primary. Do {w: 2, j: false} writes to get
+    // the writes onto both remaining nodes, then follow up with fsync commands against the two
+    // nodes to move the durable timestamps forward where possible -- this will work only on the
+    // secondary; the primary's durable timestamp will not move.
+    jsTestLog("Writes cannot become majority confirmed.");
+    assert.commandWorked(
+        primaryColl.insert({_id: "noMajority1"}, {writeConcern: {w: 2, j: false}}));
+    assert.commandWorked(
+        primaryColl.insert({_id: "noMajority2"}, {writeConcern: {w: 2, j: false}}));
+
+    jsTest.log("Force checkpoints to move the durable timestamps forward");
+    assert.commandWorked(primary.adminCommand({fsync: 1}));
+    assert.commandWorked(runningSecondary.adminCommand({fsync: 1}));
+
+    // Check that the primary's durable and majority timestamps have not moved forward.
+    const primaryReplStatus = assert.commandWorked(primary.adminCommand("replSetGetStatus"));
+    const primaryPostFsyncDurableTs = primaryReplStatus.optimes.durableOpTime.ts;
+    const primaryPostFsyncMajorityTs = primaryReplStatus.optimes.readConcernMajorityOpTime.ts;
+    assert.eq(primaryPostFsyncDurableTs, primaryPreFailPointDurableTs);
+    assert.eq(primaryPostFsyncMajorityTs, primaryPostWritesMajorityTs);
+
+    // Check that the secondary's durable timestamp has moved forward, but the majority has not.
+    const secondaryStatus =
+        assert.commandWorked(runningSecondary.adminCommand("replSetGetStatus"));
+    const secondaryDurableTs = secondaryStatus.optimes.durableOpTime.ts;
+    const secondaryMajorityTs = secondaryStatus.optimes.readConcernMajorityOpTime.ts;
+    assert.gt(secondaryDurableTs, primaryPostFsyncMajorityTs);
+    assert.eq(secondaryMajorityTs, primaryPostFsyncMajorityTs);
+} finally {
+    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
+    // shuts down or in post-test hooks.
+    failPoint.off();
+}
+
+rst.stopSet();
+})();
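
Both tests drive the server through the `configureFailPoint` helper from `jstests/libs/fail_point_util.js`. A usage sketch, assuming only the interface exercised above (`wait()` blocks until the server hits the failpoint; `off()` disables it); the connection, namespace, and data document are illustrative:

    // Sketch of the fail_point_util.js helper pattern used by both tests.
    load("jstests/libs/fail_point_util.js");
    const fp = configureFailPoint(conn,  // hypothetical connection
                                  "hangAfterCollectionInserts",
                                  {collectionNS: "testDB.coll", first_id: "b"});
    // ...start the operation that trips the failpoint, e.g. in a parallel shell...
    fp.wait();  // block until the server reports the failpoint was entered
    fp.off();   // always disable it again so shutdown and hooks do not hang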
diff --git a/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
new file mode 100644
index 00000000000..14d50245f98
--- /dev/null
+++ b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js
@@ -0,0 +1,93 @@
+/**
+ * Tests that a write confirmed on the primary while an oplog hole remains behind it will be
+ * truncated during startup recovery if the primary crashes.
+ *
+ * There must be more than one voting node; otherwise the write concern behavior changes to
+ * waiting for no holes for writes with {j: true} write concern, and no confirmed writes will be
+ * truncated.
+ *
+ * @tags: [
+ *   # The primary is restarted and must retain its data.
+ *   requires_persistence,
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const rst = new ReplSetTest({name: jsTest.name(), nodes: 2});
+rst.startSet();
+// Make sure there are no election timeouts. This should prevent primary stepdown. Normally we
+// would set the secondary node votes to 0, but that would affect the feature being tested.
+rst.initiateWithHighElectionTimeout();
+
+const primary = rst.getPrimary();
+const dbName = "testDB";
+const collName = jsTest.name();
+const primaryDB = primary.getDB(dbName);
+const primaryColl = primaryDB[collName];
+
+assert.commandWorked(primaryDB.createCollection(collName, {writeConcern: {w: "majority"}}));
+
+const failPoint = configureFailPoint(primaryDB,
+                                     "hangAfterCollectionInserts",
+                                     {collectionNS: primaryColl.getFullName(), first_id: "b"});
+
+try {
+    // Hold back the durable timestamp by leaving an uncommitted transaction hanging.
+
+    TestData.dbName = dbName;
+    TestData.collName = collName;
+
+    startParallelShell(() => {
+        jsTestLog("Insert a document that will hang before the insert completes.");
+        // Crashing the server while this command is running may cause the parallel shell code to
+        // error and stop executing. We will therefore ignore the result of this command and of
+        // the parallel shell. Test correctness is guaranteed by waiting for the failpoint this
+        // command hits.
+        db.getSiblingDB(TestData.dbName)[TestData.collName].insert({_id: "b"});
+    }, primary.port);
+
+    jsTest.log("Wait for the async insert to hit the failpoint.");
+    failPoint.wait();
+
+    // Execute an insert with confirmation that it made it to disk ({j: true}).
+    //
+    // The primary's durable timestamp should be pinned by the prior hanging uncommitted write,
+    // so this second write will have an oplog hole behind it and will be truncated after a
+    // crash.
+    assert.commandWorked(
+        primaryColl.insert({_id: "writeAfterHole"}, {writeConcern: {w: 1, j: true}}));
+
+    const findResult = primaryColl.findOne({_id: "writeAfterHole"});
+    assert.eq(findResult, {"_id": "writeAfterHole"});
+
+    jsTest.log("Force a checkpoint so the primary has data on startup recovery after a crash");
+    assert.commandWorked(primary.adminCommand({fsync: 1}));
+
+    // Crash and restart the primary, which should truncate the second successful write, because
+    // the first write never committed and left a hole in the oplog.
+    rst.stop(primary, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL});
+} catch (error) {
+    // Turn off the failpoint before allowing the test to end, so nothing hangs while the server
+    // shuts down or in post-test hooks.
+    failPoint.off();
+    throw error;
+}
+
+rst.start(primary);
+
+// Wait for the restarted node to complete startup recovery and start accepting user requests.
+// Note: no new primary will be elected because of the high election timeout set on the replica
+// set.
+assert.soonNoExcept(function() {
+    const nodeState = assert.commandWorked(primary.adminCommand("replSetGetStatus")).myState;
+    return nodeState == ReplSetTest.State.SECONDARY;
+});
+
+// Confirm that the write with the oplog hole behind it is now gone (truncated) as expected.
+primary.setSlaveOk();
+const find = primary.getDB(dbName).getCollection(collName).findOne({_id: "writeAfterHole"});
+assert.eq(find, null);
+
+rst.stopSet();
+})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index ec92dcfc2e2..f3a605dbd75 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -114,6 +114,7 @@ MONGO_FAIL_POINT_DEFINE(forceSyncSourceRetryWaitForInitialSync);
 MONGO_FAIL_POINT_DEFINE(waitForIsMasterResponse);
 // Will cause an isMaster request to hang as it starts waiting.
 MONGO_FAIL_POINT_DEFINE(hangWhileWaitingForIsMasterResponse);
+MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates);
 
 // Number of times we tried to go live as a secondary.
 Counter64 attemptsToBecomeSecondary;
@@ -1196,6 +1197,11 @@ void ReplicationCoordinatorImpl::setMyLastAppliedOpTimeAndWallTimeForward(
 void ReplicationCoordinatorImpl::setMyLastDurableOpTimeAndWallTimeForward(
     const OpTimeAndWallTime& opTimeAndWallTime) {
     stdx::unique_lock<Latch> lock(_mutex);
+
+    if (MONGO_unlikely(skipDurableTimestampUpdates.shouldFail())) {
+        return;
+    }
+
     if (opTimeAndWallTime.opTime > _getMyLastDurableOpTime_inlock()) {
         _setMyLastDurableOpTimeAndWallTime(lock, opTimeAndWallTime, false);
         _reportUpstream_inlock(std::move(lock));
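
The server-side change defines the `skipDurableTimestampUpdates` failpoint and consults it before advancing the node's last durable optime, which is what pins the primary's durable timestamp in the first test. For reference, a hedged sketch of toggling the failpoint with the raw configureFailPoint admin command (equivalent to the helper calls in the tests; `primary` is a connection to the node under test):

    // Sketch: toggle the new failpoint directly via the admin command.
    assert.commandWorked(primary.adminCommand(
        {configureFailPoint: "skipDurableTimestampUpdates", mode: "alwaysOn"}));
    // While enabled, setMyLastDurableOpTimeAndWallTimeForward() returns early, so
    // this node's durable optime -- and its contribution to the majority commit
    // point -- stops advancing.
    assert.commandWorked(primary.adminCommand(
        {configureFailPoint: "skipDurableTimestampUpdates", mode: "off"}));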