diff options
author | XueruiFa <xuerui.fa@mongodb.com> | 2020-10-08 19:58:21 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-10-14 15:45:17 +0000 |
commit | 4d212ea86b46ec06196cc979b3f760c278958a8f (patch) | |
tree | 7491f874815f2bffa1c107c839ad07edddbf32ee | |
parent | 1f0009a389042c24360509625d50a9e3812658c7 (diff) | |
download | mongo-4d212ea86b46ec06196cc979b3f760c278958a8f.tar.gz |
SERVER-51418: Resolve race condition in StepdownShouldInterruptConfigWrite
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp | 133 |
1 file changed, 73 insertions, 60 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp index 2979160b523..b92245d47db 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp @@ -1408,66 +1408,79 @@ TEST_F(ReplCoordReconfigTest, ASSERT_OK(status); } -// TEST_F(ReplCoordReconfigTest, StepdownShouldInterruptConfigWrite) { -// // Start out in a non-initial config version. -// init(); -// auto configVersion = 2; -// assertStartSuccess(configWithMembers(configVersion, -// 0, -// BSON_ARRAY(member(1, "n1:1") -// << member(2, "n2:1") << member(3, "n3:1", -// 0))), -// HostAndPort("n1", 1)); -// ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - -// // Simulate application of one oplog entry. -// replCoordSetMyLastAppliedAndDurableOpTime(OpTime(Timestamp(1, 1), 0)); - -// // Get elected primary. -// simulateSuccessfulV1Election(); -// ASSERT_EQ(getReplCoord()->getMemberState(), MemberState::RS_PRIMARY); -// ASSERT_EQ(getReplCoord()->getTerm(), 1); - -// // Advance your optime. -// auto commitPoint = OpTime(Timestamp(2, 1), 1); -// replCoordSetMyLastAppliedAndDurableOpTime(commitPoint); -// replicateOpTo(2, commitPoint); - -// // Respond to heartbeats before reconfig. -// respondToAllHeartbeats(); - -// // Do a reconfig that should fail due to stepdown. -// configVersion = 3; -// ReplSetReconfigArgs args; -// args.newConfigObj = configWithMembers( -// configVersion, 1, BSON_ARRAY(member(1, "n1:1") << member(2, "n2:1") << member(3, -// "n3:1"))); - -// BSONObjBuilder result; -// Status status(ErrorCodes::InternalError, "Not Set"); -// const auto opCtx = makeOperationContext(); -// auto reconfigResult = stdx::async(stdx::launch::async, [&] { -// status = getReplCoord()->processReplSetReconfig(opCtx.get(), args, &result); -// }); - -// // Step down due to a higher term. 
-// TopologyCoordinator::UpdateTermResult termUpdated; -// auto updateTermEvh = getReplCoord()->updateTerm_forTest(2, &termUpdated); -// ASSERT(termUpdated == TopologyCoordinator::UpdateTermResult::kTriggerStepDown); -// ASSERT(updateTermEvh.isValid()); -// getReplExec()->waitForEvent(updateTermEvh); - -// // Respond to quorum check to resume the reconfig. We keep responding until the reconfig -// thread -// // finishes. -// while (stdx::future_status::ready != -// reconfigResult.wait_for(Milliseconds::zero().toSystemDuration())) { -// respondToAllHeartbeats(); -// } - -// ASSERT_EQ(status.code(), ErrorCodes::NotWritablePrimary); -// ASSERT_EQ(status.reason(), "Stepped down when persisting new config"); -// } +TEST_F(ReplCoordReconfigTest, StepdownShouldInterruptConfigWrite) { + // Start out in a non-initial config version. + init(); + auto configVersion = 2; + assertStartSuccess(configWithMembers(configVersion, + 0, + BSON_ARRAY(member(1, "n1:1") + << member(2, "n2:1") << member(3, "n3:1", 0))), + HostAndPort("n1", 1)); + ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + + // Simulate application of one oplog entry. + replCoordSetMyLastAppliedAndDurableOpTime(OpTime(Timestamp(1, 1), 0)); + + // Get elected primary. + simulateSuccessfulV1Election(); + ASSERT_EQ(getReplCoord()->getMemberState(), MemberState::RS_PRIMARY); + ASSERT_EQ(getReplCoord()->getTerm(), 1); + + // Advance your optime. + auto commitPoint = OpTime(Timestamp(2, 1), 1); + replCoordSetMyLastAppliedAndDurableOpTime(commitPoint); + replicateOpTo(2, commitPoint); + + // Respond to heartbeats before reconfig. + respondToAllHeartbeats(); + + // Do a reconfig that should fail due to stepdown. 
+ configVersion = 3; + ReplSetReconfigArgs args; + args.newConfigObj = configWithMembers( + configVersion, 1, BSON_ARRAY(member(1, "n1:1") << member(2, "n2:1") << member(3, "n3:1"))); + + // Turn on a reconfig hang to ensure that the reconfig thread will be scheduled before the node + // is stepped down. If the node is stepped down before reconfig is initiated, the reconfig will + // fail with a different reason than expected. + auto hangReconfig = globalFailPointRegistry().find("ReconfigHangBeforeConfigValidationCheck"); + auto timesEnteredFailPoint = hangReconfig->setMode(FailPoint::alwaysOn); + + BSONObjBuilder result; + Status status(ErrorCodes::InternalError, "Not Set"); + const auto opCtx = makeOperationContext(); + auto reconfigResult = stdx::async(stdx::launch::async, [&] { + status = getReplCoord()->processReplSetReconfig(opCtx.get(), args, &result); + }); + + // Ensure the reconfig thread has started before stepping down as primary. + hangReconfig->waitForTimesEntered(timesEnteredFailPoint + 1); + hangReconfig->setMode(FailPoint::off); + + // The failpoint should be released now. We run the clock forward so that the reconfig can + // continue. + getNet()->enterNetwork(); + getNet()->runUntil(getNet()->now() + Milliseconds(200)); + getNet()->exitNetwork(); + + // Step down due to a higher term. + TopologyCoordinator::UpdateTermResult termUpdated; + auto updateTermEvh = getReplCoord()->updateTerm_forTest(2, &termUpdated); + ASSERT(termUpdated == TopologyCoordinator::UpdateTermResult::kTriggerStepDown); + ASSERT(updateTermEvh.isValid()); + getReplExec()->waitForEvent(updateTermEvh); + + // Respond to quorum check to resume the reconfig. We keep responding until the reconfig thread + // finishes. 
+ while (stdx::future_status::ready != + reconfigResult.wait_for(Milliseconds::zero().toSystemDuration())) { + respondToAllHeartbeats(); + } + + ASSERT_EQ(status.code(), ErrorCodes::NotWritablePrimary); + ASSERT_EQ(status.reason(), "Stepped down when persisting new config"); +} TEST_F(ReplCoordReconfigTest, StartElectionOnReconfigToSingleNode) { // Start up as a secondary. |