diff options
author | Pavithra Vetriselvan <pavithra.vetriselvan@mongodb.com> | 2020-02-27 22:02:01 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2020-02-27 22:02:01 +0000 |
commit | 9286617bbc5f9087f3774fc7d4dd4d366c584ae2 (patch) | |
tree | 0c15b9278037e970eb34ad8852f2e0fc89572129 /src | |
parent | c01de187585576180188d2598e5229a93a0743ed (diff) | |
download | mongo-9286617bbc5f9087f3774fc7d4dd4d366c584ae2.tar.gz |
SERVER-45085 re-enable config quorum checker
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp | 259 |
2 files changed, 150 insertions, 115 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 191d514630c..075485476c9 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -115,6 +115,8 @@ MONGO_FAIL_POINT_DEFINE(waitForIsMasterResponse); // Will cause an isMaster request to hang as it starts waiting. MONGO_FAIL_POINT_DEFINE(hangWhileWaitingForIsMasterResponse); MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates); +// Will cause a non-force reconfig to skip the config quorum check. +MONGO_FAIL_POINT_DEFINE(omitConfigQuorumCheck); // Number of times we tried to go live as a secondary. Counter64 attemptsToBecomeSecondary; @@ -3033,9 +3035,7 @@ Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* opCt "replSetReconfig config object with {numMembers} members parses ok", "numMembers"_attr = newConfig.getNumMembers()); - if (!args.force && - !serverGlobalParams.featureCompatibility.isVersion( - ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo44)) { + if (!args.force && !MONGO_unlikely(omitConfigQuorumCheck.shouldFail())) { status = checkQuorumForReconfig( _replExecutor.get(), newConfig, myIndex.getValue(), _topCoord->getTerm()); if (!status.isOK()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp index 73832a41225..5892b316d4a 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp @@ -265,49 +265,48 @@ void doReplSetReconfig(ReplicationCoordinatorImpl* replCoord, *status = replCoord->processReplSetReconfig(opCtx, args, &garbage); } -// TEST_F(ReplCoordTest, -// NodeReturnsNewReplicaSetConfigurationIncompatibleWhenQuorumCheckFailsDuringReconfig) { -// // start up, become primary, fail during quorum check due to a heartbeat -// // 
containing a higher config version -// assertStartSuccess(BSON("_id" -// << "mySet" -// << "version" << 2 << "members" -// << BSON_ARRAY(BSON("_id" << 1 << "host" -// << "node1:12345") -// << BSON("_id" << 2 << "host" -// << "node2:12345"))), -// HostAndPort("node1", 12345)); -// ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); -// replCoordSetMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); -// replCoordSetMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); -// simulateSuccessfulV1Election(); - -// Status status(ErrorCodes::InternalError, "Not Set"); -// const auto opCtx = makeOperationContext(); -// stdx::thread reconfigThread([&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get()); -// }); - -// NetworkInterfaceMock* net = getNet(); -// getNet()->enterNetwork(); -// const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); -// const RemoteCommandRequest& request = noi->getRequest(); -// repl::ReplSetHeartbeatArgsV1 hbArgs; -// ASSERT_OK(hbArgs.initialize(request.cmdObj)); -// repl::ReplSetHeartbeatResponse hbResp; -// hbResp.setSetName("mySet"); -// hbResp.setState(MemberState::RS_SECONDARY); -// hbResp.setConfigVersion(5); -// BSONObjBuilder respObj; -// hbResp.setAppliedOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); -// hbResp.setDurableOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); -// respObj << "ok" << 1; -// hbResp.addToBSON(&respObj); -// net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); -// net->runReadyNetworkOperations(); -// getNet()->exitNetwork(); -// reconfigThread.join(); -// ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); -// } +TEST_F(ReplCoordTest, + NodeReturnsNewReplicaSetConfigurationIncompatibleWhenQuorumCheckFailsDuringReconfig) { + // start up, become primary, fail during quorum check due to a heartbeat + // containing a 
higher config version + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + replCoordSetMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); + replCoordSetMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); + simulateSuccessfulV1Election(); + + Status status(ErrorCodes::InternalError, "Not Set"); + const auto opCtx = makeOperationContext(); + stdx::thread reconfigThread([&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get()); }); + + NetworkInterfaceMock* net = getNet(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const RemoteCommandRequest& request = noi->getRequest(); + repl::ReplSetHeartbeatArgsV1 hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + repl::ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setConfigVersion(5); + BSONObjBuilder respObj; + hbResp.setAppliedOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); + hbResp.setDurableOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); + reconfigThread.join(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); +} TEST_F(ReplCoordTest, NodeReturnsOutOfDiskSpaceWhenSavingANewConfigFailsDuringReconfig) { // start up, become primary, saving the config fails @@ -338,6 +337,7 @@ TEST_F(ReplCoordTest, NodeReturnsOutOfDiskSpaceWhenSavingANewConfigFailsDuringRe const auto opCtx = makeOperationContext(); 
stdx::thread reconfigThread([&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get()); }); + replyToReceivedHeartbeatV1(); reconfigThread.join(); ASSERT_EQUALS(ErrorCodes::OutOfDiskSpace, status); } @@ -451,6 +451,9 @@ TEST_F(ReplCoordTest, PrimaryNodeAcceptsNewConfigWhenReceivingAReconfigWithAComp stdx::thread reconfigThread( [&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get(), OpTime::kInitialTerm); }); + // Satisfy quorum check. + replyToReceivedHeartbeatV1(); + // Receive heartbeat from secondary saying that it has replicated the new config (v: 3, t: 1). // This should allow us to finish waiting for the config majority. NetworkInterfaceMock* net = getNet(); @@ -507,7 +510,29 @@ TEST_F(ReplCoordTest, OverrideReconfigBsonTermSoReconfigSucceeds) { stdx::thread reconfigThread( [&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get(), 50 /* incorrect term */); }); + // Satisfy quorum check. replyToReceivedHeartbeatV1(); + // Satisfy config replication check. + auto net = getNet(); + enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const RemoteCommandRequest& request = noi->getRequest(); + repl::ReplSetHeartbeatArgsV1 hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + repl::ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setConfigVersion(3); + hbResp.setConfigTerm(1); + BSONObjBuilder respObj; + hbResp.setAppliedOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); + hbResp.setDurableOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + net->runReadyNetworkOperations(); + exitNetwork(); + reconfigThread.join(); ASSERT_OK(status); @@ -578,73 +603,72 @@ TEST_F( 
globalFailPointRegistry().find("blockHeartbeatReconfigFinish")->setMode(FailPoint::off); } -// TEST_F(ReplCoordTest, NodeDoesNotAcceptHeartbeatReconfigWhileInTheMidstOfReconfig) { -// // start up, become primary, reconfig, while reconfigging receive reconfig via heartbeat -// assertStartSuccess(BSON("_id" -// << "mySet" -// << "version" << 2 << "members" -// << BSON_ARRAY(BSON("_id" << 1 << "host" -// << "node1:12345") -// << BSON("_id" << 2 << "host" -// << "node2:12345"))), -// HostAndPort("node1", 12345)); -// ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); -// replCoordSetMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); -// replCoordSetMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); -// simulateSuccessfulV1Election(); -// ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - -// // start reconfigThread -// Status status(ErrorCodes::InternalError, "Not Set"); -// const auto opCtx = makeOperationContext(); -// stdx::thread reconfigThread([&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get()); -// }); - -// // wait for reconfigThread to create network requests to ensure the replication coordinator -// // is in state kConfigReconfiguring -// NetworkInterfaceMock* net = getNet(); -// net->enterNetwork(); -// net->blackHole(net->getNextReadyRequest()); - -// // schedule hb reconfig -// net->runUntil(net->now() + Seconds(10)); // run until we've sent a heartbeat request -// const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); -// ReplSetHeartbeatResponse hbResp; -// ReplSetConfig config; -// config -// .initialize(BSON("_id" -// << "mySet" -// << "version" << 4 << "members" -// << BSON_ARRAY(BSON("_id" << 1 << "host" -// << "node1:12345") -// << BSON("_id" << 2 << "host" -// << "node2:12345")))) -// .transitional_ignore(); -// hbResp.setConfig(config); -// hbResp.setConfigVersion(4); -// hbResp.setSetName("mySet"); -// 
hbResp.setState(MemberState::RS_SECONDARY); -// hbResp.setAppliedOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); -// hbResp.setDurableOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); -// BSONObjBuilder respObj2; -// respObj2 << "ok" << 1; -// hbResp.addToBSON(&respObj2); -// net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj2.obj())); - -// setMinimumLoggedSeverity(logger::LogSeverity::Debug(1)); -// startCapturingLogMessages(); -// // execute hb reconfig, which should fail with a log message; confirmed at end of test -// net->runReadyNetworkOperations(); -// // respond to reconfig's quorum check so that we can join that thread and exit cleanly -// net->exitNetwork(); -// stopCapturingLogMessages(); -// ASSERT_EQUALS(1, -// countTextFormatLogLinesContaining( -// "because already in the midst of a configuration process")); -// shutdown(opCtx.get()); -// reconfigThread.join(); -// setMinimumLoggedSeverity(logger::LogSeverity::Log()); -// } +TEST_F(ReplCoordTest, NodeDoesNotAcceptHeartbeatReconfigWhileInTheMidstOfReconfig) { + // start up, become primary, reconfig, while reconfigging receive reconfig via heartbeat + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + replCoordSetMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); + replCoordSetMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)); + simulateSuccessfulV1Election(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + + // start reconfigThread + Status status(ErrorCodes::InternalError, "Not Set"); + const auto opCtx = makeOperationContext(); + stdx::thread reconfigThread([&] { doReplSetReconfig(getReplCoord(), &status, opCtx.get()); }); + + 
// wait for reconfigThread to create network requests to ensure the replication coordinator + // is in state kConfigReconfiguring + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + net->blackHole(net->getNextReadyRequest()); + + // schedule hb reconfig + net->runUntil(net->now() + Seconds(10)); // run until we've sent a heartbeat request + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + ReplSetHeartbeatResponse hbResp; + ReplSetConfig config; + config + .initialize(BSON("_id" + << "mySet" + << "version" << 4 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345")))) + .transitional_ignore(); + hbResp.setConfig(config); + hbResp.setConfigVersion(4); + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setAppliedOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); + hbResp.setDurableOpTimeAndWallTime({OpTime(Timestamp(100, 1), 0), Date_t() + Seconds(100)}); + BSONObjBuilder respObj2; + respObj2 << "ok" << 1; + hbResp.addToBSON(&respObj2); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj2.obj())); + + setMinimumLoggedSeverity(logger::LogSeverity::Debug(1)); + startCapturingLogMessages(); + // execute hb reconfig, which should fail with a log message; confirmed at end of test + net->runReadyNetworkOperations(); + // respond to reconfig's quorum check so that we can join that thread and exit cleanly + net->exitNetwork(); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, + countTextFormatLogLinesContaining( + "because already in the midst of a configuration process")); + shutdown(opCtx.get()); + reconfigThread.join(); + setMinimumLoggedSeverity(logger::LogSeverity::Log()); +} TEST_F(ReplCoordTest, NodeAcceptsConfigFromAReconfigWithForceTrueWhileNotPrimary) { // start up, become a secondary, receive a forced reconfig @@ -771,9 +795,15 @@ TEST_F(ReplCoordReconfigTest, 
reconfigThread = stdx::thread( [&] { status = getReplCoord()->processReplSetReconfig(opCtx.get(), args, &result); }); - // Satisfy config replication check. + // Satisfy the quorum check. auto net = getNet(); enterNetwork(); + respondToHeartbeat(net); + respondToHeartbeat(net); + exitNetwork(); + + // Satisfy config replication check. + enterNetwork(); respondToHeartbeat(net, configVersion, 1 /* configTerm */); respondToHeartbeat(net, configVersion, 1 /* configTerm */); exitNetwork(); @@ -834,7 +864,12 @@ TEST_F(ReplCoordReconfigTest, reconfigThread = stdx::thread( [&] { status = getReplCoord()->processReplSetReconfig(opCtx.get(), args, &result); }); - // Satisfy config replication. + // Satisfy the quorum check. + enterNetwork(); + respondToHeartbeat(getNet()); + exitNetwork(); + + // Satisfy config replication check. enterNetwork(); respondToHeartbeat(getNet(), configVersion, 1 /* configTerm */); exitNetwork(); |