diff options
author | Spencer T Brody <spencer@mongodb.com> | 2016-10-26 19:11:11 -0400 |
---|---|---|
committer | Spencer T Brody <spencer@mongodb.com> | 2016-11-04 12:52:46 -0400 |
commit | c3e843b10cd3522fd5f5db71486cfc738c88428d (patch) | |
tree | 37f9ba96cb1c6ac788b59ba49ae1ce0162f3239e | |
parent | 605083d00f91c24595d3a112b7ab207d3cbb8aec (diff) | |
download | mongo-c3e843b10cd3522fd5f5db71486cfc738c88428d.tar.gz |
SERVER-26748 Don't call for priority takeover when repl lagged.
Backport of 286a682a5ecff2e1646cbe68315529f7b3f6bd7c
5 files changed, 302 insertions, 41 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp index e29e772b011..9eb0c6aca9c 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp @@ -769,8 +769,70 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenTermChangesDuringActualElection) { countLogLinesContaining("not becoming primary, we have been superceded already")); } -TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityThanCurrentPrimary) { - startCapturingLogMessages(); +class PriorityTakeoverTest : public ReplCoordTest { +public: + void respondToAllHeartbeats(const ReplicaSetConfig& config, + Date_t runUntil, + const HostAndPort& primaryHostAndPort, + const OpTime& otherNodesOpTime) { + auto replCoord = getReplCoord(); + + auto net = getNet(); + net->enterNetwork(); + while (net->now() < runUntil || net->hasReadyRequests()) { + if (net->now() < runUntil) { + net->runUntil(runUntil); + } + auto noi = net->getNextReadyRequest(); + auto&& request = noi->getRequest(); + log() << request.target << " processing " << request.cmdObj; + ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData()); + ReplSetHeartbeatArgsV1 hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(config.getReplSetName()); + if (request.target == primaryHostAndPort) { + hbResp.setState(MemberState::RS_PRIMARY); + } else { + hbResp.setState(MemberState::RS_SECONDARY); + } + hbResp.setConfigVersion(config.getConfigVersion()); + hbResp.setTerm(replCoord->getTerm()); + hbResp.setAppliedOpTime(otherNodesOpTime); + hbResp.setDurableOpTime(otherNodesOpTime); + auto response = + makeResponseStatus(hbResp.toBSON(replCoord->isV1ElectionProtocol())); + net->scheduleResponse(noi, net->now(), response); + } else { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + net->blackHole(noi); + } + } + net->runReadyNetworkOperations(); + net->exitNetwork(); + } + + void performSuccessfulPriorityTakeover(Date_t priorityTakeoverTime) { + startCapturingLogMessages(); + simulateSuccessfulV1ElectionAt(priorityTakeoverTime); + getReplCoord()->waitForElectionFinish_forTest(); + stopCapturingLogMessages(); + + ASSERT(getReplCoord()->getMemberState().primary()); + + // Check last vote + auto lastVote = getExternalState()->loadLocalLastVoteDocument(nullptr); + ASSERT(lastVote.isOK()); + ASSERT_EQ(0, lastVote.getValue().getCandidateIndex()); + ASSERT_EQ(1, lastVote.getValue().getTerm()); + + ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a priority takeover")); + ASSERT_EQUALS(1, countLogLinesContaining("election succeeded")); + } +}; + +TEST_F(PriorityTakeoverTest, SchedulesPriorityTakeoverIfNodeHasHigherPriorityThanCurrentPrimary) { BSONObj configObj = BSON("_id" << "mySet" << "version" << 1 << "members" @@ -795,35 +857,8 @@ TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityTha ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); - auto net = getNet(); - net->enterNetwork(); - auto now = net->now(); - while (net->hasReadyRequests()) { - auto noi = net->getNextReadyRequest(); - auto&& request = noi->getRequest(); - log() << request.target << " processing " << request.cmdObj; - ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData()); - ReplSetHeartbeatArgsV1 hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName(config.getReplSetName()); - if (request.target == HostAndPort("node2", 12345)) { - hbResp.setState(MemberState::RS_PRIMARY); - } else { - hbResp.setState(MemberState::RS_SECONDARY); - } - hbResp.setConfigVersion(config.getConfigVersion()); - hbResp.setTerm(replCoord->getTerm()); - auto response = makeResponseStatus(hbResp.toBSON(replCoord->isV1ElectionProtocol())); - net->scheduleResponse(noi, net->now(), response); - } else { - error() << "Black holing unexpected request to " << request.target << ": " - << request.cmdObj; - net->blackHole(noi); - } - } - net->runReadyNetworkOperations(); - net->exitNetwork(); + auto now = getNet()->now(); + respondToAllHeartbeats(config, now, HostAndPort("node2", 12345), time1); ASSERT_NOT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), @@ -834,6 +869,169 @@ TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityTha ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); } +TEST_F(PriorityTakeoverTest, SuccessfulPriorityTakeover) { + BSONObj configObj = BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345" + << "priority" << 2) + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345")) << "protocolVersion" + << 1); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplicaSetConfig config = assertMakeRSConfig(configObj); + + auto replCoord = getReplCoord(); + + OperationContextNoop txn; + OpTime time1(Timestamp(100, 1), 0); + replCoord->setMyLastAppliedOpTime(time1); + replCoord->setMyLastDurableOpTime(time1); + ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); + + ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); + + auto now = getNet()->now(); + respondToAllHeartbeats(config, now, HostAndPort("node2", 12345), time1); + + auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest(); + ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime); + ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime); + + performSuccessfulPriorityTakeover(priorityTakeoverTime); +} + +TEST_F(PriorityTakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) { + BSONObj configObj = BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345" + << "priority" << 2) + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345")) << "protocolVersion" + << 1); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplicaSetConfig config = assertMakeRSConfig(configObj); + HostAndPort primaryHostAndPort("node2", 12345); + + auto replCoord = getReplCoord(); + + OperationContextNoop txn; + OpTime currentOpTime(Timestamp(100, 5000), 0); + OpTime behindOpTime(Timestamp(100, 3999), 0); + OpTime closeEnoughOpTime(Timestamp(100, 4000), 0); + replCoord->setMyLastAppliedOpTime(behindOpTime); + replCoord->setMyLastDurableOpTime(behindOpTime); + ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); + + ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); + + auto now = getNet()->now(); + + respondToAllHeartbeats(config, now, primaryHostAndPort, currentOpTime); + + auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest(); + ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime); + ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime); + + + // At this point the other nodes are all ahead of the current node, so it can't call for + // priority takeover. + startCapturingLogMessages(); + respondToAllHeartbeats(config, priorityTakeoverTime, primaryHostAndPort, currentOpTime); + stopCapturingLogMessages(); + + + ASSERT(replCoord->getMemberState().secondary()); + ASSERT_EQUALS(1, + countLogLinesContaining( + "Not standing for election because member is not " + "caught up enough to the most up-to-date member to " + "call for priority takeover")); + + now = getNet()->now(); + ASSERT_EQUALS(now, priorityTakeoverTime); + priorityTakeoverTime = replCoord->getPriorityTakeover_forTest(); + ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime); + ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime); + + // Now make us caught up enough to call for priority takeover to succeed. + replCoord->setMyLastAppliedOpTime(closeEnoughOpTime); + replCoord->setMyLastDurableOpTime(closeEnoughOpTime); + + performSuccessfulPriorityTakeover(priorityTakeoverTime); +} + +TEST_F(PriorityTakeoverTest, DontCallForPriorityTakeoverWhenLaggedDifferentSecond) { + BSONObj configObj = BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345" + << "priority" << 2) + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345")) << "protocolVersion" + << 1); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplicaSetConfig config = assertMakeRSConfig(configObj); + HostAndPort primaryHostAndPort("node2", 12345); + + auto replCoord = getReplCoord(); + + OperationContextNoop txn; + OpTime currentOpTime(Timestamp(100, 0), 0); + OpTime behindOpTime(Timestamp(97, 0), 0); + OpTime closeEnoughOpTime(Timestamp(98, 0), 0); + replCoord->setMyLastAppliedOpTime(behindOpTime); + replCoord->setMyLastDurableOpTime(behindOpTime); + ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); + + ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); + + auto now = getNet()->now(); + + respondToAllHeartbeats(config, now, primaryHostAndPort, currentOpTime); + + auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest(); + ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime); + ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime); + + + // At this point the other nodes are all ahead of the current node, so it can't call for + // priority takeover. + startCapturingLogMessages(); + respondToAllHeartbeats(config, priorityTakeoverTime, primaryHostAndPort, currentOpTime); + stopCapturingLogMessages(); + + + ASSERT(replCoord->getMemberState().secondary()); + ASSERT_EQUALS(1, + countLogLinesContaining( + "Not standing for election because member is not " + "caught up enough to the most up-to-date member to " + "call for priority takeover")); + + now = getNet()->now(); + ASSERT_EQUALS(now, priorityTakeoverTime); + priorityTakeoverTime = replCoord->getPriorityTakeover_forTest(); + ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime); + ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime); + + // Now make us caught up enough to call for priority takeover to succeed. + replCoord->setMyLastAppliedOpTime(closeEnoughOpTime); + replCoord->setMyLastDurableOpTime(closeEnoughOpTime); + + performSuccessfulPriorityTakeover(priorityTakeoverTime); +} + } // namespace } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp index 4a143cba71e..e7b8d561d55 100644 --- a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp +++ b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp @@ -237,21 +237,25 @@ void ReplCoordTest::simulateSuccessfulDryRun() { } void ReplCoordTest::simulateSuccessfulV1Election() { + auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest(); + ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen); + log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)"; + + simulateSuccessfulV1ElectionAt(electionTimeoutWhen); +} + +void ReplCoordTest::simulateSuccessfulV1ElectionAt(Date_t electionTime) { OperationContextReplMock txn; ReplicationCoordinatorImpl* replCoord = getReplCoord(); NetworkInterfaceMock* net = getNet(); - auto electionTimeoutWhen = replCoord->getElectionTimeout_forTest(); - ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen); - log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)"; - ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); ASSERT(replCoord->getMemberState().secondary()) << replCoord->getMemberState().toString(); while (!replCoord->getMemberState().primary()) { log() << "Waiting on network in state " << replCoord->getMemberState(); getNet()->enterNetwork(); - if (net->now() < electionTimeoutWhen) { - net->runUntil(electionTimeoutWhen); + if (net->now() < electionTime) { + net->runUntil(electionTime); } const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); const RemoteCommandRequest& request = noi->getRequest(); diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.h b/src/mongo/db/repl/replication_coordinator_test_fixture.h index 301d458fb4b..84e525a671a 100644 --- a/src/mongo/db/repl/replication_coordinator_test_fixture.h +++ b/src/mongo/db/repl/replication_coordinator_test_fixture.h @@ -196,7 +196,7 @@ protected: /** * Brings the repl coord from SECONDARY to PRIMARY by simulating the messages required to - * elect it. + * elect it, after progressing the mocked-out notion of time past the election timeout. * * Behavior is unspecified if node does not have a clean config, is not in SECONDARY, etc. */ @@ -204,6 +204,13 @@ protected: void simulateSuccessfulV1Election(); /** + * Same as simulateSuccessfulV1Election, but rather than getting the election timeout and + * progressing time past that point, takes in what time to expect an election to occur at. + * Useful for simulating elections triggered via priority takeover. + */ + void simulateSuccessfulV1ElectionAt(Date_t electionTime); + + /** * Shuts down the objects under test. */ void shutdown(); diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 3b1d4e413cd..e245ddb3276 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -48,8 +48,9 @@ #include "mongo/db/repl/repl_set_request_votes_args.h" #include "mongo/db/repl/replication_executor.h" #include "mongo/db/repl/rslog.h" -#include "mongo/s/catalog/catalog_manager.h" +#include "mongo/db/server_parameters.h" #include "mongo/rpc/metadata/repl_set_metadata.h" +#include "mongo/s/catalog/catalog_manager.h" #include "mongo/util/hex.h" #include "mongo/util/log.h" #include "mongo/util/mongoutils/str.h" @@ -62,6 +63,10 @@ using std::vector; const Seconds TopologyCoordinatorImpl::VoteLease::leaseTime = Seconds(30); +// Controls how caught up in replication a secondary with higher priority than the current primary +// must be before it will call for a priority takeover election. +MONGO_EXPORT_STARTUP_SERVER_PARAMETER(priorityTakeoverFreshnessWindowSeconds, int, 2); + namespace { template <typename T> @@ -1363,6 +1368,34 @@ bool TopologyCoordinatorImpl::_isOpTimeCloseEnoughToLatestToElect( return otherOpTime.getSecs() + 10 >= (latestKnownOpTime.getSecs()); } +bool TopologyCoordinatorImpl::_amIFreshEnoughForPriorityTakeover( + const OpTime& ourLastOpApplied) const { + const OpTime latestKnownOpTime = _latestKnownOpTime(ourLastOpApplied); + + // Rules are: + // - If the terms don't match, we don't call for priority takeover. + // - If our optime and the latest optime happen in different seconds, our optime must be within + // at least priorityTakeoverFreshnessWindowSeconds seconds of the latest optime. + // - If our optime and the latest optime happen in the same second, our optime must be within + // at least 1000 oplog entries of the latest optime (i.e. the increment portion of the timestamp + // must be within 1000). This is to handle the case where a primary had its clock set far into + // the future, took some writes, then had its clock set back. In that case the timestamp + // component of all future oplog entries generated will be the same, until real world time + // passes the timestamp component of the last oplog entry. + + if (ourLastOpApplied.getTerm() != latestKnownOpTime.getTerm()) { + return false; + } + + if (ourLastOpApplied.getTimestamp().getSecs() != latestKnownOpTime.getTimestamp().getSecs()) { + return ourLastOpApplied.getTimestamp().getSecs() + priorityTakeoverFreshnessWindowSeconds >= + latestKnownOpTime.getTimestamp().getSecs(); + } else { + return ourLastOpApplied.getTimestamp().getInc() + 1000 >= + latestKnownOpTime.getTimestamp().getInc(); + } +} + bool TopologyCoordinatorImpl::_iAmPrimary() const { if (_role == Role::leader) { invariant(_currentPrimaryIndex == _selfIndex); @@ -1964,8 +1997,8 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUn result |= NotSecondary; } - // Election rules only for protocol version 0. if (_rsConfig.getProtocolVersion() == 0) { + // Election rules only for protocol version 0. if (_voteLease.whoId != -1 && _voteLease.whoId != _rsConfig.getMemberAt(_selfIndex).getId() && _voteLease.when + VoteLease::leaseTime >= now) { @@ -1974,6 +2007,13 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUn if (!_isOpTimeCloseEnoughToLatestToElect(lastApplied, lastApplied)) { result |= NotCloseEnoughToLatestOptime; } + } else { + // Election rules only for protocol version 1. + invariant(_rsConfig.getProtocolVersion() == 1); + bool isPriorityTakeover = _currentPrimaryIndex != -1; + if (isPriorityTakeover && !_amIFreshEnoughForPriorityTakeover(lastApplied)) { + result |= NotCloseEnoughToLatestForPriorityTakeover; + } } return result; } @@ -2037,6 +2077,14 @@ std::string TopologyCoordinatorImpl::_getUnelectableReasonString( hasWrittenToStream = true; ss << "member is more than 10 seconds behind the most up-to-date member"; } + if (ur & NotCloseEnoughToLatestForPriorityTakeover) { + if (hasWrittenToStream) { + ss << "; "; + } + hasWrittenToStream = true; + ss << "member is not caught up enough to the most up-to-date member to call for priority " + "takeover - must be within " << priorityTakeoverFreshnessWindowSeconds << " seconds"; + } if (ur & NotInitialized) { if (hasWrittenToStream) { ss << "; "; diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index 7fe537708b9..fb4b7786cb3 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -280,7 +280,8 @@ private: NoData = 1 << 6, NotInitialized = 1 << 7, VotedTooRecently = 1 << 8, - RefusesToStand = 1 << 9 + RefusesToStand = 1 << 9, + NotCloseEnoughToLatestForPriorityTakeover = 1 << 10, }; typedef int UnelectableReasonMask; @@ -309,6 +310,9 @@ private: bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime, const OpTime& ourLastOpApplied) const; + // Is our optime close enough to the latest known optime to call for a priority takeover. + bool _amIFreshEnoughForPriorityTakeover(const OpTime& ourLastOpApplied) const; + // Returns reason why "self" member is unelectable UnelectableReasonMask _getMyUnelectableReason(const Date_t now, const OpTime& lastOpApplied) const; |