summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSpencer T Brody <spencer@mongodb.com>2016-10-26 19:11:11 -0400
committerSpencer T Brody <spencer@mongodb.com>2016-11-04 12:52:46 -0400
commitc3e843b10cd3522fd5f5db71486cfc738c88428d (patch)
tree37f9ba96cb1c6ac788b59ba49ae1ce0162f3239e
parent605083d00f91c24595d3a112b7ab207d3cbb8aec (diff)
downloadmongo-c3e843b10cd3522fd5f5db71486cfc738c88428d.tar.gz
SERVER-26748 Don't call for priority takeover when repl lagged.
Backport of 286a682a5ecff2e1646cbe68315529f7b3f6bd7c
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp260
-rw-r--r--src/mongo/db/repl/replication_coordinator_test_fixture.cpp16
-rw-r--r--src/mongo/db/repl/replication_coordinator_test_fixture.h9
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.cpp52
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.h6
5 files changed, 302 insertions, 41 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index e29e772b011..9eb0c6aca9c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -769,8 +769,70 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenTermChangesDuringActualElection) {
countLogLinesContaining("not becoming primary, we have been superceded already"));
}
-TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityThanCurrentPrimary) {
- startCapturingLogMessages();
+class PriorityTakeoverTest : public ReplCoordTest {
+public:
+ void respondToAllHeartbeats(const ReplicaSetConfig& config,
+ Date_t runUntil,
+ const HostAndPort& primaryHostAndPort,
+ const OpTime& otherNodesOpTime) {
+ auto replCoord = getReplCoord();
+
+ auto net = getNet();
+ net->enterNetwork();
+ while (net->now() < runUntil || net->hasReadyRequests()) {
+ if (net->now() < runUntil) {
+ net->runUntil(runUntil);
+ }
+ auto noi = net->getNextReadyRequest();
+ auto&& request = noi->getRequest();
+ log() << request.target << " processing " << request.cmdObj;
+ ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData());
+ ReplSetHeartbeatArgsV1 hbArgs;
+ if (hbArgs.initialize(request.cmdObj).isOK()) {
+ ReplSetHeartbeatResponse hbResp;
+ hbResp.setSetName(config.getReplSetName());
+ if (request.target == primaryHostAndPort) {
+ hbResp.setState(MemberState::RS_PRIMARY);
+ } else {
+ hbResp.setState(MemberState::RS_SECONDARY);
+ }
+ hbResp.setConfigVersion(config.getConfigVersion());
+ hbResp.setTerm(replCoord->getTerm());
+ hbResp.setAppliedOpTime(otherNodesOpTime);
+ hbResp.setDurableOpTime(otherNodesOpTime);
+ auto response =
+ makeResponseStatus(hbResp.toBSON(replCoord->isV1ElectionProtocol()));
+ net->scheduleResponse(noi, net->now(), response);
+ } else {
+ error() << "Black holing unexpected request to " << request.target << ": "
+ << request.cmdObj;
+ net->blackHole(noi);
+ }
+ }
+ net->runReadyNetworkOperations();
+ net->exitNetwork();
+ }
+
+ void performSuccessfulPriorityTakeover(Date_t priorityTakeoverTime) {
+ startCapturingLogMessages();
+ simulateSuccessfulV1ElectionAt(priorityTakeoverTime);
+ getReplCoord()->waitForElectionFinish_forTest();
+ stopCapturingLogMessages();
+
+ ASSERT(getReplCoord()->getMemberState().primary());
+
+ // Check last vote
+ auto lastVote = getExternalState()->loadLocalLastVoteDocument(nullptr);
+ ASSERT(lastVote.isOK());
+ ASSERT_EQ(0, lastVote.getValue().getCandidateIndex());
+ ASSERT_EQ(1, lastVote.getValue().getTerm());
+
+ ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a priority takeover"));
+ ASSERT_EQUALS(1, countLogLinesContaining("election succeeded"));
+ }
+};
+
+TEST_F(PriorityTakeoverTest, SchedulesPriorityTakeoverIfNodeHasHigherPriorityThanCurrentPrimary) {
BSONObj configObj = BSON("_id"
<< "mySet"
<< "version" << 1 << "members"
@@ -795,35 +857,8 @@ TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityTha
ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest());
- auto net = getNet();
- net->enterNetwork();
- auto now = net->now();
- while (net->hasReadyRequests()) {
- auto noi = net->getNextReadyRequest();
- auto&& request = noi->getRequest();
- log() << request.target << " processing " << request.cmdObj;
- ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData());
- ReplSetHeartbeatArgsV1 hbArgs;
- if (hbArgs.initialize(request.cmdObj).isOK()) {
- ReplSetHeartbeatResponse hbResp;
- hbResp.setSetName(config.getReplSetName());
- if (request.target == HostAndPort("node2", 12345)) {
- hbResp.setState(MemberState::RS_PRIMARY);
- } else {
- hbResp.setState(MemberState::RS_SECONDARY);
- }
- hbResp.setConfigVersion(config.getConfigVersion());
- hbResp.setTerm(replCoord->getTerm());
- auto response = makeResponseStatus(hbResp.toBSON(replCoord->isV1ElectionProtocol()));
- net->scheduleResponse(noi, net->now(), response);
- } else {
- error() << "Black holing unexpected request to " << request.target << ": "
- << request.cmdObj;
- net->blackHole(noi);
- }
- }
- net->runReadyNetworkOperations();
- net->exitNetwork();
+ auto now = getNet()->now();
+ respondToAllHeartbeats(config, now, HostAndPort("node2", 12345), time1);
ASSERT_NOT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest());
ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0),
@@ -834,6 +869,169 @@ TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityTha
ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest());
}
+TEST_F(PriorityTakeoverTest, SuccessfulPriorityTakeover) {
+ BSONObj configObj = BSON("_id"
+ << "mySet"
+ << "version" << 1 << "members"
+ << BSON_ARRAY(BSON("_id" << 1 << "host"
+ << "node1:12345"
+ << "priority" << 2)
+ << BSON("_id" << 2 << "host"
+ << "node2:12345")
+ << BSON("_id" << 3 << "host"
+ << "node3:12345")) << "protocolVersion"
+ << 1);
+ assertStartSuccess(configObj, HostAndPort("node1", 12345));
+ ReplicaSetConfig config = assertMakeRSConfig(configObj);
+
+ auto replCoord = getReplCoord();
+
+ OperationContextNoop txn;
+ OpTime time1(Timestamp(100, 1), 0);
+ replCoord->setMyLastAppliedOpTime(time1);
+ replCoord->setMyLastDurableOpTime(time1);
+ ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+
+ ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest());
+
+ auto now = getNet()->now();
+ respondToAllHeartbeats(config, now, HostAndPort("node2", 12345), time1);
+
+ auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest();
+ ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime);
+ ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime);
+
+ performSuccessfulPriorityTakeover(priorityTakeoverTime);
+}
+
+TEST_F(PriorityTakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) {
+ BSONObj configObj = BSON("_id"
+ << "mySet"
+ << "version" << 1 << "members"
+ << BSON_ARRAY(BSON("_id" << 1 << "host"
+ << "node1:12345"
+ << "priority" << 2)
+ << BSON("_id" << 2 << "host"
+ << "node2:12345")
+ << BSON("_id" << 3 << "host"
+ << "node3:12345")) << "protocolVersion"
+ << 1);
+ assertStartSuccess(configObj, HostAndPort("node1", 12345));
+ ReplicaSetConfig config = assertMakeRSConfig(configObj);
+ HostAndPort primaryHostAndPort("node2", 12345);
+
+ auto replCoord = getReplCoord();
+
+ OperationContextNoop txn;
+ OpTime currentOpTime(Timestamp(100, 5000), 0);
+ OpTime behindOpTime(Timestamp(100, 3999), 0);
+ OpTime closeEnoughOpTime(Timestamp(100, 4000), 0);
+ replCoord->setMyLastAppliedOpTime(behindOpTime);
+ replCoord->setMyLastDurableOpTime(behindOpTime);
+ ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+
+ ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest());
+
+ auto now = getNet()->now();
+
+ respondToAllHeartbeats(config, now, primaryHostAndPort, currentOpTime);
+
+ auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest();
+ ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime);
+ ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime);
+
+
+ // At this point the other nodes are all ahead of the current node, so it can't call for
+ // priority takeover.
+ startCapturingLogMessages();
+ respondToAllHeartbeats(config, priorityTakeoverTime, primaryHostAndPort, currentOpTime);
+ stopCapturingLogMessages();
+
+
+ ASSERT(replCoord->getMemberState().secondary());
+ ASSERT_EQUALS(1,
+ countLogLinesContaining(
+ "Not standing for election because member is not "
+ "caught up enough to the most up-to-date member to "
+ "call for priority takeover"));
+
+ now = getNet()->now();
+ ASSERT_EQUALS(now, priorityTakeoverTime);
+ priorityTakeoverTime = replCoord->getPriorityTakeover_forTest();
+ ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime);
+ ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime);
+
+ // Now make us caught up enough to call for priority takeover to succeed.
+ replCoord->setMyLastAppliedOpTime(closeEnoughOpTime);
+ replCoord->setMyLastDurableOpTime(closeEnoughOpTime);
+
+ performSuccessfulPriorityTakeover(priorityTakeoverTime);
+}
+
+TEST_F(PriorityTakeoverTest, DontCallForPriorityTakeoverWhenLaggedDifferentSecond) {
+ BSONObj configObj = BSON("_id"
+ << "mySet"
+ << "version" << 1 << "members"
+ << BSON_ARRAY(BSON("_id" << 1 << "host"
+ << "node1:12345"
+ << "priority" << 2)
+ << BSON("_id" << 2 << "host"
+ << "node2:12345")
+ << BSON("_id" << 3 << "host"
+ << "node3:12345")) << "protocolVersion"
+ << 1);
+ assertStartSuccess(configObj, HostAndPort("node1", 12345));
+ ReplicaSetConfig config = assertMakeRSConfig(configObj);
+ HostAndPort primaryHostAndPort("node2", 12345);
+
+ auto replCoord = getReplCoord();
+
+ OperationContextNoop txn;
+ OpTime currentOpTime(Timestamp(100, 0), 0);
+ OpTime behindOpTime(Timestamp(97, 0), 0);
+ OpTime closeEnoughOpTime(Timestamp(98, 0), 0);
+ replCoord->setMyLastAppliedOpTime(behindOpTime);
+ replCoord->setMyLastDurableOpTime(behindOpTime);
+ ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+
+ ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest());
+
+ auto now = getNet()->now();
+
+ respondToAllHeartbeats(config, now, primaryHostAndPort, currentOpTime);
+
+ auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest();
+ ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime);
+ ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime);
+
+
+ // At this point the other nodes are all ahead of the current node, so it can't call for
+ // priority takeover.
+ startCapturingLogMessages();
+ respondToAllHeartbeats(config, priorityTakeoverTime, primaryHostAndPort, currentOpTime);
+ stopCapturingLogMessages();
+
+
+ ASSERT(replCoord->getMemberState().secondary());
+ ASSERT_EQUALS(1,
+ countLogLinesContaining(
+ "Not standing for election because member is not "
+ "caught up enough to the most up-to-date member to "
+ "call for priority takeover"));
+
+ now = getNet()->now();
+ ASSERT_EQUALS(now, priorityTakeoverTime);
+ priorityTakeoverTime = replCoord->getPriorityTakeover_forTest();
+ ASSERT_NOT_EQUALS(Date_t(), priorityTakeoverTime);
+ ASSERT_EQUALS(now + config.getPriorityTakeoverDelay(0), priorityTakeoverTime);
+
+ // Now make us caught up enough to call for priority takeover to succeed.
+ replCoord->setMyLastAppliedOpTime(closeEnoughOpTime);
+ replCoord->setMyLastDurableOpTime(closeEnoughOpTime);
+
+ performSuccessfulPriorityTakeover(priorityTakeoverTime);
+}
+
} // namespace
} // namespace repl
} // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp
index 4a143cba71e..e7b8d561d55 100644
--- a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp
+++ b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp
@@ -237,21 +237,25 @@ void ReplCoordTest::simulateSuccessfulDryRun() {
}
void ReplCoordTest::simulateSuccessfulV1Election() {
+ auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest();
+ ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen);
+ log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)";
+
+ simulateSuccessfulV1ElectionAt(electionTimeoutWhen);
+}
+
+void ReplCoordTest::simulateSuccessfulV1ElectionAt(Date_t electionTime) {
OperationContextReplMock txn;
ReplicationCoordinatorImpl* replCoord = getReplCoord();
NetworkInterfaceMock* net = getNet();
- auto electionTimeoutWhen = replCoord->getElectionTimeout_forTest();
- ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen);
- log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)";
-
ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest();
ASSERT(replCoord->getMemberState().secondary()) << replCoord->getMemberState().toString();
while (!replCoord->getMemberState().primary()) {
log() << "Waiting on network in state " << replCoord->getMemberState();
getNet()->enterNetwork();
- if (net->now() < electionTimeoutWhen) {
- net->runUntil(electionTimeoutWhen);
+ if (net->now() < electionTime) {
+ net->runUntil(electionTime);
}
const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
const RemoteCommandRequest& request = noi->getRequest();
diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.h b/src/mongo/db/repl/replication_coordinator_test_fixture.h
index 301d458fb4b..84e525a671a 100644
--- a/src/mongo/db/repl/replication_coordinator_test_fixture.h
+++ b/src/mongo/db/repl/replication_coordinator_test_fixture.h
@@ -196,7 +196,7 @@ protected:
/**
* Brings the repl coord from SECONDARY to PRIMARY by simulating the messages required to
- * elect it.
+ * elect it, after progressing the mocked-out notion of time past the election timeout.
*
* Behavior is unspecified if node does not have a clean config, is not in SECONDARY, etc.
*/
@@ -204,6 +204,13 @@ protected:
void simulateSuccessfulV1Election();
/**
+ * Same as simulateSuccessfulV1Election, but rather than getting the election timeout and
+ * progressing time past that point, takes in what time to expect an election to occur at.
+ * Useful for simulating elections triggered via priority takeover.
+ */
+ void simulateSuccessfulV1ElectionAt(Date_t electionTime);
+
+ /**
* Shuts down the objects under test.
*/
void shutdown();
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index 3b1d4e413cd..e245ddb3276 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -48,8 +48,9 @@
#include "mongo/db/repl/repl_set_request_votes_args.h"
#include "mongo/db/repl/replication_executor.h"
#include "mongo/db/repl/rslog.h"
-#include "mongo/s/catalog/catalog_manager.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/rpc/metadata/repl_set_metadata.h"
+#include "mongo/s/catalog/catalog_manager.h"
#include "mongo/util/hex.h"
#include "mongo/util/log.h"
#include "mongo/util/mongoutils/str.h"
@@ -62,6 +63,10 @@ using std::vector;
const Seconds TopologyCoordinatorImpl::VoteLease::leaseTime = Seconds(30);
+// Controls how caught up in replication a secondary with higher priority than the current primary
+// must be before it will call for a priority takeover election.
+MONGO_EXPORT_STARTUP_SERVER_PARAMETER(priorityTakeoverFreshnessWindowSeconds, int, 2);
+
namespace {
template <typename T>
@@ -1363,6 +1368,34 @@ bool TopologyCoordinatorImpl::_isOpTimeCloseEnoughToLatestToElect(
return otherOpTime.getSecs() + 10 >= (latestKnownOpTime.getSecs());
}
+bool TopologyCoordinatorImpl::_amIFreshEnoughForPriorityTakeover(
+ const OpTime& ourLastOpApplied) const {
+ const OpTime latestKnownOpTime = _latestKnownOpTime(ourLastOpApplied);
+
+ // Rules are:
+ // - If the terms don't match, we don't call for priority takeover.
+ // - If our optime and the latest optime happen in different seconds, our optime must be within
+ // at least priorityTakeoverFreshnessWindowSeconds seconds of the latest optime.
+ // - If our optime and the latest optime happen in the same second, our optime must be within
+ // at least 1000 oplog entries of the latest optime (i.e. the increment portion of the timestamp
+ // must be within 1000). This is to handle the case where a primary had its clock set far into
+ // the future, took some writes, then had its clock set back. In that case the timestamp
+ // component of all future oplog entries generated will be the same, until real world time
+ // passes the timestamp component of the last oplog entry.
+
+ if (ourLastOpApplied.getTerm() != latestKnownOpTime.getTerm()) {
+ return false;
+ }
+
+ if (ourLastOpApplied.getTimestamp().getSecs() != latestKnownOpTime.getTimestamp().getSecs()) {
+ return ourLastOpApplied.getTimestamp().getSecs() + priorityTakeoverFreshnessWindowSeconds >=
+ latestKnownOpTime.getTimestamp().getSecs();
+ } else {
+ return ourLastOpApplied.getTimestamp().getInc() + 1000 >=
+ latestKnownOpTime.getTimestamp().getInc();
+ }
+}
+
bool TopologyCoordinatorImpl::_iAmPrimary() const {
if (_role == Role::leader) {
invariant(_currentPrimaryIndex == _selfIndex);
@@ -1964,8 +1997,8 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUn
result |= NotSecondary;
}
- // Election rules only for protocol version 0.
if (_rsConfig.getProtocolVersion() == 0) {
+ // Election rules only for protocol version 0.
if (_voteLease.whoId != -1 &&
_voteLease.whoId != _rsConfig.getMemberAt(_selfIndex).getId() &&
_voteLease.when + VoteLease::leaseTime >= now) {
@@ -1974,6 +2007,13 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUn
if (!_isOpTimeCloseEnoughToLatestToElect(lastApplied, lastApplied)) {
result |= NotCloseEnoughToLatestOptime;
}
+ } else {
+ // Election rules only for protocol version 1.
+ invariant(_rsConfig.getProtocolVersion() == 1);
+ bool isPriorityTakeover = _currentPrimaryIndex != -1;
+ if (isPriorityTakeover && !_amIFreshEnoughForPriorityTakeover(lastApplied)) {
+ result |= NotCloseEnoughToLatestForPriorityTakeover;
+ }
}
return result;
}
@@ -2037,6 +2077,14 @@ std::string TopologyCoordinatorImpl::_getUnelectableReasonString(
hasWrittenToStream = true;
ss << "member is more than 10 seconds behind the most up-to-date member";
}
+ if (ur & NotCloseEnoughToLatestForPriorityTakeover) {
+ if (hasWrittenToStream) {
+ ss << "; ";
+ }
+ hasWrittenToStream = true;
+ ss << "member is not caught up enough to the most up-to-date member to call for priority "
+ "takeover - must be within " << priorityTakeoverFreshnessWindowSeconds << " seconds";
+ }
if (ur & NotInitialized) {
if (hasWrittenToStream) {
ss << "; ";
diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h
index 7fe537708b9..fb4b7786cb3 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.h
+++ b/src/mongo/db/repl/topology_coordinator_impl.h
@@ -280,7 +280,8 @@ private:
NoData = 1 << 6,
NotInitialized = 1 << 7,
VotedTooRecently = 1 << 8,
- RefusesToStand = 1 << 9
+ RefusesToStand = 1 << 9,
+ NotCloseEnoughToLatestForPriorityTakeover = 1 << 10,
};
typedef int UnelectableReasonMask;
@@ -309,6 +310,9 @@ private:
bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime,
const OpTime& ourLastOpApplied) const;
+ // Is our optime close enough to the latest known optime to call for a priority takeover.
+ bool _amIFreshEnoughForPriorityTakeover(const OpTime& ourLastOpApplied) const;
+
// Returns reason why "self" member is unelectable
UnelectableReasonMask _getMyUnelectableReason(const Date_t now,
const OpTime& lastOpApplied) const;