summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamy Lanka <samy.lanka@10gen.com>2017-07-06 11:21:24 -0400
committerSamy Lanka <samy.lanka@10gen.com>2017-08-01 16:52:47 -0400
commit718cf09aa21b36e9436a675c8645770826078da7 (patch)
tree294bab8ca6e03feaf873af83b3e09d061c14ad5e
parent63a3a8f1e7da9c5bfd5e4b604c3453561f30b2b4 (diff)
downloadmongo-718cf09aa21b36e9436a675c8645770826078da7.tar.gz
SERVER-29501 Make catchup takeover timeout configurable
-rw-r--r--src/mongo/db/repl/repl_set_config.cpp33
-rw-r--r--src/mongo/db/repl/repl_set_config.h9
-rw-r--r--src/mongo/db/repl/repl_set_config_test.cpp73
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp56
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp2
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.cpp8
7 files changed, 140 insertions, 43 deletions
diff --git a/src/mongo/db/repl/repl_set_config.cpp b/src/mongo/db/repl/repl_set_config.cpp
index cace43edfcd..fe127318d10 100644
--- a/src/mongo/db/repl/repl_set_config.cpp
+++ b/src/mongo/db/repl/repl_set_config.cpp
@@ -45,6 +45,8 @@ namespace repl {
const size_t ReplSetConfig::kMaxMembers;
const size_t ReplSetConfig::kMaxVotingMembers;
const Milliseconds ReplSetConfig::kInfiniteCatchUpTimeout(-1);
+const Milliseconds ReplSetConfig::kCatchUpDisabled(0);
+const Milliseconds ReplSetConfig::kCatchUpTakeoverDisabled(-1);
const std::string ReplSetConfig::kConfigServerFieldName = "configsvr";
const std::string ReplSetConfig::kVersionFieldName = "version";
@@ -54,7 +56,7 @@ const Seconds ReplSetConfig::kDefaultHeartbeatTimeoutPeriod(10);
const Milliseconds ReplSetConfig::kDefaultElectionTimeoutPeriod(10000);
const Milliseconds ReplSetConfig::kDefaultCatchUpTimeoutPeriod(kInfiniteCatchUpTimeout);
const bool ReplSetConfig::kDefaultChainingAllowed(true);
-const Milliseconds ReplSetConfig::kDefaultCatchupTakeoverDelay(30000);
+const Milliseconds ReplSetConfig::kDefaultCatchUpTakeoverDelay(30000);
namespace {
@@ -82,6 +84,7 @@ const std::string kHeartbeatIntervalFieldName = "heartbeatIntervalMillis";
const std::string kHeartbeatTimeoutFieldName = "heartbeatTimeoutSecs";
const std::string kCatchUpTimeoutFieldName = "catchUpTimeoutMillis";
const std::string kReplicaSetIdFieldName = "replicaSetId";
+const std::string kCatchUpTakeoverDelayFieldName = "catchUpTakeoverDelayMillis";
} // namespace
@@ -272,13 +275,15 @@ Status ReplSetConfig::_parseSettingsSubdocument(const BSONObj& settings) {
//
// Parse catchUpTimeoutMillis
//
- auto validCatchUpTimeout = [](long long timeout) { return timeout >= 0LL || timeout == -1LL; };
+ auto validCatchUpParameter = [](long long timeout) {
+ return timeout >= 0LL || timeout == -1LL;
+ };
long long catchUpTimeoutMillis;
Status catchUpTimeoutStatus = bsonExtractIntegerFieldWithDefaultIf(
settings,
kCatchUpTimeoutFieldName,
durationCount<Milliseconds>(kDefaultCatchUpTimeoutPeriod),
- validCatchUpTimeout,
+ validCatchUpParameter,
"catch-up timeout must be positive, 0 (no catch-up) or -1 (infinite catch-up).",
&catchUpTimeoutMillis);
if (!catchUpTimeoutStatus.isOK()) {
@@ -287,6 +292,22 @@ Status ReplSetConfig::_parseSettingsSubdocument(const BSONObj& settings) {
_catchUpTimeoutPeriod = Milliseconds(catchUpTimeoutMillis);
//
+ // Parse catchUpTakeoverDelayMillis
+ //
+ long long catchUpTakeoverDelayMillis;
+ Status catchUpTakeoverDelayStatus = bsonExtractIntegerFieldWithDefaultIf(
+ settings,
+ kCatchUpTakeoverDelayFieldName,
+ durationCount<Milliseconds>(kDefaultCatchUpTakeoverDelay),
+ validCatchUpParameter,
+ "catch-up takeover delay must be -1 (no catch-up takeover) or greater than or equal to 0.",
+ &catchUpTakeoverDelayMillis);
+ if (!catchUpTakeoverDelayStatus.isOK()) {
+ return catchUpTakeoverDelayStatus;
+ }
+ _catchUpTakeoverDelay = Milliseconds(catchUpTakeoverDelayMillis);
+
+ //
// Parse chainingAllowed
//
Status status = bsonExtractBooleanFieldWithDefault(
@@ -784,6 +805,8 @@ BSONObj ReplSetConfig::toBSON() const {
durationCount<Milliseconds>(_electionTimeoutPeriod));
settingsBuilder.appendIntOrLL(kCatchUpTimeoutFieldName,
durationCount<Milliseconds>(_catchUpTimeoutPeriod));
+ settingsBuilder.appendIntOrLL(kCatchUpTakeoverDelayFieldName,
+ durationCount<Milliseconds>(_catchUpTakeoverDelay));
BSONObjBuilder gleModes(settingsBuilder.subobjStart(kGetLastErrorModesFieldName));
@@ -831,10 +854,6 @@ Milliseconds ReplSetConfig::getPriorityTakeoverDelay(int memberIdx) const {
return (priorityRank + 1) * getElectionTimeoutPeriod();
}
-Milliseconds ReplSetConfig::getCatchupTakeoverDelay() const {
- return kDefaultCatchupTakeoverDelay;
-}
-
int ReplSetConfig::_calculatePriorityRank(double priority) const {
int count = 0;
for (MemberIterator mem = membersBegin(); mem != membersEnd(); mem++) {
diff --git a/src/mongo/db/repl/repl_set_config.h b/src/mongo/db/repl/repl_set_config.h
index 0b8b868d747..df7c40288fb 100644
--- a/src/mongo/db/repl/repl_set_config.h
+++ b/src/mongo/db/repl/repl_set_config.h
@@ -60,13 +60,15 @@ public:
static const size_t kMaxMembers = 50;
static const size_t kMaxVotingMembers = 7;
static const Milliseconds kInfiniteCatchUpTimeout;
+ static const Milliseconds kCatchUpDisabled;
+ static const Milliseconds kCatchUpTakeoverDisabled;
static const Milliseconds kDefaultElectionTimeoutPeriod;
static const Milliseconds kDefaultHeartbeatInterval;
static const Seconds kDefaultHeartbeatTimeoutPeriod;
static const Milliseconds kDefaultCatchUpTimeoutPeriod;
static const bool kDefaultChainingAllowed;
- static const Milliseconds kDefaultCatchupTakeoverDelay;
+ static const Milliseconds kDefaultCatchUpTakeoverDelay;
/**
* Initializes this ReplSetConfig from the contents of "cfg".
@@ -345,7 +347,9 @@ public:
* Returns the duration to wait before running for election when this node
* sees that it is more caught up than the current primary.
*/
- Milliseconds getCatchupTakeoverDelay() const;
+ Milliseconds getCatchUpTakeoverDelay() const {
+ return _catchUpTakeoverDelay;
+ }
private:
/**
@@ -391,6 +395,7 @@ private:
Milliseconds _heartbeatInterval = kDefaultHeartbeatInterval;
Seconds _heartbeatTimeoutPeriod = kDefaultHeartbeatTimeoutPeriod;
Milliseconds _catchUpTimeoutPeriod = kDefaultCatchUpTimeoutPeriod;
+ Milliseconds _catchUpTakeoverDelay = kDefaultCatchUpTakeoverDelay;
bool _chainingAllowed = kDefaultChainingAllowed;
bool _writeConcernMajorityJournalDefault = false;
int _majorityVoteCount = 0;
diff --git a/src/mongo/db/repl/repl_set_config_test.cpp b/src/mongo/db/repl/repl_set_config_test.cpp
index f9d0ec01f9d..9a9731d89f8 100644
--- a/src/mongo/db/repl/repl_set_config_test.cpp
+++ b/src/mongo/db/repl/repl_set_config_test.cpp
@@ -1632,29 +1632,56 @@ TEST(ReplSetConfig, GetPriorityTakeoverDelay) {
ASSERT_EQUALS(Milliseconds(1000), configB.getPriorityTakeoverDelay(4));
}
-TEST(ReplSetConfig, GetCatchupTakeoverDelay) {
- ReplSetConfig configA;
- ASSERT_OK(configA.initialize(BSON("_id"
- << "rs0"
- << "version"
- << 1
- << "members"
- << BSON_ARRAY(BSON("_id" << 0 << "host"
- << "localhost:12345"
- << "priority"
- << 1)
- << BSON("_id" << 1 << "host"
- << "localhost:54321"
- << "priority"
- << 2)
- << BSON("_id" << 2 << "host"
- << "localhost:5321"
- << "priority"
- << 3))
- << "settings"
- << BSON("electionTimeoutMillis" << 1000))));
- ASSERT_OK(configA.validate());
- ASSERT_EQUALS(Milliseconds(30000), configA.getCatchupTakeoverDelay());
+// Checks that an explicit settings.catchUpTakeoverDelayMillis value is returned by
+// getCatchUpTakeoverDelay(), and that a value of -5000 is rejected with BadValue.
+TEST(ReplSetConfig, GetCatchUpTakeoverDelay) {
+    // Builds a one-member config document whose settings carry the given takeover delay.
+    const auto makeConfigWithDelay = [](int delayMillis) {
+        return BSON("_id"
+                    << "rs0"
+                    << "version"
+                    << 1
+                    << "members"
+                    << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                             << "localhost:12345"))
+                    << "settings"
+                    << BSON("catchUpTakeoverDelayMillis" << delayMillis));
+    };
+
+    // A non-negative delay parses, validates, and is reported back unchanged.
+    ReplSetConfig config;
+    ASSERT_OK(config.initialize(makeConfigWithDelay(5000)));
+    ASSERT_OK(config.validate());
+    ASSERT_EQUALS(Milliseconds(5000), config.getCatchUpTakeoverDelay());
+
+    // -5000 is neither -1 nor >= 0, so initialization must fail with the parser's message.
+    const Status rejected = config.initialize(makeConfigWithDelay(-5000));
+    ASSERT_EQUALS(ErrorCodes::BadValue, rejected);
+    ASSERT_STRING_CONTAINS(
+        rejected.reason(),
+        "catch-up takeover delay must be -1 (no catch-up takeover) or greater than or equal to 0");
+}
+
+// Checks that getCatchUpTakeoverDelay() reports 30000 ms when the config document has no
+// settings subdocument at all.
+TEST(ReplSetConfig, GetCatchUpTakeoverDelayDefault) {
+    // Three-member config with distinct priorities and no "settings" field.
+    const BSONObj configObj = BSON("_id"
+                                   << "rs0"
+                                   << "version"
+                                   << 1
+                                   << "members"
+                                   << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                            << "localhost:12345"
+                                                            << "priority"
+                                                            << 1)
+                                                 << BSON("_id" << 1 << "host"
+                                                               << "localhost:54321"
+                                                               << "priority"
+                                                               << 2)
+                                                 << BSON("_id" << 2 << "host"
+                                                               << "localhost:5321"
+                                                               << "priority"
+                                                               << 3)));
+
+    ReplSetConfig config;
+    ASSERT_OK(config.initialize(configObj));
+    ASSERT_OK(config.validate());
+
+    const Milliseconds expectedDefault(30000);
+    ASSERT_EQUALS(expectedDefault, config.getCatchUpTakeoverDelay());
}
TEST(ReplSetConfig, ConfirmDefaultValuesOfAndAbilityToSetWriteConcernMajorityJournalDefault) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 151c7b4b275..e1f995f4d6f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2524,7 +2524,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
auto catchupTimeout = _repl->_rsConfig.getCatchUpTimeoutPeriod();
// When catchUpTimeoutMillis is 0, we skip doing catchup entirely.
- if (catchupTimeout == Milliseconds::zero()) {
+ if (catchupTimeout == ReplSetConfig::kCatchUpDisabled) {
log() << "Skipping primary catchup since the catchup timeout is 0.";
abort_inlock();
return;
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 6248e4e6423..1f75f7c9a75 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -922,6 +922,48 @@ private:
}
};
+// Verifies that no catchup takeover is scheduled when primary catchup is disabled
+// (catchUpTimeoutMillis == 0), even though a non-negative catchup takeover delay is configured.
+TEST_F(TakeoverTest, DoesntScheduleCatchupTakeoverIfCatchupDisabledButTakeoverDelaySet) {
+    // The takeover delay must use the key "catchUpTakeoverDelayMillis": that is the settings
+    // field name ReplSetConfig extracts, so a different key would not set the delay.
+    BSONObj configObj = BSON("_id"
+                             << "mySet"
+                             << "version"
+                             << 1
+                             << "members"
+                             << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                      << "node1:12345")
+                                           << BSON("_id" << 2 << "host"
+                                                         << "node2:12345")
+                                           << BSON("_id" << 3 << "host"
+                                                         << "node3:12345"))
+                             << "protocolVersion"
+                             << 1
+                             << "settings"
+                             << BSON("catchUpTimeoutMillis" << 0 << "catchUpTakeoverDelayMillis"
+                                                            << 10000));
+    assertStartSuccess(configObj, HostAndPort("node1", 12345));
+    ReplSetConfig config = assertMakeRSConfig(configObj);
+
+    auto replCoord = getReplCoord();
+    auto now = getNet()->now();
+
+    // Put this node's applied/durable optimes ahead of the optime that the heartbeat
+    // responses below are given for node2.
+    OperationContextNoop opCtx;
+    OpTime currentOptime(Timestamp(200, 1), 0);
+    replCoord->setMyLastAppliedOpTime(currentOptime);
+    replCoord->setMyLastDurableOpTime(currentOptime);
+    OpTime behindOptime(Timestamp(100, 1), 0);
+    ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, 1));
+
+    // Make sure we're secondary and that no catchup takeover has been scheduled yet.
+    ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+    ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
+
+    // Mock a first round of heartbeat responses, which should give us enough
+    // information to know that we are fresher than the current primary.
+    now = respondToHeartbeatsUntil(config, now, HostAndPort("node2", 12345), behindOptime);
+
+    // Make sure that the catchup takeover was not scheduled.
+    ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
+}
+
TEST_F(TakeoverTest, SchedulesCatchupTakeoverIfNodeIsFresherThanCurrentPrimary) {
BSONObj configObj = BSON("_id"
<< "mySet"
@@ -962,7 +1004,7 @@ TEST_F(TakeoverTest, SchedulesCatchupTakeoverIfNodeIsFresherThanCurrentPrimary)
ASSERT(replCoord->getCatchupTakeover_forTest());
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
}
TEST_F(TakeoverTest, SchedulesCatchupTakeoverIfBothTakeoversAnOption) {
@@ -1008,7 +1050,7 @@ TEST_F(TakeoverTest, SchedulesCatchupTakeoverIfBothTakeoversAnOption) {
ASSERT_FALSE(replCoord->getPriorityTakeover_forTest());
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
}
TEST_F(TakeoverTest, CatchupTakeoverNotScheduledTwice) {
@@ -1053,7 +1095,7 @@ TEST_F(TakeoverTest, CatchupTakeoverNotScheduledTwice) {
replCoord->getCatchupTakeoverCbh_forTest();
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
// Mock another round of heartbeat responses
now = respondToHeartbeatsUntil(
@@ -1106,7 +1148,7 @@ TEST_F(TakeoverTest, CatchupAndPriorityTakeoverNotScheduledAtSameTime) {
ASSERT(replCoord->getCatchupTakeover_forTest());
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
// Mock another heartbeat where the primary is now up to date
now = respondToHeartbeatsUntil(
@@ -1164,7 +1206,7 @@ TEST_F(TakeoverTest, CatchupTakeoverCallbackCanceledIfElectionTimeoutRuns) {
ASSERT(replCoord->getCatchupTakeover_forTest());
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
// Fast forward clock to after electionTimeout and black hole all
// heartbeat requests to make sure the election timeout runs.
@@ -1234,7 +1276,7 @@ TEST_F(TakeoverTest, CatchupTakeoverCanceledIfTransitionToRollback) {
ASSERT(replCoord->getCatchupTakeover_forTest());
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
// Transitioning to rollback state should cancel the takeover
ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_ROLLBACK));
@@ -1288,7 +1330,7 @@ TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) {
ASSERT(replCoord->getCatchupTakeover_forTest());
auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
- ASSERT_EQUALS(config.getCatchupTakeoverDelay(), catchupTakeoverDelay);
+ ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
startCapturingLogMessages();
now = respondToHeartbeatsUntil(config, catchupTakeoverTime, primaryHostAndPort, behindOptime);
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 0298002326c..fbaeb9dca44 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -293,7 +293,7 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo
case HeartbeatResponseAction::CatchupTakeover: {
// Don't schedule a catchup takeover if any takeover is already scheduled.
if (!_catchupTakeoverCbh.isValid() && !_priorityTakeoverCbh.isValid()) {
- Milliseconds catchupTakeoverDelay = _rsConfig.getCatchupTakeoverDelay();
+ Milliseconds catchupTakeoverDelay = _rsConfig.getCatchUpTakeoverDelay();
_catchupTakeoverWhen = _replExecutor->now() + catchupTakeoverDelay;
log() << "Scheduling catchup takeover at " << _catchupTakeoverWhen;
_catchupTakeoverCbh = _scheduleWorkAt(
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index de85d892552..8d707362fda 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -1309,8 +1309,12 @@ HeartbeatResponseAction TopologyCoordinatorImpl::_updatePrimaryFromHBDataV1(
// the remote primary will become aware of that election eventually and step down.
if (_memberData.at(primaryIndex).getTerm() == _term && updatedConfigIndex == primaryIndex) {
- if (_memberData.at(primaryIndex).getLastAppliedOpTime() <
- _memberData.at(_selfIndex).getLastAppliedOpTime()) {
+ // Don't schedule catchup takeover if catchup takeover or primary catchup is disabled.
+ bool catchupTakeoverDisabled =
+ ReplSetConfig::kCatchUpDisabled == _rsConfig.getCatchUpTimeoutPeriod() ||
+ ReplSetConfig::kCatchUpTakeoverDisabled == _rsConfig.getCatchUpTakeoverDelay();
+ if (!catchupTakeoverDisabled && (_memberData.at(primaryIndex).getLastAppliedOpTime() <
+ _memberData.at(_selfIndex).getLastAppliedOpTime())) {
LOG(2) << "I can take over the primary due to fresher data."
<< " Current primary index: " << primaryIndex << " in term "
<< _memberData.at(primaryIndex).getTerm() << "."