summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSpencer T Brody <spencer@mongodb.com>2015-02-03 15:35:51 -0500
committerRamon Fernandez <ramon.fernandez@mongodb.com>2015-02-04 13:47:50 -0500
commit02c6f0b6c12a0ecbb6d37fd2224f14deefe93ba2 (patch)
tree30270b3a3232c4b7299558a99eac92a4e82352e8
parent245f53125f4eb18d881b4278fc7bd99c6cbca9d0 (diff)
downloadmongo-02c6f0b6c12a0ecbb6d37fd2224f14deefe93ba2.tar.gz
SERVER-17164 Don't start elections while coming out of RECOVERING state
(cherry picked from commit 30c0fd907f5279786c0eb6958bb61039697c5c29)
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp6
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.cpp19
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.h4
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl_test.cpp12
4 files changed, 39 insertions, 2 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index da41d088a74..3ed93bfb287 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1910,6 +1910,9 @@ namespace {
const MemberState newState = _topCoord->getMemberState();
if (newState == _memberState) {
if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) {
+ invariant(_rsConfig.getNumMembers() == 1 &&
+ _selfIndex == 0 &&
+ _rsConfig.getMemberAt(0).isElectable());
return kActionWinElection;
}
return kActionNone;
@@ -1942,6 +1945,9 @@ namespace {
// When transitioning to SECONDARY, the only way for _topCoord to report the candidate
// role is if the configuration represents a single-node replica set. In that case, the
// overriding requirement is to elect this singleton node primary.
+ invariant(_rsConfig.getNumMembers() == 1 &&
+ _selfIndex == 0 &&
+ _rsConfig.getMemberAt(0).isElectable());
result = kActionWinElection;
}
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index 49f328b73e5..9e4b3eba66d 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -793,6 +793,7 @@ namespace {
const StatusWith<ReplSetHeartbeatResponse>& hbResponse,
OpTime myLastOpApplied) {
+ const MemberState originalState = getMemberState();
PingStats& hbStats = _pings[target];
invariant(hbStats.getLastHeartbeatStartDate() != Date_t(0));
if (!hbResponse.isOK()) {
@@ -904,6 +905,7 @@ namespace {
}
HeartbeatResponseAction nextAction = _updateHeartbeatDataImpl(
memberIndex,
+ originalState,
now,
myLastOpApplied);
@@ -913,6 +915,7 @@ namespace {
HeartbeatResponseAction TopologyCoordinatorImpl::_updateHeartbeatDataImpl(
int updatedConfigIndex,
+ const MemberState& originalState,
Date_t now,
const OpTime& lastOpApplied) {
@@ -1086,6 +1089,19 @@ namespace {
fassert(18505, _currentPrimaryIndex == -1);
+ const MemberState currentState = getMemberState();
+ if (originalState.recovering() && currentState.secondary()) {
+ // We just transitioned from RECOVERING to SECONDARY. This can only happen if we
+ // received a heartbeat without an auth error when previously all the heartbeats we'd
+ // received had auth errors. In this case, don't return makeElectAction() because
+ // that could cause the election to start before the ReplicationCoordinator has updated
+ // its notion of the member state to SECONDARY. Instead return noAction so that the
+ // ReplicationCoordinator knows to update its tracking of the member state off of the
+ // TopologyCoordinator, and leave starting the election until the next heartbeat comes
+ // back.
+ return HeartbeatResponseAction::makeNoAction();
+ }
+
// At this point, there is no primary anywhere. Check to see if we should become a
// candidate.
if (!checkShouldStandForElection(now, lastOpApplied)) {
@@ -1145,7 +1161,8 @@ namespace {
bool TopologyCoordinatorImpl::_isOpTimeCloseEnoughToLatestToElect(
const OpTime& otherOpTime, const OpTime& ourLastOpApplied) const {
const OpTime latestKnownOpTime = _latestKnownOpTime(ourLastOpApplied);
- return otherOpTime.getSecs() >= (latestKnownOpTime.getSecs() - 10);
+ // Use addition instead of subtraction to avoid underflow when the latest optime is < 10s.
+ return otherOpTime.getSecs() + 10 >= (latestKnownOpTime.getSecs());
}
bool TopologyCoordinatorImpl::_iAmPrimary() const {
diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h
index 1076f08d1dd..095ca946d9c 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.h
+++ b/src/mongo/db/repl/topology_coordinator_impl.h
@@ -250,7 +250,8 @@ namespace repl {
// Sees if a majority number of votes are held by members who are currently "up"
bool _aMajoritySeemsToBeUp() const;
- // Is otherOpTime close enough to the latest known optime to qualify for an election
+ // Is otherOpTime close enough (within 10 seconds) to the latest known optime to qualify
+ // for an election
bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime,
const OpTime& ourLastOpApplied) const;
@@ -290,6 +291,7 @@ namespace repl {
*/
HeartbeatResponseAction _updateHeartbeatDataImpl(
int updatedConfigIndex,
+ const MemberState& originalState,
Date_t now,
const OpTime& lastOpApplied);
diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp
index a0505446905..45e10da2ad9 100644
--- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp
@@ -545,6 +545,18 @@ namespace {
receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime(), ErrorCodes::Unauthorized);
ASSERT_TRUE(getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)).empty());
ASSERT_EQUALS(MemberState::RS_RECOVERING, getTopoCoord().getMemberState().s);
+
+ // Having an auth error but with another node up should bring us out of RECOVERING
+ HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("h2"),
+ "rs0",
+ MemberState::RS_SECONDARY,
+ OpTime(0, 0),
+ OpTime(2, 0),
+ OpTime(2, 0));
+ ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s);
+ // Test that the heartbeat that brings us from RECOVERING to SECONDARY doesn't initiate
+ // an election (SERVER-17164)
+ ASSERT_NO_ACTION(action.getAction());
}
TEST_F(TopoCoordTest, ReceiveHeartbeatWhileAbsentFromConfig) {