diff options
author | Spencer T Brody <spencer@mongodb.com> | 2015-02-03 15:35:51 -0500 |
---|---|---|
committer | Spencer T Brody <spencer@mongodb.com> | 2015-02-04 13:40:35 -0500 |
commit | 30c0fd907f5279786c0eb6958bb61039697c5c29 (patch) | |
tree | 0d1bb9a5fa73591d0f21da8abe4f7568b15e7df8 | |
parent | 7d3d0e5c6ae4f6054d63a208f74e58fe26b65cf9 (diff) | |
download | mongo-30c0fd907f5279786c0eb6958bb61039697c5c29.tar.gz |
SERVER-17164 Don't start elections while coming out of RECOVERING state
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 6 |
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.cpp | 19 |
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.h | 4 |
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl_test.cpp | 12 |
4 files changed, 39 insertions, 2 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index da41d088a74..3ed93bfb287 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -1910,6 +1910,9 @@ namespace { const MemberState newState = _topCoord->getMemberState(); if (newState == _memberState) { if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) { + invariant(_rsConfig.getNumMembers() == 1 && + _selfIndex == 0 && + _rsConfig.getMemberAt(0).isElectable()); return kActionWinElection; } return kActionNone; @@ -1942,6 +1945,9 @@ namespace { // When transitioning to SECONDARY, the only way for _topCoord to report the candidate // role is if the configuration represents a single-node replica set. In that case, the // overriding requirement is to elect this singleton node primary. + invariant(_rsConfig.getNumMembers() == 1 && + _selfIndex == 0 && + _rsConfig.getMemberAt(0).isElectable()); result = kActionWinElection; } diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 49f328b73e5..9e4b3eba66d 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -793,6 +793,7 @@ namespace { const StatusWith<ReplSetHeartbeatResponse>& hbResponse, OpTime myLastOpApplied) { + const MemberState originalState = getMemberState(); PingStats& hbStats = _pings[target]; invariant(hbStats.getLastHeartbeatStartDate() != Date_t(0)); if (!hbResponse.isOK()) { @@ -904,6 +905,7 @@ namespace { } HeartbeatResponseAction nextAction = _updateHeartbeatDataImpl( memberIndex, + originalState, now, myLastOpApplied); @@ -913,6 +915,7 @@ namespace { HeartbeatResponseAction TopologyCoordinatorImpl::_updateHeartbeatDataImpl( int updatedConfigIndex, + const MemberState& originalState, Date_t now, const OpTime& lastOpApplied) { @@ -1086,6 +1089,19 @@ namespace { fassert(18505, 
_currentPrimaryIndex == -1); + const MemberState currentState = getMemberState(); + if (originalState.recovering() && currentState.secondary()) { + // We just transitioned from RECOVERING to SECONDARY, this can only happen if we + // received a heartbeat with an auth error when previously all the heartbeats we'd + // received had auth errors. In this case, don't return makeElectAction() because + // that could cause the election to start before the ReplicationCoordinator has updated + // its notion of the member state to SECONDARY. Instead return noAction so that the + // ReplicationCooridinator knows to update its tracking of the member state off of the + // TopologyCoordinator, and leave starting the election until the next heartbeat comes + // back. + return HeartbeatResponseAction::makeNoAction(); + } + // At this point, there is no primary anywhere. Check to see if we should become a // candidate. if (!checkShouldStandForElection(now, lastOpApplied)) { @@ -1145,7 +1161,8 @@ namespace { bool TopologyCoordinatorImpl::_isOpTimeCloseEnoughToLatestToElect( const OpTime& otherOpTime, const OpTime& ourLastOpApplied) const { const OpTime latestKnownOpTime = _latestKnownOpTime(ourLastOpApplied); - return otherOpTime.getSecs() >= (latestKnownOpTime.getSecs() - 10); + // Use addition instead of subtraction to avoid overflow. 
+ return otherOpTime.getSecs() + 10 >= (latestKnownOpTime.getSecs()); } bool TopologyCoordinatorImpl::_iAmPrimary() const { diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index 1076f08d1dd..095ca946d9c 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -250,7 +250,8 @@ namespace repl { // Sees if a majority number of votes are held by members who are currently "up" bool _aMajoritySeemsToBeUp() const; - // Is otherOpTime close enough to the latest known optime to qualify for an election + // Is otherOpTime close enough (within 10 seconds) to the latest known optime to qualify + // for an election bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime, const OpTime& ourLastOpApplied) const; @@ -290,6 +291,7 @@ namespace repl { */ HeartbeatResponseAction _updateHeartbeatDataImpl( int updatedConfigIndex, + const MemberState& originalState, Date_t now, const OpTime& lastOpApplied); diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp index a0505446905..45e10da2ad9 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp @@ -545,6 +545,18 @@ namespace { receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime(), ErrorCodes::Unauthorized); ASSERT_TRUE(getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)).empty()); ASSERT_EQUALS(MemberState::RS_RECOVERING, getTopoCoord().getMemberState().s); + + // Having an auth error but with another node up should bring us out of RECOVERING + HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("h2"), + "rs0", + MemberState::RS_SECONDARY, + OpTime(0, 0), + OpTime(2, 0), + OpTime(2, 0)); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + // Test that the heartbeat that brings us from RECOVERING to SECONDARY doesn't initiate + // an 
election (SERVER-17164) + ASSERT_NO_ACTION(action.getAction()); } TEST_F(TopoCoordTest, ReceiveHeartbeatWhileAbsentFromConfig) { |