diff options
author | matt dannenberg <matt.dannenberg@10gen.com> | 2015-11-24 06:33:08 -0500 |
---|---|---|
committer | matt dannenberg <matt.dannenberg@10gen.com> | 2015-11-24 09:21:24 -0500 |
commit | e8f4a2ea35060e97281221f3b1457ab7106e631e (patch) | |
tree | 4abc6c4dc5452333d24424712f943c56b400b0b4 | |
parent | cdd8c86aaae0da573bbea2e3c86267d42e8d6b56 (diff) | |
download | mongo-e8f4a2ea35060e97281221f3b1457ab7106e631e.tar.gz |
SERVER-21249 only restart heartbeats once when a node cannot find a syncsource
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 21 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 19 |
3 files changed, 29 insertions, 20 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 2e5f6feeb25..877910c792f 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -1460,8 +1460,7 @@ void ReplicationCoordinatorImpl::_stepDownContinue( return; } stdx::lock_guard<stdx::mutex> lk(_mutex); - _cancelHeartbeats_inlock(); - _startHeartbeats_inlock(cbData); + _restartHeartbeats_inlock(cbData); } void ReplicationCoordinatorImpl::_handleTimePassing( @@ -2643,18 +2642,17 @@ void ReplicationCoordinatorImpl::_chooseNewSyncSource( if (cbData.status == ErrorCodes::CallbackCanceled) { return; } + + HostAndPort oldSyncSource = _topCoord->getSyncSourceAddress(); + *newSyncSource = _topCoord->chooseNewSyncSource(_replExecutor.now(), lastTimestampFetched); stdx::lock_guard<stdx::mutex> lock(_mutex); - // If no sync source is found, schedule new heartbeats immediately to update our member state, - // allowing us to make informed sync source decisions. - if (newSyncSource->empty() && _justLostSyncSource && _selfIndex >= 0 && + // If we lost our sync source, schedule new heartbeats immediately to update our knowledge + // of other members' state, allowing us to make informed sync source decisions. 
+ if (newSyncSource->empty() && !oldSyncSource.empty() && _selfIndex >= 0 && !_getMemberState_inlock().primary()) { - _cancelHeartbeats_inlock(); - _startHeartbeats_inlock(cbData); - _justLostSyncSource = false; - } else { - _justLostSyncSource = true; + _restartHeartbeats_inlock(cbData); } } @@ -3358,8 +3356,7 @@ void ReplicationCoordinatorImpl::_scheduleElectionWinNotification() { return; } - _cancelHeartbeats_inlock(); - _startHeartbeats_inlock(cbData); + _restartHeartbeats_inlock(cbData); }; auto cbStatus = _replExecutor.scheduleWork(electionWinNotificationCallback); diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index d1cca5c96e3..a772d35e8ce 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -778,6 +778,12 @@ private: void _cancelHeartbeats_inlock(); /** + * Cancels all heartbeats, then starts a heartbeat for each member in the current config. + * Called within the executor context. + */ + void _restartHeartbeats_inlock(const ReplicationExecutor::CallbackArgs& cbData); + + /** * Asynchronously sends a heartbeat to "target". "targetIndex" is the index * into the replica set config members array that corresponds to the "target", or -1 if * we don't have a valid replica set config. @@ -1380,9 +1386,6 @@ private: // Cached copy of the current config protocol version. AtomicInt64 _protVersion; // (S) - - // Prevents a busy loop of cancelling heartbeats when we have no sync source. 
- bool _justLostSyncSource = true; // (M) }; } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index ba850f86eab..cfc8116e1b6 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -526,6 +526,12 @@ void ReplicationCoordinatorImpl::_cancelHeartbeats_inlock() { } } +void ReplicationCoordinatorImpl::_restartHeartbeats_inlock( + const ReplicationExecutor::CallbackArgs& cbData) { + _cancelHeartbeats_inlock(); + _startHeartbeats_inlock(cbData); +} + void ReplicationCoordinatorImpl::_startHeartbeats_inlock( const ReplicationExecutor::CallbackArgs& cbData) { const Date_t now = _replExecutor.now(); @@ -575,8 +581,8 @@ void ReplicationCoordinatorImpl::_handleLivenessTimeout( slaveInfo.down = true; if (_memberState.primary()) { - // Only adjust hbdata if we are primary, since only the primary has a full view of - // the entire cluster. + // Only adjust hbdata if we are primary, since only the primary has a full view + // of the entire cluster. // Secondaries might not see other secondaries in the cluster if they are not // downstream. HeartbeatResponseAction action = @@ -604,7 +610,8 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock( if (!isV1ElectionProtocol()) { return; } - // Scan liveness table for earliest date; schedule a run at (that date plus election timeout). + // Scan liveness table for earliest date; schedule a run at (that date plus election + // timeout). Date_t earliestDate = Date_t::max(); int earliestMemberId = -1; for (auto&& slaveInfo : _slaveInfo) { @@ -710,7 +717,8 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(bool isPriorityTake return; } - // We should always reschedule this callback even if we do not make it to the election process. 
+ // We should always reschedule this callback even if we do not make it to the election + // process. { stdx::lock_guard<stdx::mutex> lock(_mutex); _cancelPriorityTakeover_inlock(); @@ -719,7 +727,8 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(bool isPriorityTake if (!_topCoord->becomeCandidateIfElectable(_replExecutor.now(), getMyLastOptime())) { if (isPriorityTakeOver) { - log() << "Not starting an election for a priority takeover, since we are not electable"; + log() << "Not starting an election for a priority takeover, since we are not " + "electable"; } else { log() << "Not starting an election, since we are not electable"; } |