diff options
author | Lingzhi Deng <lingzhi.deng@mongodb.com> | 2021-01-20 09:36:26 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-02-17 03:41:21 +0000 |
commit | 4fb715053b3ad308c85501e9e9d0a1169bc78556 (patch) | |
tree | 7c0ef47221fc3d68ae6fd41e0313a37b968f4e9c /src/mongo | |
parent | ecbb91d31416fc9a68d896ea255f5494ca2a54d4 (diff) | |
download | mongo-4fb715053b3ad308c85501e9e9d0a1169bc78556.tar.gz |
SERVER-53612: Fix StepDown hangs when all nodes are caught up but none is immediately electable
(cherry picked from commit 6308db5c83a3e95f4532c63df8b635b8090036ae)
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.h | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 4 |
4 files changed, 30 insertions, 3 deletions
```diff
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp
index c21dbd0cf53..6b6bd4797a8 100644
--- a/src/mongo/db/repl/heartbeat_response_action.cpp
+++ b/src/mongo/db/repl/heartbeat_response_action.cpp
@@ -79,5 +79,9 @@ void HeartbeatResponseAction::setAdvancedOpTime(bool advanced) {
     _advancedOpTime = advanced;
 }
 
+void HeartbeatResponseAction::setBecameElectable(bool becameElectable) {
+    _becameElectable = becameElectable;
+}
+
 } // namespace repl
 } // namespace mongo
diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h
index 9f0b3e4da8c..194b4bb0845 100644
--- a/src/mongo/db/repl/heartbeat_response_action.h
+++ b/src/mongo/db/repl/heartbeat_response_action.h
@@ -105,6 +105,12 @@ public:
      */
     void setAdvancedOpTime(bool advanced);
 
+    /*
+     * Sets whether or not the member has transitioned from unelectable to electable since the last
+     * heartbeat response.
+     */
+    void setBecameElectable(bool becameElectable);
+
     /**
      * Gets the action type of this action.
      */
@@ -136,11 +142,20 @@ public:
         return _advancedOpTime;
     }
 
+    /*
+     * Returns true if the heartbeat response results in the member transitioning from unelectable
+     * to electable.
+     */
+    bool getBecameElectable() const {
+        return _becameElectable;
+    }
+
 private:
     Action _action;
     int _primaryIndex;
     Date_t _nextHeartbeatStartDate;
     bool _advancedOpTime = false;
+    bool _becameElectable = false;
 };
 
 } // namespace repl
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index c993d696a0d..0b32eacc507 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -237,9 +237,13 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
 
     if (action.getAction() == HeartbeatResponseAction::NoAction && hbStatusResponse.isOK() &&
         hbStatusResponse.getValue().hasState() &&
-        hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY &&
-        action.getAdvancedOpTime()) {
-        _updateLastCommittedOpTimeAndWallTime(lk);
+        hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) {
+        if (action.getAdvancedOpTime()) {
+            _updateLastCommittedOpTimeAndWallTime(lk);
+        } else if (action.getBecameElectable() && _topCoord->isSteppingDown()) {
+            // Try to wake up the stepDown waiter when a new node becomes electable.
+            _wakeReadyWaiters(lk);
+        }
     }
 
     // Abort catchup if we have caught up to the latest known optime after heartbeat refreshing.
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 1df593d8330..727270bee7d 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -854,6 +854,7 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
     MemberData& hbData = _memberData.at(memberIndex);
     const MemberConfig member = _rsConfig.getMemberAt(memberIndex);
     bool advancedOpTime = false;
+    bool becameElectable = false;
     if (!hbResponse.isOK()) {
         if (isUnauthorized) {
             hbData.setAuthIssue(now);
@@ -871,7 +872,9 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
         ReplSetHeartbeatResponse hbr = std::move(hbResponse.getValue());
         LOG(3) << "setUpValues: heartbeat response good for member _id:" << member.getId();
         pingsInConfig++;
+        auto wasUnelectable = hbData.isUnelectable();
         advancedOpTime = hbData.setUpValues(now, std::move(hbr));
+        becameElectable = wasUnelectable && !hbData.isUnelectable();
     }
 
     HeartbeatResponseAction nextAction;
@@ -879,6 +882,7 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
 
     nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
     nextAction.setAdvancedOpTime(advancedOpTime);
+    nextAction.setBecameElectable(becameElectable);
     return nextAction;
 }
 
```