diff options
author | Lingzhi Deng <lingzhi.deng@mongodb.com> | 2021-01-20 09:36:26 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-02-17 03:41:21 +0000 |
commit | 4fb715053b3ad308c85501e9e9d0a1169bc78556 (patch) | |
tree | 7c0ef47221fc3d68ae6fd41e0313a37b968f4e9c | |
parent | ecbb91d31416fc9a68d896ea255f5494ca2a54d4 (diff) | |
download | mongo-4fb715053b3ad308c85501e9e9d0a1169bc78556.tar.gz |
SERVER-53612: Fix StepDown hangs when all nodes are caught up but none is immediately electable
(cherry picked from commit 6308db5c83a3e95f4532c63df8b635b8090036ae)
7 files changed, 70 insertions, 5 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index d752f7a2f73..2328b36d5bc 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -94,6 +94,8 @@ all: test_file: jstests/replsets/election_handoff_skips_unelectable_nodes.js - ticket: SERVER-53394 test_file: jstests/sharding/sharding_task_executor_pool_matching_policy.js + - ticket: SERVER-53612 + test_file: jstests/replsets/election_handoff_not_immediately_electable.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: diff --git a/jstests/replsets/election_handoff_not_immediately_electable.js b/jstests/replsets/election_handoff_not_immediately_electable.js new file mode 100644 index 00000000000..0111553a8c7 --- /dev/null +++ b/jstests/replsets/election_handoff_not_immediately_electable.js @@ -0,0 +1,32 @@ +/** + * Test that election handoff works correctly in the case where a node is caught up with primary's + * lastApplied but is not immediately electable. (See SERVER-53612) + */ + +(function() { +"use strict"; +load("jstests/replsets/libs/election_handoff.js"); + +const testName = jsTestName(); +const rst = ReplSetTest({name: testName, nodes: 2}); +rst.startSet(); + +// Make sure there are no election timeouts firing for the duration of the test. This helps +// ensure that the test will only pass if the election handoff succeeds. +rst.initiateWithHighElectionTimeout(); + +rst.awaitLastOpCommitted(); + +// ElectionHandoffTest.testElectionHandoff uses a 30s secondaryCatchUpPeriodSecs, freeze the +// secondary for 15s so that the secondary is not immediately electable during election handoff. +const secondary = rst.getSecondary(); +assert.commandWorked(secondary.adminCommand({replSetFreeze: 15})); +// replSetStepUp should fail due to replSetFreeze. +assert.commandFailedWithCode(secondary.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed); + +// Test that the election handoff works eventually when the secondary node becomes electable. +ElectionHandoffTest.testElectionHandoff( + rst, 0, 1, {stepDownPeriodSecs: 30, secondaryCatchUpPeriodSecs: 30}); + +rst.stopSet(); +})(); diff --git a/jstests/replsets/libs/election_handoff.js b/jstests/replsets/libs/election_handoff.js index 255ebbd0e6f..3bd38150187 100644 --- a/jstests/replsets/libs/election_handoff.js +++ b/jstests/replsets/libs/election_handoff.js @@ -19,6 +19,9 @@ var ElectionHandoffTest = (function() { * are: * stepDownBySignal - When this option is set, the primary will be stepped down by stopping * and restarting with sigterm, rather than with a replSetStepDown command + * stepDownPeriodSecs - The number of seconds to step down the primary. + * secondaryCatchUpPeriodSecs - The number of seconds that the mongod will wait for an + * electable secondary to catch up to the primary. */ function testElectionHandoff(rst, initialPrimaryId, expectedCandidateId, options = {}) { const config = rst.getReplSetConfigFromNode(); @@ -59,8 +62,9 @@ var ElectionHandoffTest = (function() { rst.start(initialPrimaryId, {}, true); } else { assert.commandWorked(primary.adminCommand({ - replSetStepDown: kStepDownPeriodSecs, - secondaryCatchUpPeriodSecs: kStepDownPeriodSecs / 2 + replSetStepDown: options.stepDownPeriodSecs || kStepDownPeriodSecs, + secondaryCatchUpPeriodSecs: + options.secondaryCatchUpPeriodSecs || kStepDownPeriodSecs / 2 })); } diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp index c21dbd0cf53..6b6bd4797a8 100644 --- a/src/mongo/db/repl/heartbeat_response_action.cpp +++ b/src/mongo/db/repl/heartbeat_response_action.cpp @@ -79,5 +79,9 @@ void HeartbeatResponseAction::setAdvancedOpTime(bool advanced) { _advancedOpTime = advanced; } +void HeartbeatResponseAction::setBecameElectable(bool becameElectable) { + _becameElectable = becameElectable; +} + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h index 9f0b3e4da8c..194b4bb0845 100644 --- a/src/mongo/db/repl/heartbeat_response_action.h +++ b/src/mongo/db/repl/heartbeat_response_action.h @@ -105,6 +105,12 @@ public: */ void setAdvancedOpTime(bool advanced); + /* + * Sets whether or not the member has transitioned from unelectable to electable since the last + * heartbeat response. + */ + void setBecameElectable(bool becameElectable); + /** * Gets the action type of this action. */ @@ -136,11 +142,20 @@ public: return _advancedOpTime; } + /* + * Returns true if the heartbeat response results in the member transitioning from unelectable + * to electable. + */ + bool getBecameElectable() const { + return _becameElectable; + } + private: Action _action; int _primaryIndex; Date_t _nextHeartbeatStartDate; bool _advancedOpTime = false; + bool _becameElectable = false; }; } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index c993d696a0d..0b32eacc507 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -237,9 +237,13 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse( if (action.getAction() == HeartbeatResponseAction::NoAction && hbStatusResponse.isOK() && hbStatusResponse.getValue().hasState() && - hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY && - action.getAdvancedOpTime()) { - _updateLastCommittedOpTimeAndWallTime(lk); + hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) { + if (action.getAdvancedOpTime()) { + _updateLastCommittedOpTimeAndWallTime(lk); + } else if (action.getBecameElectable() && _topCoord->isSteppingDown()) { + // Try to wake up the stepDown waiter when a new node becomes electable. + _wakeReadyWaiters(lk); + } } // Abort catchup if we have caught up to the latest known optime after heartbeat refreshing. diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index 1df593d8330..727270bee7d 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -854,6 +854,7 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( MemberData& hbData = _memberData.at(memberIndex); const MemberConfig member = _rsConfig.getMemberAt(memberIndex); bool advancedOpTime = false; + bool becameElectable = false; if (!hbResponse.isOK()) { if (isUnauthorized) { hbData.setAuthIssue(now); @@ -871,7 +872,9 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( ReplSetHeartbeatResponse hbr = std::move(hbResponse.getValue()); LOG(3) << "setUpValues: heartbeat response good for member _id:" << member.getId(); pingsInConfig++; + auto wasUnelectable = hbData.isUnelectable(); advancedOpTime = hbData.setUpValues(now, std::move(hbr)); + becameElectable = wasUnelectable && !hbData.isUnelectable(); } HeartbeatResponseAction nextAction; @@ -879,6 +882,7 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); nextAction.setAdvancedOpTime(advancedOpTime); + nextAction.setBecameElectable(becameElectable); return nextAction; } |