summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Lingzhi Deng <lingzhi.deng@mongodb.com> 2021-01-20 09:36:26 -0500
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-02-17 03:41:21 +0000
commit 4fb715053b3ad308c85501e9e9d0a1169bc78556 (patch)
tree 7c0ef47221fc3d68ae6fd41e0313a37b968f4e9c
parent ecbb91d31416fc9a68d896ea255f5494ca2a54d4 (diff)
download mongo-4fb715053b3ad308c85501e9e9d0a1169bc78556.tar.gz
SERVER-53612: Fix StepDown hangs when all nodes are caught up but none is immediately electable
(cherry picked from commit 6308db5c83a3e95f4532c63df8b635b8090036ae)
-rw-r--r-- etc/backports_required_for_multiversion_tests.yml 2
-rw-r--r-- jstests/replsets/election_handoff_not_immediately_electable.js 32
-rw-r--r-- jstests/replsets/libs/election_handoff.js 8
-rw-r--r-- src/mongo/db/repl/heartbeat_response_action.cpp 4
-rw-r--r-- src/mongo/db/repl/heartbeat_response_action.h 15
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp 10
-rw-r--r-- src/mongo/db/repl/topology_coordinator.cpp 4
7 files changed, 70 insertions, 5 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index d752f7a2f73..2328b36d5bc 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -94,6 +94,8 @@ all:
test_file: jstests/replsets/election_handoff_skips_unelectable_nodes.js
- ticket: SERVER-53394
test_file: jstests/sharding/sharding_task_executor_pool_matching_policy.js
+ - ticket: SERVER-53612
+ test_file: jstests/replsets/election_handoff_not_immediately_electable.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
diff --git a/jstests/replsets/election_handoff_not_immediately_electable.js b/jstests/replsets/election_handoff_not_immediately_electable.js
new file mode 100644
index 00000000000..0111553a8c7
--- /dev/null
+++ b/jstests/replsets/election_handoff_not_immediately_electable.js
@@ -0,0 +1,32 @@
+/**
+ * Tests that election handoff succeeds when a node is caught up with the primary's lastApplied
+ * but is not immediately electable because it is frozen via replSetFreeze. (See SERVER-53612)
+ */
+
+(function() {
+"use strict";
+load("jstests/replsets/libs/election_handoff.js");
+
+const testName = jsTestName();
+const rst = ReplSetTest({name: testName, nodes: 2});
+rst.startSet();
+
+// Make sure that no election timeouts fire for the duration of the test. This helps ensure
+// that the test only passes if the election handoff itself succeeds.
+rst.initiateWithHighElectionTimeout();
+
+rst.awaitLastOpCommitted();
+
+// The testElectionHandoff call below uses a 30s secondaryCatchUpPeriodSecs. Freeze the secondary
+// for 15s so that it is not immediately electable when the election handoff starts.
+const secondary = rst.getSecondary();
+assert.commandWorked(secondary.adminCommand({replSetFreeze: 15}));
+// replSetStepUp should fail while the secondary is frozen by replSetFreeze.
+assert.commandFailedWithCode(secondary.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed);
+
+// Test that the election handoff eventually succeeds once the secondary becomes electable.
+ElectionHandoffTest.testElectionHandoff(
+ rst, 0, 1, {stepDownPeriodSecs: 30, secondaryCatchUpPeriodSecs: 30});
+
+rst.stopSet();
+})();
diff --git a/jstests/replsets/libs/election_handoff.js b/jstests/replsets/libs/election_handoff.js
index 255ebbd0e6f..3bd38150187 100644
--- a/jstests/replsets/libs/election_handoff.js
+++ b/jstests/replsets/libs/election_handoff.js
@@ -19,6 +19,9 @@ var ElectionHandoffTest = (function() {
* are:
* stepDownBySignal - When this option is set, the primary will be stepped down by stopping
* and restarting with sigterm, rather than with a replSetStepDown command
+ * stepDownPeriodSecs - The number of seconds to step down the primary.
+ * secondaryCatchUpPeriodSecs - The number of seconds that the mongod will wait for an
+ * electable secondary to catch up to the primary.
*/
function testElectionHandoff(rst, initialPrimaryId, expectedCandidateId, options = {}) {
const config = rst.getReplSetConfigFromNode();
@@ -59,8 +62,9 @@ var ElectionHandoffTest = (function() {
rst.start(initialPrimaryId, {}, true);
} else {
assert.commandWorked(primary.adminCommand({
- replSetStepDown: kStepDownPeriodSecs,
- secondaryCatchUpPeriodSecs: kStepDownPeriodSecs / 2
+ replSetStepDown: options.stepDownPeriodSecs || kStepDownPeriodSecs,
+ secondaryCatchUpPeriodSecs:
+ options.secondaryCatchUpPeriodSecs || kStepDownPeriodSecs / 2
}));
}
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp
index c21dbd0cf53..6b6bd4797a8 100644
--- a/src/mongo/db/repl/heartbeat_response_action.cpp
+++ b/src/mongo/db/repl/heartbeat_response_action.cpp
@@ -79,5 +79,9 @@ void HeartbeatResponseAction::setAdvancedOpTime(bool advanced) {
_advancedOpTime = advanced;
}
+void HeartbeatResponseAction::setBecameElectable(bool becameElectable) {
+ _becameElectable = becameElectable;
+}
+
} // namespace repl
} // namespace mongo
diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h
index 9f0b3e4da8c..194b4bb0845 100644
--- a/src/mongo/db/repl/heartbeat_response_action.h
+++ b/src/mongo/db/repl/heartbeat_response_action.h
@@ -105,6 +105,12 @@ public:
*/
void setAdvancedOpTime(bool advanced);
+ /**
+ * Sets whether or not the member has transitioned from unelectable to electable since the last
+ * heartbeat response.
+ */
+ void setBecameElectable(bool becameElectable);
+
/**
* Gets the action type of this action.
*/
@@ -136,11 +142,20 @@ public:
return _advancedOpTime;
}
+ /**
+ * Returns true if the heartbeat response results in the member transitioning from unelectable
+ * to electable.
+ */
+ bool getBecameElectable() const {
+ return _becameElectable;
+ }
+
private:
Action _action;
int _primaryIndex;
Date_t _nextHeartbeatStartDate;
bool _advancedOpTime = false;
+ bool _becameElectable = false;
};
} // namespace repl
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index c993d696a0d..0b32eacc507 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -237,9 +237,13 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
if (action.getAction() == HeartbeatResponseAction::NoAction && hbStatusResponse.isOK() &&
hbStatusResponse.getValue().hasState() &&
- hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY &&
- action.getAdvancedOpTime()) {
- _updateLastCommittedOpTimeAndWallTime(lk);
+ hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) {
+ if (action.getAdvancedOpTime()) {
+ _updateLastCommittedOpTimeAndWallTime(lk);
+ } else if (action.getBecameElectable() && _topCoord->isSteppingDown()) {
+ // Try to wake up the stepDown waiter when a new node becomes electable.
+ _wakeReadyWaiters(lk);
+ }
}
// Abort catchup if we have caught up to the latest known optime after heartbeat refreshing.
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 1df593d8330..727270bee7d 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -854,6 +854,7 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
MemberData& hbData = _memberData.at(memberIndex);
const MemberConfig member = _rsConfig.getMemberAt(memberIndex);
bool advancedOpTime = false;
+ bool becameElectable = false;
if (!hbResponse.isOK()) {
if (isUnauthorized) {
hbData.setAuthIssue(now);
@@ -871,7 +872,9 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
ReplSetHeartbeatResponse hbr = std::move(hbResponse.getValue());
LOG(3) << "setUpValues: heartbeat response good for member _id:" << member.getId();
pingsInConfig++;
+ auto wasUnelectable = hbData.isUnelectable();
advancedOpTime = hbData.setUpValues(now, std::move(hbr));
+ becameElectable = wasUnelectable && !hbData.isUnelectable();
}
HeartbeatResponseAction nextAction;
@@ -879,6 +882,7 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
nextAction.setAdvancedOpTime(advancedOpTime);
+ nextAction.setBecameElectable(becameElectable);
return nextAction;
}