summary refs log tree commit diff
diff options
context:
space:
mode:
author Wenbin Zhu <wenbin.zhu@mongodb.com> 2023-02-09 21:39:49 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-02-10 00:49:23 +0000
commit 6b19e54d461bab075ade6e3e05767a881ee37597 (patch)
tree c0e39abeebfc3b116a8685894bc7b1926fa762dd
parent 8969fd59cb4e056c37c5a24b3f2e69822b6587f7 (diff)
download mongo-6b19e54d461bab075ade6e3e05767a881ee37597.tar.gz
SERVER-72774 Prevent a node in quiesce mode to win election.
-rw-r--r-- etc/backports_required_for_multiversion_tests.yml | 4
-rw-r--r-- jstests/replsets/quiesce_mode_fails_elections.js | 63
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.cpp | 4
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp | 16
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 2
5 files changed, 85 insertions, 4 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index c8871c6c667..fe00e4762c1 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -306,6 +306,8 @@ last-continuous:
ticket: SERVER-72620
- test_file: jstests/core/timeseries/bucket_unpacking_with_sort_extended_range.js
ticket: SERVER-73110
+ - test_file: jstests/replsets/quiesce_mode_fails_elections.js
+ ticket: SERVER-72774
suites: null
last-lts:
all:
@@ -687,4 +689,6 @@ last-lts:
ticket: SERVER-72620
- test_file: jstests/core/timeseries/bucket_unpacking_with_sort_extended_range.js
ticket: SERVER-73110
+ - test_file: jstests/replsets/quiesce_mode_fails_elections.js
+ ticket: SERVER-72774
suites: null
diff --git a/jstests/replsets/quiesce_mode_fails_elections.js b/jstests/replsets/quiesce_mode_fails_elections.js
new file mode 100644
index 00000000000..be80af8017c
--- /dev/null
+++ b/jstests/replsets/quiesce_mode_fails_elections.js
@@ -0,0 +1,63 @@
+/**
+ * Tests that once a node enters quiesce mode, its in-progress and new elections cannot succeed.
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/parallel_shell_helpers.js");
+
+const rst = new ReplSetTest({
+ name: jsTestName(),
+ nodes: 3,
+ // Override the quiesce period: shutdownTimeoutMillisForSignaledShutdown is set to 5000.
+ nodeOptions: {setParameter: "shutdownTimeoutMillisForSignaledShutdown=5000"}
+});
+
+rst.startSet();
+rst.initiateWithHighElectionTimeout();
+
+const dbName = "test";
+const primary = rst.getPrimary();
+const secondary = rst.getSecondaries()[0];
+const primaryDB = primary.getDB(dbName);
+
+assert.commandWorked(
+ primaryDB.coll.insert([{_id: 0, data: "initial data"}], {writeConcern: {w: "majority"}}));
+rst.awaitReplication();
+
+jsTestLog("Make the secondary hang before processing real election vote result.");
+let voteRequestCompleteFailPoint =
+ configureFailPoint(secondary, "hangBeforeOnVoteRequestCompleteCallback");
+
+jsTestLog("Stepping up the secondary.");
+const awaitStepUp = startParallelShell(() => {
+ assert.commandFailedWithCode(db.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed);
+}, secondary.port);
+
+// Wait for the secondary to hit the failpoint. Even though the election on the secondary has not
+// finished, the primary should step down due to seeing a higher term.
+voteRequestCompleteFailPoint.wait();
+rst.waitForState(primary, ReplSetTest.State.SECONDARY);
+
+jsTestLog("Make the secondary hang after entering quiesce mode.");
+let quiesceModeFailPoint = configureFailPoint(secondary, "hangDuringQuiesceMode");
+rst.stop(secondary, null /*signal*/, {skipValidation: true}, {forRestart: true, waitpid: false});
+quiesceModeFailPoint.wait();
+
+jsTestLog("Unblock secondary election, the in-progress step up attempt should be cancelled");
+voteRequestCompleteFailPoint.off();
+awaitStepUp();
+// Check log line with id 214480: "Not becoming primary, election has been cancelled".
+checkLog.checkContainsOnceJson(secondary, 214480);
+
+jsTestLog("Attempting another stepup should fail immediately due to being in quiesce mode");
+assert.commandFailedWithCode(secondary.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed);
+// Check log line with id 4615654: "Not starting an election, since we are shutting down".
+checkLog.checkContainsOnceJson(secondary, 4615654);
+
+jsTestLog("Unblock the secondary from quiesce mode");
+quiesceModeFailPoint.off();
+
+rst.stopSet();
+})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index e6af5c08838..d4bd43a16ff 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1007,6 +1007,10 @@ bool ReplicationCoordinatorImpl::enterQuiesceModeIfSecondary(Milliseconds quiesc
return false;
}
+ // Cancel any ongoing election so that the node cannot become primary once in quiesce mode,
+ // and do not wait for cancellation to complete.
+ _cancelElectionIfNeeded(lk);
+
_inQuiesceMode = true;
_quiesceDeadline = _replExecutor->now() + quiesceTime;
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index 57959db018b..a064fedf1e7 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -49,6 +49,8 @@ namespace mongo {
namespace repl {
MONGO_FAIL_POINT_DEFINE(hangInWritingLastVoteForDryRun);
+MONGO_FAIL_POINT_DEFINE(electionHangsBeforeUpdateMemberState);
+MONGO_FAIL_POINT_DEFINE(hangBeforeOnVoteRequestCompleteCallback);
class ReplicationCoordinatorImpl::ElectionState::LoseElectionGuardV1 {
LoseElectionGuardV1(const LoseElectionGuardV1&) = delete;
@@ -137,7 +139,12 @@ ReplicationCoordinatorImpl::ElectionState::getElectionDryRunFinishedEvent(WithLo
void ReplicationCoordinatorImpl::ElectionState::cancel(WithLock) {
_isCanceled = true;
- _voteRequester->cancel();
+ // This check is necessary because _voteRequester is only initialized in _startVoteRequester.
+ // Since we don't hold mutex during the entire election process, it is possible to get here
+ // before _startVoteRequester is ever called.
+ if (_voteRequester) {
+ _voteRequester->cancel();
+ }
}
void ReplicationCoordinatorImpl::ElectionState::start(WithLock lk, StartElectionReasonEnum reason) {
@@ -396,13 +403,16 @@ void ReplicationCoordinatorImpl::ElectionState::_requestVotesForRealElection(
_replExecutor
->onEvent(nextPhaseEvh.getValue(),
[=](const executor::TaskExecutor::CallbackArgs&) {
+ if (MONGO_unlikely(hangBeforeOnVoteRequestCompleteCallback.shouldFail())) {
+ LOGV2(7277400,
+ "Hang due to hangBeforeOnVoteRequestCompleteCallback failpoint");
+ hangBeforeOnVoteRequestCompleteCallback.pauseWhileSet();
+ }
_onVoteRequestComplete(newTerm, reason);
})
.status_with_transitional_ignore();
}
-MONGO_FAIL_POINT_DEFINE(electionHangsBeforeUpdateMemberState);
-
void ReplicationCoordinatorImpl::ElectionState::_onVoteRequestComplete(
long long newTerm, StartElectionReasonEnum reason) {
stdx::lock_guard<Latch> lk(_repl->_mutex);
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 83f39f24cd1..4330baaf036 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -1327,7 +1327,7 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock lk,
_cancelCatchupTakeover_inlock();
_cancelPriorityTakeover_inlock();
_cancelAndRescheduleElectionTimeout_inlock();
- if (_inShutdown) {
+ if (_inShutdown || _inQuiesceMode) {
LOGV2_FOR_ELECTION(4615654, 0, "Not starting an election, since we are shutting down");
return;
}