diff options
author | Wenbin Zhu <wenbin.zhu@mongodb.com> | 2023-02-09 21:39:49 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-03-25 01:21:48 +0000 |
commit | 48f111ef3858426b6b9ce3e307718acec8bdc080 (patch) | |
tree | fca755af065e352878b161f796db39004f63c81f | |
parent | a18e964fe609f30d79a9f5dc614cffd97049e9d2 (diff) | |
download | mongo-48f111ef3858426b6b9ce3e307718acec8bdc080.tar.gz |
SERVER-72774 Prevent a node in quiesce mode from winning an election.
(cherry picked from commit 6b19e54d461bab075ade6e3e05767a881ee37597)
5 files changed, 85 insertions, 4 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 348d1c6e923..b0f56e70526 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -104,6 +104,8 @@ last-continuous: ticket: SERVER-74124 - test_file: jstests/core/timeseries/timeseries_filter_extended_range.js ticket: SERVER-69952 + - test_file: jstests/replsets/quiesce_mode_fails_elections.js + ticket: SERVER-72774 suites: change_streams_multiversion_passthrough: null change_streams_sharded_collections_multiversion_passthrough: null @@ -303,6 +305,8 @@ last-lts: ticket: SERVER-74124 - test_file: jstests/core/timeseries/timeseries_filter_extended_range.js ticket: SERVER-69952 + - test_file: jstests/replsets/quiesce_mode_fails_elections.js + ticket: SERVER-72774 suites: change_streams_multiversion_passthrough: null change_streams_sharded_collections_multiversion_passthrough: null diff --git a/jstests/replsets/quiesce_mode_fails_elections.js b/jstests/replsets/quiesce_mode_fails_elections.js new file mode 100644 index 00000000000..be80af8017c --- /dev/null +++ b/jstests/replsets/quiesce_mode_fails_elections.js @@ -0,0 +1,63 @@ +/** + * Test that once a node enters quiesce mode, any concurrent or new elections cannot succeed. + */ +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/parallel_shell_helpers.js"); + +const rst = new ReplSetTest({ + name: jsTestName(), + nodes: 3, + // Override the quiesce period. 
+ nodeOptions: {setParameter: "shutdownTimeoutMillisForSignaledShutdown=5000"} +}); + +rst.startSet(); +rst.initiateWithHighElectionTimeout(); + +const dbName = "test"; +const primary = rst.getPrimary(); +const secondary = rst.getSecondaries()[0]; +const primaryDB = primary.getDB(dbName); + +assert.commandWorked( + primaryDB.coll.insert([{_id: 0, data: "initial data"}], {writeConcern: {w: "majority"}})); +rst.awaitReplication(); + +jsTestLog("Make the secondary hang before processing real election vote result."); +let voteRequestCompleteFailPoint = + configureFailPoint(secondary, "hangBeforeOnVoteRequestCompleteCallback"); + +jsTestLog("Stepping up the secondary."); +const awaitStepUp = startParallelShell(() => { + assert.commandFailedWithCode(db.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed); +}, secondary.port); + +// Wait for secondary to hit the failpoint. Even though the election on secondary has not finished, +// the primary should step down due to seeing a higher term. +voteRequestCompleteFailPoint.wait(); +rst.waitForState(primary, ReplSetTest.State.SECONDARY); + +jsTestLog("Make the secondary hang after entering quiesce mode."); +let quiesceModeFailPoint = configureFailPoint(secondary, "hangDuringQuiesceMode"); +rst.stop(secondary, null /*signal*/, {skipValidation: true}, {forRestart: true, waitpid: false}); +quiesceModeFailPoint.wait(); + +jsTestLog("Unblock secondary election, the in-progress step up attempt should be cancelled"); +voteRequestCompleteFailPoint.off(); +awaitStepUp(); +// Check log line with id 214480: "Not becoming primary, election has been cancelled". +checkLog.checkContainsOnceJson(secondary, 214480); + +jsTestLog("Attempting another stepup should fail immediately due to being in quiesce mode"); +assert.commandFailedWithCode(secondary.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed); +// Check log line with id 4615654: "Not starting an election, since we are shutting down". 
+checkLog.checkContainsOnceJson(secondary, 4615654); + +jsTestLog("Unblock the secondary from quiesce mode"); +quiesceModeFailPoint.off(); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index aab76a4c6d8..a6df2f77d29 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -944,6 +944,10 @@ bool ReplicationCoordinatorImpl::enterQuiesceModeIfSecondary(Milliseconds quiesc return false; } + // Cancel any ongoing election so that the node cannot become primary once in quiesce mode, + // and do not wait for cancellation to complete. + _cancelElectionIfNeeded(lk); + _inQuiesceMode = true; _quiesceDeadline = _replExecutor->now() + quiesceTime; diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index 0c05b9f4a0c..e87cf7f7837 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -46,6 +46,8 @@ namespace mongo { namespace repl { MONGO_FAIL_POINT_DEFINE(hangInWritingLastVoteForDryRun); +MONGO_FAIL_POINT_DEFINE(electionHangsBeforeUpdateMemberState); +MONGO_FAIL_POINT_DEFINE(hangBeforeOnVoteRequestCompleteCallback); class ReplicationCoordinatorImpl::ElectionState::LoseElectionGuardV1 { LoseElectionGuardV1(const LoseElectionGuardV1&) = delete; @@ -134,7 +136,12 @@ ReplicationCoordinatorImpl::ElectionState::getElectionDryRunFinishedEvent(WithLo void ReplicationCoordinatorImpl::ElectionState::cancel(WithLock) { _isCanceled = true; - _voteRequester->cancel(); + // This check is necessary because _voteRequester is only initialized in _startVoteRequester. + // Since we don't hold mutex during the entire election process, it is possible to get here + // before _startVoteRequester is ever called. 
+ if (_voteRequester) { + _voteRequester->cancel(); + } } void ReplicationCoordinatorImpl::ElectionState::start(WithLock lk, StartElectionReasonEnum reason) { @@ -390,13 +397,16 @@ void ReplicationCoordinatorImpl::ElectionState::_requestVotesForRealElection( _replExecutor ->onEvent(nextPhaseEvh.getValue(), [=](const executor::TaskExecutor::CallbackArgs&) { + if (MONGO_unlikely(hangBeforeOnVoteRequestCompleteCallback.shouldFail())) { + LOGV2(7277400, + "Hang due to hangBeforeOnVoteRequestCompleteCallback failpoint"); + hangBeforeOnVoteRequestCompleteCallback.pauseWhileSet(); + } _onVoteRequestComplete(newTerm, reason); }) .status_with_transitional_ignore(); } -MONGO_FAIL_POINT_DEFINE(electionHangsBeforeUpdateMemberState); - void ReplicationCoordinatorImpl::ElectionState::_onVoteRequestComplete( long long newTerm, StartElectionReasonEnum reason) { stdx::lock_guard<Latch> lk(_repl->_mutex); diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 35c623ae56e..e6917737833 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -1155,7 +1155,7 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock lk, _cancelCatchupTakeover_inlock(); _cancelPriorityTakeover_inlock(); _cancelAndRescheduleElectionTimeout_inlock(); - if (_inShutdown) { + if (_inShutdown || _inQuiesceMode) { LOGV2_FOR_ELECTION(4615654, 0, "Not starting an election, since we are shutting down"); return; } |