diff options
author | A. Jesse Jiryu Davis <jesse@mongodb.com> | 2019-03-18 08:53:15 -0400 |
---|---|---|
committer | A. Jesse Jiryu Davis <jesse@mongodb.com> | 2019-03-21 16:50:34 -0400 |
commit | a1c7c798168f13284a486153dde4b335735cea0a (patch) | |
tree | 2058f50439c0dff139a264104f86e028a74932e6 | |
parent | 979d456ccf4e6756e433799a626ca373a493fc8a (diff) | |
download | mongo-a1c7c798168f13284a486153dde4b335735cea0a.tar.gz |
SERVER-37255 Fix invariant when reconfig races with election
7 files changed, 191 insertions, 70 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml b/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml index 0131907c81e..bf393221bca 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml @@ -36,6 +36,8 @@ selector: - jstests/replsets/catchup_takeover_two_nodes_ahead.js # electionTimeoutMillis is not available in PV0. - jstests/replsets/noop_writes_wait_for_write_concern_fcv.js + # Tests a race condition in PV1, requires a PV1-specific failpoint. + - jstests/replsets/reconfig_during_election.js # Election handoff is not supported in PV0, since it requires replSetStepUp. - jstests/replsets/election_handoff*.js # replSetGetStatus doesn't return read concern majority optime in PV0. diff --git a/jstests/replsets/reconfig_during_election.js b/jstests/replsets/reconfig_during_election.js new file mode 100644 index 00000000000..4f58b3cd477 --- /dev/null +++ b/jstests/replsets/reconfig_during_election.js @@ -0,0 +1,49 @@ +/** + * SERVER-37255: replSetReconfig runs on a node that is concurrently processing an election win. + */ + +(function() { + "use strict"; + load("jstests/replsets/libs/election_handoff.js"); + load("jstests/libs/check_log.js"); + + const rst = ReplSetTest({nodes: 2}); + const nodes = rst.startSet(); + const config = rst.getReplSetConfig(); + // Prevent elections and set heartbeat timeout >> electionHangsBeforeUpdateMemberState. + config.settings = {electionTimeoutMillis: 12 * 60 * 60 * 1000, heartbeatTimeoutSecs: 60 * 1000}; + rst.initiate(config); + + const incumbent = rst.getPrimary(); + const candidate = rst.getSecondary(); + + jsTestLog("Step down"); + + assert.commandWorked(candidate.adminCommand({ + configureFailPoint: "electionHangsBeforeUpdateMemberState", + mode: "alwaysOn", + data: {waitForMillis: 10 * 1000} + })); + + // The incumbent sends replSetStepUp to the candidate for election handoff. + assert.adminCommandWorkedAllowingNetworkError(incumbent, { + replSetStepDown: ElectionHandoffTest.stepDownPeriodSecs, + secondaryCatchUpPeriodSecs: ElectionHandoffTest.stepDownPeriodSecs / 2 + }); + + jsTestLog("Wait for candidate to win the election"); + + checkLog.contains( + candidate, "election succeeded - electionHangsBeforeUpdateMemberState fail point enabled"); + + jsTestLog("Try to interrupt it with a reconfig"); + + config.members[nodes.indexOf(candidate)].priority = 2; + config.version++; + assert.commandWorked(candidate.adminCommand({replSetReconfig: config, force: true})); + + assert.commandWorked(candidate.adminCommand( + {configureFailPoint: "electionHangsBeforeUpdateMemberState", mode: "off"})); + + rst.stopSet(); +})();
\ No newline at end of file diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index c2b41c82ad3..687c48f6168 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -556,7 +556,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( if (localConfig.getReplSetName() != _settings.ourSetName()) { warning() << "Local replica set configuration document reports set name of " << localConfig.getReplSetName() << ", but command line reports " - << _settings.ourSetName() << "; waitng for reconfig or remote heartbeat"; + << _settings.ourSetName() << "; waiting for reconfig or remote heartbeat"; myIndex = StatusWith<int>(-1); } @@ -639,7 +639,12 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( _updateTerm_inlock(term); } LOG(1) << "Current term is now " << term; - _performPostMemberStateUpdateAction(action); + if (action == kActionWinElection) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _postWonElectionUpdateMemberState_inlock(); + } else { + _performPostMemberStateUpdateAction(action); + } if (!isArbiter) { _externalState->startThreads(_settings); @@ -929,8 +934,13 @@ Status ReplicationCoordinatorImpl::setFollowerMode(const MemberState& newState) const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock(nullptr); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + + if (action == kActionWinElection) { + _postWonElectionUpdateMemberState_inlock(); + } else { + lk.unlock(); + _performPostMemberStateUpdateAction(action); + } return Status::OK(); } @@ -1783,24 +1793,30 @@ Status ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, invariant(opCtx->lockState()->isW()); auto action = _updateMemberStateFromTopologyCoordinator_inlock(opCtx); - lk.unlock(); + // Seems unlikely but handle kActionWinElection in case some surprising sequence leads here. + if (action == kActionWinElection) { + _postWonElectionUpdateMemberState_inlock(); + lk.unlock(); + } else { + lk.unlock(); - if (MONGO_FAIL_POINT(stepdownHangBeforePerformingPostMemberStateUpdateActions)) { - log() << "stepping down from primary - " - "stepdownHangBeforePerformingPostMemberStateUpdateActions fail point enabled. " - "Blocking until fail point is disabled."; - while (MONGO_FAIL_POINT(stepdownHangBeforePerformingPostMemberStateUpdateActions)) { - mongo::sleepsecs(1); - { - stdx::lock_guard<stdx::mutex> lock(_mutex); - if (_inShutdown) { - break; + if (MONGO_FAIL_POINT(stepdownHangBeforePerformingPostMemberStateUpdateActions)) { + log() << "stepping down from primary - " + "stepdownHangBeforePerformingPostMemberStateUpdateActions fail point " + "enabled. Blocking until fail point is disabled."; + while (MONGO_FAIL_POINT(stepdownHangBeforePerformingPostMemberStateUpdateActions)) { + mongo::sleepsecs(1); + { + stdx::lock_guard<stdx::mutex> lock(_mutex); + if (_inShutdown) { + break; + } } } } - } - _performPostMemberStateUpdateAction(action); + _performPostMemberStateUpdateAction(action); + } }; ScopeGuard onExitGuard = MakeGuard([&] { abortFn(); @@ -1927,7 +1943,8 @@ void ReplicationCoordinatorImpl::_handleTimePassing( }(); if (wonSingleNodeElection) { - _performPostMemberStateUpdateAction(kActionWinElection); + stdx::lock_guard<stdx::mutex> lk(_mutex); + _postWonElectionUpdateMemberState_inlock(); } } @@ -2256,8 +2273,14 @@ Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) { const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock(nullptr); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + + if (action == kActionWinElection) { + _postWonElectionUpdateMemberState_inlock(); + } else { + lk.unlock(); + _performPostMemberStateUpdateAction(action); + } + return Status::OK(); } @@ -2505,13 +2528,20 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig( const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(opCtx.get(), newConfig, myIndex); - // On a reconfig we drop all snapshots so we don't mistakenely read from the wrong one. + // On a reconfig we drop all snapshots so we don't mistakenly read from the wrong one. // For example, if we change the meaning of the "committed" snapshot from applied -> durable. _dropAllSnapshots_inlock(); lk.unlock(); _resetElectionInfoOnProtocolVersionUpgrade(opCtx.get(), oldConfig, newConfig); - _performPostMemberStateUpdateAction(action); + if (action == kActionWinElection) { + lk.lock(); + _postWonElectionUpdateMemberState_inlock(); + lk.unlock(); + } else { + _performPostMemberStateUpdateAction(action); + } + _replExecutor->signalEvent(finishedEvent); } @@ -2619,8 +2649,12 @@ void ReplicationCoordinatorImpl::_finishReplSetInitiate(OperationContext* opCtx, invariant(_rsConfigState == kConfigInitiating); invariant(!_rsConfig.isInitialized()); auto action = _setCurrentRSConfig_inlock(opCtx, newConfig, myIndex); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + if (action == kActionWinElection) { + _postWonElectionUpdateMemberState_inlock(); + } else { + lk.unlock(); + _performPostMemberStateUpdateAction(action); + } } void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) { @@ -2772,52 +2806,21 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock( void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction( PostMemberStateUpdateAction action) { + invariant(action != kActionWinElection); switch (action) { case kActionNone: break; case kActionFollowerModeStateChange: - // In follower mode, or sub-mode so ensure replication is active - _externalState->signalApplierToChooseNewSyncSource(); + _onFollowerModeStateChange(); break; case kActionCloseAllConnections: _externalState->closeConnections(); _externalState->shardingOnStepDownHook(); _externalState->stopNoopWriter(); break; - case kActionWinElection: { - stdx::unique_lock<stdx::mutex> lk(_mutex); - if (isV1ElectionProtocol()) { - invariant(_topCoord->getTerm() != OpTime::kUninitializedTerm); - _electionId = OID::fromTerm(_topCoord->getTerm()); - } else { - _electionId = OID::gen(); - } - - auto ts = LogicalClock::get(getServiceContext())->reserveTicks(1).asTimestamp(); - _topCoord->processWinElection(_electionId, ts); - const PostMemberStateUpdateAction nextAction = - _updateMemberStateFromTopologyCoordinator_inlock(nullptr); - invariant(nextAction != kActionWinElection); - lk.unlock(); - _performPostMemberStateUpdateAction(nextAction); - lk.lock(); - if (!_getMemberState_inlock().primary()) { - break; - } - // Notify all secondaries of the election win. - _restartHeartbeats_inlock(); - if (isV1ElectionProtocol()) { - invariant(!_catchupState); - _catchupState = stdx::make_unique<CatchupState>(this); - _catchupState->start_inlock(); - } else { - _enterDrainMode_inlock(); - } - break; - } case kActionStartSingleNodeElection: - // In protocol version 1, single node replset will run an election instead of - // kActionWinElection as in protocol version 0. + // In protocol version 1, single node replset will run an election instead of directly + // calling _postWonElectionUpdateMemberState_inlock as in protocol version 0. _startElectSelfV1(TopologyCoordinator::StartElectionReason::kElectionTimeout); break; default: @@ -2826,6 +2829,38 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction( } } +void ReplicationCoordinatorImpl::_postWonElectionUpdateMemberState_inlock() { + if (isV1ElectionProtocol()) { + invariant(_topCoord->getTerm() != OpTime::kUninitializedTerm); + _electionId = OID::fromTerm(_topCoord->getTerm()); + } else { + _electionId = OID::gen(); + } + + auto ts = LogicalClock::get(getServiceContext())->reserveTicks(1).asTimestamp(); + _topCoord->processWinElection(_electionId, ts); + const PostMemberStateUpdateAction nextAction = + _updateMemberStateFromTopologyCoordinator_inlock(nullptr); + invariant(nextAction == kActionFollowerModeStateChange, + str::stream() << "nextAction == " << static_cast<int>(nextAction)); + invariant(_getMemberState_inlock().primary()); + // Clear the sync source. + _onFollowerModeStateChange(); + // Notify all secondaries of the election win. + _restartHeartbeats_inlock(); + if (isV1ElectionProtocol()) { + invariant(!_catchupState); + _catchupState = stdx::make_unique<CatchupState>(this); + _catchupState->start_inlock(); + } else { + _enterDrainMode_inlock(); + } +} + +void ReplicationCoordinatorImpl::_onFollowerModeStateChange() { + _externalState->signalApplierToChooseNewSyncSource(); +} + void ReplicationCoordinatorImpl::CatchupState::start_inlock() { log() << "Entering primary catch-up mode."; diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 02cf0ee8816..939dcd52233 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -814,11 +814,22 @@ private: OperationContext* opCtx); /** - * Performs a post member-state update action. Do not call while holding _mutex. + * Performs a post member-state update action. Do not call while holding _mutex. "action" must + * not be kActionWinElection, use _postWonElectionUpdateMemberState_inlock instead. */ void _performPostMemberStateUpdateAction(PostMemberStateUpdateAction action); /** + * Update state after winning an election. + */ + void _postWonElectionUpdateMemberState_inlock(); + + /** + * Helper to select appropriate sync source after transitioning from a follower state. + */ + void _onFollowerModeStateChange(); + + /** * Begins an attempt to elect this node. * Called after an incoming heartbeat changes this node's view of the set such that it * believes it can be elected PRIMARY. diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp index 07ec5d45930..a4ab4e0de83 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp @@ -272,8 +272,7 @@ void ReplicationCoordinatorImpl::_onElectCmdRunnerComplete() { _freshnessChecker.reset(NULL); _electCmdRunner.reset(NULL); auto electionFinishedEvent = _electionFinishedEvent; - lk.unlock(); - _performPostMemberStateUpdateAction(kActionWinElection); + _postWonElectionUpdateMemberState_inlock(); _replExecutor->signalEvent(electionFinishedEvent); } diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index 4c0c810781a..5b5983789c7 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -290,6 +290,8 @@ void ReplicationCoordinatorImpl::_startVoteRequester_inlock(long long newTerm) { .status_with_transitional_ignore(); } +MONGO_FP_DECLARE(electionHangsBeforeUpdateMemberState); + void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) { stdx::unique_lock<stdx::mutex> lk(_mutex); LoseElectionGuardV1 lossGuard(this); @@ -329,9 +331,16 @@ void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) { _voteRequester.reset(); auto electionFinishedEvent = _electionFinishedEvent; - lk.unlock(); - _performPostMemberStateUpdateAction(kActionWinElection); + MONGO_FAIL_POINT_BLOCK(electionHangsBeforeUpdateMemberState, customWait) { + auto waitForMillis = Milliseconds(customWait.getData()["waitForMillis"].numberInt()); + log() << "election succeeded - electionHangsBeforeUpdateMemberState fail point " + "enabled, sleeping " + << waitForMillis; + sleepFor(waitForMillis); + } + _postWonElectionUpdateMemberState_inlock(); + lk.unlock(); _replExecutor->signalEvent(electionFinishedEvent); lossGuard.dismiss(); } diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index cfe87d4a4f3..1934ad0c40b 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -263,9 +263,13 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo if (_memberState != _topCoord->getMemberState()) { const PostMemberStateUpdateAction postUpdateAction = _updateMemberStateFromTopologyCoordinator_inlock(nullptr); - lock.unlock(); - _performPostMemberStateUpdateAction(postUpdateAction); - lock.lock(); + if (postUpdateAction == kActionWinElection) { + _postWonElectionUpdateMemberState_inlock(); + } else { + lock.unlock(); + _performPostMemberStateUpdateAction(postUpdateAction); + lock.lock(); + } } break; case HeartbeatResponseAction::Reconfig: @@ -420,8 +424,14 @@ void ReplicationCoordinatorImpl::_stepDownFinish( invariant(result != TopologyCoordinator::UpdateTermResult::kTriggerStepDown); _pendingTermUpdateDuringStepDown = boost::none; } - lk.unlock(); - _performPostMemberStateUpdateAction(action); + + if (action == kActionWinElection) { + _postWonElectionUpdateMemberState_inlock(); + } else { + lk.unlock(); + _performPostMemberStateUpdateAction(action); + } + _replExecutor->signalEvent(finishedEvent); } @@ -640,9 +650,15 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( const int myIndexValue = myIndex.getStatus().isOK() ? myIndex.getValue() : -1; const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(opCtx.get(), newConfig, myIndexValue); + lk.unlock(); _resetElectionInfoOnProtocolVersionUpgrade(opCtx.get(), oldConfig, newConfig); - _performPostMemberStateUpdateAction(action); + if (action == kActionWinElection) { + lk.lock(); + _postWonElectionUpdateMemberState_inlock(); + } else { + _performPostMemberStateUpdateAction(action); + } } void ReplicationCoordinatorImpl::_trackHeartbeatHandle_inlock( |