diff options
author | Lingzhi Deng <lingzhi.deng@mongodb.com> | 2020-03-13 22:10:43 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-07 03:28:36 +0000 |
commit | 8e2737e5b88a0f639b77b51f7b57730f337fea55 (patch) | |
tree | 9990b9d5ed73ec219115a882fa56ecf5fbee6306 | |
parent | f9170b2a35d3ab9d1d6d7669d1bacf9da785a94d (diff) | |
download | mongo-8e2737e5b88a0f639b77b51f7b57730f337fea55.tar.gz |
SERVER-46517: Move the update of readWriteAbility out of _updateMemberStateFromTopologyCoordinator
(cherry picked from commit 6d0a10abd1e6f222bc16c59afc28dcfb9613b86f)
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 67 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 10 |
3 files changed, 49 insertions, 39 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index fdd4bd983c9..800ef68f611 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -1013,7 +1013,13 @@ Status ReplicationCoordinatorImpl::_setFollowerMode(OperationContext* opCtx, _topCoord->setFollowerMode(newState.s); - const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + if (opCtx && _memberState.secondary() && newState == MemberState::RS_ROLLBACK) { + // If we are switching out of SECONDARY and to ROLLBACK, we must make sure that we hold the + // RSTL in mode X to prevent readers that have the RSTL in intent mode from reading. + _readWriteAbility->setCanServeNonLocalReads(opCtx, 0U); + } + + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk); lk.unlock(); _performPostMemberStateUpdateAction(action); @@ -1098,8 +1104,9 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, // our election in onTransitionToPrimary(), above. _updateLastCommittedOpTimeAndWallTime(lk); - // Update _canAcceptNonLocalWrites - _updateMemberStateFromTopologyCoordinator(lk, opCtx); + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); + _updateMemberStateFromTopologyCoordinator(lk); log() << "transition to primary complete; database writes are now permitted" << rsLog; _drainFinishedCond.notify_all(); @@ -2076,16 +2083,18 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, // of a stepdown attempt. This will prevent us from accepting writes so that if our stepdown // attempt fails later we can release the RSTL and go to sleep to allow secondaries to // catch up without allowing new writes in. - auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); + auto action = _updateMemberStateFromTopologyCoordinator(lk); invariant(action == PostMemberStateUpdateAction::kActionNone); invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk)); - // Make sure that we leave _canAcceptNonLocalWrites in the proper state. auto updateMemberState = [&] { invariant(lk.owns_lock()); invariant(opCtx->lockState()->isRSTLExclusive()); - auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + // Make sure that we leave _canAcceptNonLocalWrites in the proper state. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); + auto action = _updateMemberStateFromTopologyCoordinator(lk); lk.unlock(); if (MONGO_FAIL_POINT(stepdownHangBeforePerformingPostMemberStateUpdateActions)) { @@ -2571,8 +2580,7 @@ Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) { return Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); } - const PostMemberStateUpdateAction action = - _updateMemberStateFromTopologyCoordinator(lk, nullptr); + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk); lk.unlock(); _performPostMemberStateUpdateAction(action); return Status::OK(); @@ -2767,6 +2775,9 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics(); _wMajorityWriteAvailabilityWaiter.reset(); + + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & @@ -2920,17 +2931,14 @@ void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) { } } -ReplicationCoordinatorImpl::PostMemberStateUpdateAction -ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock lk, - OperationContext* opCtx) { - { - // We have to do this check even if our current and target state are the same as we might - // have just failed a stepdown attempt and thus are staying in PRIMARY state but restoring - // our ability to accept writes. - bool canAcceptWrites = _topCoord->canAcceptWrites(); - _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites); - } +void ReplicationCoordinatorImpl::_updateWriteAbilityFromTopologyCoordinator( + WithLock lk, OperationContext* opCtx) { + bool canAcceptWrites = _topCoord->canAcceptWrites(); + _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites); +} +ReplicationCoordinatorImpl::PostMemberStateUpdateAction +ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock lk) { const MemberState newState = _topCoord->getMemberState(); if (newState == _memberState) { if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate) { @@ -2949,7 +2957,7 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l // Wake up the optime waiter that is waiting for primary catch-up to finish. _opTimeWaiterList.signalAll_inlock(); - // _canAcceptNonLocalWrites should already be set above. + // _canAcceptNonLocalWrites should already be set. invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk)); serverGlobalParams.validateFeaturesAsMaster.store(false); @@ -2977,12 +2985,9 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l _externalState->startProducerIfStopped(); } - if (_memberState.secondary() && newState.rollback()) { - // If we are switching out of SECONDARY and to ROLLBACK, we must make sure that we hold the - // RSTL in mode X to prevent readers that have the RSTL in intent mode from reading. - _readWriteAbility->setCanServeNonLocalReads(opCtx, 0U); - } else if (_memberState.secondary() && !newState.primary()) { - // Switching out of SECONDARY, but not to PRIMARY or ROLLBACK. + if (_memberState.secondary() && !newState.primary() && !newState.rollback()) { + // Switching out of SECONDARY, but not to PRIMARY or ROLLBACK. Note that ROLLBACK case is + // handled separately and requires RSTL lock held, see setFollowerModeStrict. _readWriteAbility->setCanServeNonLocalReads_UNSAFE(0U); } else if (!_memberState.primary() && newState.secondary()) { // Switching into SECONDARY, but not from PRIMARY. @@ -3091,8 +3096,7 @@ void ReplicationCoordinatorImpl::_postWonElectionUpdateMemberState(WithLock lk) _electionId = OID::fromTerm(_topCoord->getTerm()); auto ts = LogicalClock::get(getServiceContext())->reserveTicks(1).asTimestamp(); _topCoord->processWinElection(_electionId, ts); - const PostMemberStateUpdateAction nextAction = - _updateMemberStateFromTopologyCoordinator(lk, nullptr); + const PostMemberStateUpdateAction nextAction = _updateMemberStateFromTopologyCoordinator(lk); invariant(nextAction == kActionFollowerModeStateChange, str::stream() << "nextAction == " << static_cast<int>(nextAction)); @@ -3351,7 +3355,7 @@ ReplicationCoordinatorImpl::_setCurrentRSConfig(WithLock lk, _cancelPriorityTakeover_inlock(); _cancelAndRescheduleElectionTimeout_inlock(); - const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk); if (_selfIndex >= 0) { // Don't send heartbeats if we're not in the config, if we get re-added one of the // nodes in the set will contact us. @@ -4232,13 +4236,12 @@ bool ReplicationCoordinatorImpl::setContainsArbiter() const { void ReplicationCoordinatorImpl::ReadWriteAbility::setCanAcceptNonLocalWrites( WithLock lk, OperationContext* opCtx, bool canAcceptWrites) { - if (canAcceptWrites == canAcceptNonLocalWrites(lk)) { - return; - } - // We must be holding the RSTL in mode X to change _canAcceptNonLocalWrites. invariant(opCtx); invariant(opCtx->lockState()->isRSTLExclusive()); + if (canAcceptWrites == canAcceptNonLocalWrites(lk)) { + return; + } _canAcceptNonLocalWrites.store(canAcceptWrites); } diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 73c46dcc4e4..c6ab7789d71 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1014,18 +1014,19 @@ private: void _setConfigState_inlock(ConfigState newState); /** + * Update _canAcceptNonLocalWrites based on _topCoord->canAcceptWrites(). + */ + void _updateWriteAbilityFromTopologyCoordinator(WithLock lk, OperationContext* opCtx); + + /** * Updates the cached value, _memberState, to match _topCoord's reported * member state, from getMemberState(). * * Returns an enum indicating what action to take after releasing _mutex, if any. * Call performPostMemberStateUpdateAction on the return value after releasing * _mutex. - * - * Note: opCtx may be null as currently not all paths thread an OperationContext all the way - * down, but it must be non-null for any calls that change _canAcceptNonLocalWrites. */ - PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator(WithLock lk, - OperationContext* opCtx); + PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator(WithLock lk); /** * Performs a post member-state update action. Do not call while holding _mutex. diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index a1afae9f3ae..7ec6bd06051 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -273,7 +273,7 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct // Update the cached member state if different than the current topology member state if (_memberState != _topCoord->getMemberState()) { const PostMemberStateUpdateAction postUpdateAction = - _updateMemberStateFromTopologyCoordinator(lock, nullptr); + _updateMemberStateFromTopologyCoordinator(lock); lock.unlock(); _performPostMemberStateUpdateAction(postUpdateAction); lock.lock(); @@ -430,7 +430,10 @@ void ReplicationCoordinatorImpl::_stepDownFinish( _topCoord->finishUnconditionalStepDown(); - const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get()); + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx.get()); + + const auto action = _updateMemberStateFromTopologyCoordinator(lk); if (_pendingTermUpdateDuringStepDown) { TopologyCoordinator::UpdateTermResult result; _updateTerm_inlock(*_pendingTermUpdateDuringStepDown, &result); @@ -654,6 +657,9 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); _wMajorityWriteAvailabilityWaiter.reset(); + + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx.get()); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & |