diff options
author | Lingzhi Deng <lingzhi.deng@mongodb.com> | 2020-03-13 22:10:43 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-03-25 15:55:41 +0000 |
commit | b973f465871f19815f8a5b60b6aeffb10bd1cb78 (patch) | |
tree | bb88d8c5b67731be93128da3e5a459ffea470717 | |
parent | 09ea2c315d2e7a44a49a990ed7649af3919acd4d (diff) | |
download | mongo-b973f465871f19815f8a5b60b6aeffb10bd1cb78.tar.gz |
SERVER-46517: Move the update of readWriteAbility out of _updateMemberStateFromTopologyCoordinator
(cherry picked from commit 6d0a10abd1e6f222bc16c59afc28dcfb9613b86f)
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 73 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 11 |
3 files changed, 52 insertions, 43 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 44b1723a268..8efa495886b 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -1031,7 +1031,13 @@ Status ReplicationCoordinatorImpl::_setFollowerMode(OperationContext* opCtx, _topCoord->setFollowerMode(newState.s); - const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + if (opCtx && _memberState.secondary() && newState == MemberState::RS_ROLLBACK) { + // If we are switching out of SECONDARY and to ROLLBACK, we must make sure that we hold the + // RSTL in mode X to prevent readers that have the RSTL in intent mode from reading. + _readWriteAbility->setCanServeNonLocalReads(opCtx, 0U); + } + + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk); lk.unlock(); _performPostMemberStateUpdateAction(action); @@ -1156,8 +1162,9 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, _updateLastCommittedOpTimeAndWallTime(lk); _wakeReadyWaiters(lk); - // Update _canAcceptNonLocalWrites - _updateMemberStateFromTopologyCoordinator(lk, opCtx); + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); + _updateMemberStateFromTopologyCoordinator(lk); LOGV2_OPTIONS(21331, {logv2::LogTag::kRS}, @@ -2473,16 +2480,18 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, // of a stepdown attempt. This will prevent us from accepting writes so that if our stepdown // attempt fails later we can release the RSTL and go to sleep to allow secondaries to // catch up without allowing new writes in. - auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); + auto action = _updateMemberStateFromTopologyCoordinator(lk); invariant(action == PostMemberStateUpdateAction::kActionNone); invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk)); - // Make sure that we leave _canAcceptNonLocalWrites in the proper state. auto updateMemberState = [&] { invariant(lk.owns_lock()); invariant(opCtx->lockState()->isRSTLExclusive()); - auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + // Make sure that we leave _canAcceptNonLocalWrites in the proper state. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); + auto action = _updateMemberStateFromTopologyCoordinator(lk); lk.unlock(); if (MONGO_unlikely(stepdownHangBeforePerformingPostMemberStateUpdateActions.shouldFail())) { @@ -3029,8 +3038,7 @@ Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) { return Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); } - const PostMemberStateUpdateAction action = - _updateMemberStateFromTopologyCoordinator(lk, nullptr); + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk); lk.unlock(); _performPostMemberStateUpdateAction(action); return Status::OK(); @@ -3314,6 +3322,9 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics(); + + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & @@ -3565,24 +3576,20 @@ void ReplicationCoordinatorImpl::incrementTopologyVersion(OperationContext* opCt _fulfillTopologyChangePromise(opCtx, lk); } -ReplicationCoordinatorImpl::PostMemberStateUpdateAction -ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock lk, - OperationContext* opCtx) { - { - // We have to do this check even if our current and target state are the same as we might - // have just failed a stepdown attempt and thus are staying in PRIMARY state but restoring - // our ability to accept writes. - bool canAcceptWrites = _topCoord->canAcceptWrites(); - _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites); - } +void ReplicationCoordinatorImpl::_updateWriteAbilityFromTopologyCoordinator( + WithLock lk, OperationContext* opCtx) { + bool canAcceptWrites = _topCoord->canAcceptWrites(); + _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites); +} +ReplicationCoordinatorImpl::PostMemberStateUpdateAction +ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock lk) { // We want to respond to any waiting isMasters even if our current and target state are the // same as it is possible writes have been disabled during a stepDown but the primary has yet // to transition to SECONDARY state. ON_BLOCK_EXIT([&] { if (_rsConfig.isInitialized()) { - _fulfillTopologyChangePromise(opCtx, lk); - // Use the global ServiceContext here in case the current opCtx is null. + _fulfillTopologyChangePromise(nullptr, lk); IsMasterMetrics::get(getGlobalServiceContext())->resetNumAwaitingTopologyChanges(); } }); @@ -3608,7 +3615,7 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l _opTimeWaiterList.setErrorAll_inlock( {ErrorCodes::PrimarySteppedDown, "Primary stepped down while waiting for replication"}); - // _canAcceptNonLocalWrites should already be set above. + // _canAcceptNonLocalWrites should already be set. invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk)); serverGlobalParams.validateFeaturesAsMaster.store(false); @@ -3636,12 +3643,9 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l _externalState->startProducerIfStopped(); } - if (_memberState.secondary() && newState.rollback()) { - // If we are switching out of SECONDARY and to ROLLBACK, we must make sure that we hold the - // RSTL in mode X to prevent readers that have the RSTL in intent mode from reading. - _readWriteAbility->setCanServeNonLocalReads(opCtx, 0U); - } else if (_memberState.secondary() && !newState.primary()) { - // Switching out of SECONDARY, but not to PRIMARY or ROLLBACK. + if (_memberState.secondary() && !newState.primary() && !newState.rollback()) { + // Switching out of SECONDARY, but not to PRIMARY or ROLLBACK. Note that ROLLBACK case is + // handled separately and requires RSTL lock held, see setFollowerModeStrict. _readWriteAbility->setCanServeNonLocalReads_UNSAFE(0U); } else if (!_memberState.primary() && newState.secondary()) { // Switching into SECONDARY, but not from PRIMARY. @@ -3756,8 +3760,7 @@ void ReplicationCoordinatorImpl::_postWonElectionUpdateMemberState(WithLock lk) _electionId = OID::fromTerm(_topCoord->getTerm()); auto ts = LogicalClock::get(getServiceContext())->reserveTicks(1).asTimestamp(); _topCoord->processWinElection(_electionId, ts); - const PostMemberStateUpdateAction nextAction = - _updateMemberStateFromTopologyCoordinator(lk, nullptr); + const PostMemberStateUpdateAction nextAction = _updateMemberStateFromTopologyCoordinator(lk); invariant(nextAction == kActionFollowerModeStateChange, str::stream() << "nextAction == " << static_cast<int>(nextAction)); @@ -4101,7 +4104,7 @@ ReplicationCoordinatorImpl::_setCurrentRSConfig(WithLock lk, _cancelPriorityTakeover_inlock(); _cancelAndRescheduleElectionTimeout_inlock(); - const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk, opCtx); + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk); if (_selfIndex >= 0) { // Don't send heartbeats if we're not in the config, if we get re-added one of the // nodes in the set will contact us. @@ -5059,15 +5062,13 @@ bool ReplicationCoordinatorImpl::setContainsArbiter() const { void ReplicationCoordinatorImpl::ReadWriteAbility::setCanAcceptNonLocalWrites( WithLock lk, OperationContext* opCtx, bool canAcceptWrites) { - if (canAcceptWrites == canAcceptNonLocalWrites(lk)) { - return; - } - // We must be holding the RSTL in mode X to change _canAcceptNonLocalWrites. invariant(opCtx); - if (opCtx->lockState()->isRSTLExclusive()) { - _canAcceptNonLocalWrites.store(canAcceptWrites); + invariant(opCtx->lockState()->isRSTLExclusive()); + if (canAcceptWrites == canAcceptNonLocalWrites(lk)) { + return; } + _canAcceptNonLocalWrites.store(canAcceptWrites); } bool ReplicationCoordinatorImpl::ReadWriteAbility::canAcceptNonLocalWrites(WithLock) const { diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 23aa2940b46..e0f975999f0 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1033,18 +1033,19 @@ private: void _fulfillTopologyChangePromise(OperationContext* opCtx, WithLock); /** + * Update _canAcceptNonLocalWrites based on _topCoord->canAcceptWrites(). + */ + void _updateWriteAbilityFromTopologyCoordinator(WithLock lk, OperationContext* opCtx); + + /** * Updates the cached value, _memberState, to match _topCoord's reported * member state, from getMemberState(). * * Returns an enum indicating what action to take after releasing _mutex, if any. * Call performPostMemberStateUpdateAction on the return value after releasing * _mutex. - * - * Note: opCtx may be null as currently not all paths thread an OperationContext all the way - * down, but it must be non-null for any calls that change _canAcceptNonLocalWrites. */ - PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator(WithLock lk, - OperationContext* opCtx); + PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator(WithLock lk); /** * Performs a post member-state update action. Do not call while holding _mutex. diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 10c0bc1b0de..3bb13adefc4 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -285,7 +285,7 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct // Update the cached member state if different than the current topology member state if (_memberState != _topCoord->getMemberState()) { const PostMemberStateUpdateAction postUpdateAction = - _updateMemberStateFromTopologyCoordinator(lock, nullptr); + _updateMemberStateFromTopologyCoordinator(lock); lock.unlock(); _performPostMemberStateUpdateAction(postUpdateAction); lock.lock(); @@ -438,7 +438,11 @@ void ReplicationCoordinatorImpl::_stepDownFinish( ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); _topCoord->finishUnconditionalStepDown(); - const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get()); + + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx.get()); + + const auto action = _updateMemberStateFromTopologyCoordinator(lk); if (_pendingTermUpdateDuringStepDown) { TopologyCoordinator::UpdateTermResult result; _updateTerm_inlock(*_pendingTermUpdateDuringStepDown, &result); @@ -691,6 +695,9 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); + + // Update _canAcceptNonLocalWrites. + _updateWriteAbilityFromTopologyCoordinator(lk, opCtx.get()); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & |