SERVER-46517: Move the update of readWriteAbility out of _updateMemberStateFromTopologyCoordinator

(cherry picked from commit 6d0a10abd1e6f222bc16c59afc28dcfb9613b86f)
author: Lingzhi Deng <lingzhi.deng@mongodb.com> 2020-03-13 22:10:43 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-03-25 15:55:41 +0000
commit: b973f465871f19815f8a5b60b6aeffb10bd1cb78 (patch)
tree: bb88d8c5b67731be93128da3e5a459ffea470717
parent: 09ea2c315d2e7a44a49a990ed7649af3919acd4d (diff)
download: mongo-b973f465871f19815f8a5b60b6aeffb10bd1cb78.tar.gz
3 files changed, 52 insertions, 43 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 44b1723a268..8efa495886b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1031,7 +1031,13 @@ Status ReplicationCoordinatorImpl::_setFollowerMode(OperationContext* opCtx,
 
     _topCoord->setFollowerMode(newState.s);
 
-    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk, opCtx);
+    if (opCtx && _memberState.secondary() && newState == MemberState::RS_ROLLBACK) {
+        // If we are switching out of SECONDARY and to ROLLBACK, we must make sure that we hold the
+        // RSTL in mode X to prevent readers that have the RSTL in intent mode from reading.
+        _readWriteAbility->setCanServeNonLocalReads(opCtx, 0U);
+    }
+
+    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk);
     lk.unlock();
     _performPostMemberStateUpdateAction(action);
 
@@ -1156,8 +1162,9 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
     _updateLastCommittedOpTimeAndWallTime(lk);
     _wakeReadyWaiters(lk);
 
-    // Update _canAcceptNonLocalWrites
-    _updateMemberStateFromTopologyCoordinator(lk, opCtx);
+    // Update _canAcceptNonLocalWrites.
+    _updateWriteAbilityFromTopologyCoordinator(lk, opCtx);
+    _updateMemberStateFromTopologyCoordinator(lk);
 
     LOGV2_OPTIONS(21331,
                   {logv2::LogTag::kRS},
@@ -2473,16 +2480,18 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     // of a stepdown attempt.  This will prevent us from accepting writes so that if our stepdown
     // attempt fails later we can release the RSTL and go to sleep to allow secondaries to
     // catch up without allowing new writes in.
-    auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx);
+    _updateWriteAbilityFromTopologyCoordinator(lk, opCtx);
+    auto action = _updateMemberStateFromTopologyCoordinator(lk);
     invariant(action == PostMemberStateUpdateAction::kActionNone);
     invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk));
 
-    // Make sure that we leave _canAcceptNonLocalWrites in the proper state.
     auto updateMemberState = [&] {
         invariant(lk.owns_lock());
         invariant(opCtx->lockState()->isRSTLExclusive());
 
-        auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx);
+        // Make sure that we leave _canAcceptNonLocalWrites in the proper state.
+        _updateWriteAbilityFromTopologyCoordinator(lk, opCtx);
+        auto action = _updateMemberStateFromTopologyCoordinator(lk);
         lk.unlock();
 
         if (MONGO_unlikely(stepdownHangBeforePerformingPostMemberStateUpdateActions.shouldFail())) {
@@ -3029,8 +3038,7 @@ Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) {
         return Status(ErrorCodes::OperationFailed, "already out of maintenance mode");
     }
 
-    const PostMemberStateUpdateAction action =
-        _updateMemberStateFromTopologyCoordinator(lk, nullptr);
+    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk);
     lk.unlock();
     _performPostMemberStateUpdateAction(action);
     return Status::OK();
@@ -3314,6 +3322,9 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
 
             // Clear the node's election candidate metrics since it is no longer primary.
             ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
+
+            // Update _canAcceptNonLocalWrites.
+            _updateWriteAbilityFromTopologyCoordinator(lk, opCtx);
         } else {
             // Release the rstl lock as the node might have stepped down due to
             // other unconditional step down code paths like learning new term via heartbeat &
@@ -3565,24 +3576,20 @@ void ReplicationCoordinatorImpl::incrementTopologyVersion(OperationContext* opCt
     _fulfillTopologyChangePromise(opCtx, lk);
 }
 
-ReplicationCoordinatorImpl::PostMemberStateUpdateAction
-ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock lk,
-                                                                      OperationContext* opCtx) {
-    {
-        // We have to do this check even if our current and target state are the same as we might
-        // have just failed a stepdown attempt and thus are staying in PRIMARY state but restoring
-        // our ability to accept writes.
-        bool canAcceptWrites = _topCoord->canAcceptWrites();
-        _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites);
-    }
+void ReplicationCoordinatorImpl::_updateWriteAbilityFromTopologyCoordinator(
+    WithLock lk, OperationContext* opCtx) {
+    bool canAcceptWrites = _topCoord->canAcceptWrites();
+    _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites);
+}
 
+ReplicationCoordinatorImpl::PostMemberStateUpdateAction
+ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock lk) {
     // We want to respond to any waiting isMasters even if our current and target state are the
     // same as it is possible writes have been disabled during a stepDown but the primary has yet
     // to transition to SECONDARY state.
     ON_BLOCK_EXIT([&] {
         if (_rsConfig.isInitialized()) {
-            _fulfillTopologyChangePromise(opCtx, lk);
-            // Use the global ServiceContext here in case the current opCtx is null.
+            _fulfillTopologyChangePromise(nullptr, lk);
             IsMasterMetrics::get(getGlobalServiceContext())->resetNumAwaitingTopologyChanges();
         }
     });
@@ -3608,7 +3615,7 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l
         _opTimeWaiterList.setErrorAll_inlock(
             {ErrorCodes::PrimarySteppedDown, "Primary stepped down while waiting for replication"});
 
-        // _canAcceptNonLocalWrites should already be set above.
+        // _canAcceptNonLocalWrites should already be set.
         invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk));
 
         serverGlobalParams.validateFeaturesAsMaster.store(false);
@@ -3636,12 +3643,9 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l
         _externalState->startProducerIfStopped();
     }
 
-    if (_memberState.secondary() && newState.rollback()) {
-        // If we are switching out of SECONDARY and to ROLLBACK, we must make sure that we hold the
-        // RSTL in mode X to prevent readers that have the RSTL in intent mode from reading.
-        _readWriteAbility->setCanServeNonLocalReads(opCtx, 0U);
-    } else if (_memberState.secondary() && !newState.primary()) {
-        // Switching out of SECONDARY, but not to PRIMARY or ROLLBACK.
+    if (_memberState.secondary() && !newState.primary() && !newState.rollback()) {
+        // Switching out of SECONDARY, but not to PRIMARY or ROLLBACK. Note that the ROLLBACK case
+        // is handled separately and requires the RSTL to be held; see setFollowerModeStrict.
         _readWriteAbility->setCanServeNonLocalReads_UNSAFE(0U);
     } else if (!_memberState.primary() && newState.secondary()) {
         // Switching into SECONDARY, but not from PRIMARY.
@@ -3756,8 +3760,7 @@ void ReplicationCoordinatorImpl::_postWonElectionUpdateMemberState(WithLock lk)
     _electionId = OID::fromTerm(_topCoord->getTerm());
     auto ts = LogicalClock::get(getServiceContext())->reserveTicks(1).asTimestamp();
     _topCoord->processWinElection(_electionId, ts);
-    const PostMemberStateUpdateAction nextAction =
-        _updateMemberStateFromTopologyCoordinator(lk, nullptr);
+    const PostMemberStateUpdateAction nextAction = _updateMemberStateFromTopologyCoordinator(lk);
 
     invariant(nextAction == kActionFollowerModeStateChange,
               str::stream() << "nextAction == " << static_cast<int>(nextAction));
@@ -4101,7 +4104,7 @@ ReplicationCoordinatorImpl::_setCurrentRSConfig(WithLock lk,
     _cancelPriorityTakeover_inlock();
     _cancelAndRescheduleElectionTimeout_inlock();
 
-    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk, opCtx);
+    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator(lk);
     if (_selfIndex >= 0) {
         // Don't send heartbeats if we're not in the config, if we get re-added one of the
         // nodes in the set will contact us.
@@ -5059,15 +5062,13 @@ bool ReplicationCoordinatorImpl::setContainsArbiter() const {
 
 void ReplicationCoordinatorImpl::ReadWriteAbility::setCanAcceptNonLocalWrites(
     WithLock lk, OperationContext* opCtx, bool canAcceptWrites) {
-    if (canAcceptWrites == canAcceptNonLocalWrites(lk)) {
-        return;
-    }
-
     // We must be holding the RSTL in mode X to change _canAcceptNonLocalWrites.
     invariant(opCtx);
-    if (opCtx->lockState()->isRSTLExclusive()) {
-        _canAcceptNonLocalWrites.store(canAcceptWrites);
+    invariant(opCtx->lockState()->isRSTLExclusive());
+    if (canAcceptWrites == canAcceptNonLocalWrites(lk)) {
+        return;
     }
+    _canAcceptNonLocalWrites.store(canAcceptWrites);
 }
 
 bool ReplicationCoordinatorImpl::ReadWriteAbility::canAcceptNonLocalWrites(WithLock) const {
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 23aa2940b46..e0f975999f0 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1033,18 +1033,19 @@ private:
     void _fulfillTopologyChangePromise(OperationContext* opCtx, WithLock);
 
     /**
+     * Update _canAcceptNonLocalWrites based on _topCoord->canAcceptWrites().
+     */
+    void _updateWriteAbilityFromTopologyCoordinator(WithLock lk, OperationContext* opCtx);
+
+    /**
      * Updates the cached value, _memberState, to match _topCoord's reported
      * member state, from getMemberState().
      *
      * Returns an enum indicating what action to take after releasing _mutex, if any.
      * Call performPostMemberStateUpdateAction on the return value after releasing
      * _mutex.
-     *
-     * Note: opCtx may be null as currently not all paths thread an OperationContext all the way
-     * down, but it must be non-null for any calls that change _canAcceptNonLocalWrites.
      */
-    PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator(WithLock lk,
-                                                                          OperationContext* opCtx);
+    PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator(WithLock lk);
 
     /**
      * Performs a post member-state update action.  Do not call while holding _mutex.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 10c0bc1b0de..3bb13adefc4 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -285,7 +285,7 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct
             // Update the cached member state if different than the current topology member state
             if (_memberState != _topCoord->getMemberState()) {
                 const PostMemberStateUpdateAction postUpdateAction =
-                    _updateMemberStateFromTopologyCoordinator(lock, nullptr);
+                    _updateMemberStateFromTopologyCoordinator(lock);
                 lock.unlock();
                 _performPostMemberStateUpdateAction(postUpdateAction);
                 lock.lock();
@@ -438,7 +438,11 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
 
     _topCoord->finishUnconditionalStepDown();
-    const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get());
+
+    // Update _canAcceptNonLocalWrites.
+    _updateWriteAbilityFromTopologyCoordinator(lk, opCtx.get());
+
+    const auto action = _updateMemberStateFromTopologyCoordinator(lk);
     if (_pendingTermUpdateDuringStepDown) {
         TopologyCoordinator::UpdateTermResult result;
         _updateTerm_inlock(*_pendingTermUpdateDuringStepDown, &result);
@@ -691,6 +695,9 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
 
             // Clear the node's election candidate metrics since it is no longer primary.
             ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
+
+            // Update _canAcceptNonLocalWrites.
+            _updateWriteAbilityFromTopologyCoordinator(lk, opCtx.get());
         } else {
             // Release the rstl lock as the node might have stepped down due to
             // other unconditional step down code paths like learning new term via heartbeat &
author	Lingzhi Deng <lingzhi.deng@mongodb.com>	2020-03-13 22:10:43 -0400
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-03-25 15:55:41 +0000
commit	b973f465871f19815f8a5b60b6aeffb10bd1cb78 (patch)
tree	bb88d8c5b67731be93128da3e5a459ffea470717
parent	09ea2c315d2e7a44a49a990ed7649af3919acd4d (diff)
download	mongo-b973f465871f19815f8a5b60b6aeffb10bd1cb78.tar.gz