diff options
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 25 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.h | 37 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp | 30 |
5 files changed, 68 insertions, 44 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 05bd9f78ff2..b6ced090602 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -2049,7 +2049,10 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, ThreadWaiter waiter(lastAppliedOpTime, &waiterWriteConcern, &condVar); WaiterGuard guard(lk, &_replicationWaiterList, &waiter); - while (!_topCoord->attemptStepDown( + // If attemptStepDown() succeeds, we are guaranteed that no concurrent step up or + // step down can happen afterwards. So, it's safe to release the mutex before + // yieldLocksForPreparedTransactions(). + while (!_topCoord->tryToStartStepDown( termAtStart, _replExecutor->now(), waitUntil, stepDownUntil, force)) { // The stepdown attempt failed. We now release the RSTL to allow secondaries to read the @@ -2085,18 +2088,23 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, // We ignore the case where waitForConditionOrInterruptUntil returns // stdx::cv_status::timeout because in that case coming back around the loop and calling - // attemptStepDown again will cause attemptStepDown to return ExceededTimeLimit with - // the proper error message. + // tryToStartStepDown again will cause tryToStartStepDown to return ExceededTimeLimit + // with the proper error message. opCtx->waitForConditionOrInterruptUntil( condVar, lk, std::min(stepDownUntil, waitUntil)); } } - // Stepdown success! + // Prepare for unconditional stepdown success! + // We need to release the mutex before yielding locks for prepared transactions, which might + // check out sessions, to avoid deadlocks with checked-out sessions accessing this mutex. + lk.unlock(); - // Yield locks for prepared transactions. yieldLocksForPreparedTransactions(opCtx); + + lk.lock(); _updateAndLogStatsOnStepDown(&arsd); + _topCoord->finishUnconditionalStepDown(); onExitGuard.dismiss(); updateMemberState(); @@ -2664,7 +2672,14 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, if (_topCoord->isSteppingDownUnconditionally()) { invariant(opCtx->lockState()->isRSTLExclusive()); log() << "stepping down from primary, because we received a new config"; + // We need to release the mutex before yielding locks for prepared transactions, which + // might check out sessions, to avoid deadlocks with checked-out sessions accessing + // this mutex. + lk.unlock(); + yieldLocksForPreparedTransactions(opCtx); + + lk.lock(); _updateAndLogStatsOnStepDown(&arsd.get()); } else { // Release the rstl lock as the node might have stepped down due to diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index a4349f6070c..53324e0e3f2 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -414,11 +414,16 @@ void ReplicationCoordinatorImpl::_stepDownFinish( return; } - // Yield locks for prepared transactions. + // We need to release the mutex before yielding locks for prepared transactions, which might + // check out sessions, to avoid deadlocks with checked-out sessions accessing this mutex. + lk.unlock(); + yieldLocksForPreparedTransactions(opCtx.get()); - _updateAndLogStatsOnStepDown(&arsd); + lk.lock(); + _updateAndLogStatsOnStepDown(&arsd); _topCoord->finishUnconditionalStepDown(); + const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get()); if (_pendingTermUpdateDuringStepDown) { TopologyCoordinator::UpdateTermResult result; @@ -630,7 +635,14 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( if (_topCoord->isSteppingDownUnconditionally()) { invariant(opCtx->lockState()->isRSTLExclusive()); log() << "stepping down from primary, because we received a new config via heartbeat"; + // We need to release the mutex before yielding locks for prepared transactions, which + // might check out sessions, to avoid deadlocks with checked-out sessions accessing + // this mutex. + lk.unlock(); + yieldLocksForPreparedTransactions(opCtx.get()); + + lk.lock(); _updateAndLogStatsOnStepDown(&arsd.get()); } else { // Release the rstl lock as the node might have stepped down due to diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index 989510782c6..5aeb5f11741 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -2219,7 +2219,7 @@ void TopologyCoordinator::processLoseElection() { _role = Role::kFollower; } -bool TopologyCoordinator::attemptStepDown( +bool TopologyCoordinator::tryToStartStepDown( long long termAtStart, Date_t now, Date_t waitUntil, Date_t stepDownUntil, bool force) { if (_role != Role::kLeader || _leaderMode == LeaderMode::kSteppingDown || @@ -2255,7 +2255,7 @@ bool TopologyCoordinator::attemptStepDown( // Stepdown attempt success! _stepDownUntil = stepDownUntil; - _stepDownSelfAndReplaceWith(-1); + prepareForUnconditionalStepDown(); return true; } diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index 1d6975a9943..d73cbd469f9 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -542,9 +542,8 @@ public: StatusWith<StepDownAttemptAbortFn> prepareForStepDownAttempt(); /** - * Tries to transition the coordinator from the leader role to the follower role. - * - * A step down succeeds based on the following conditions: + * Tries to transition the coordinator's leader mode from kAttemptingStepDown to + * kSteppingDown only if we are able to meet the below requirements for stepdown. * * C1. 'force' is true and now > waitUntil * @@ -553,21 +552,20 @@ public: * * C3. There exists at least one electable secondary node in the majority set M. * - * - * If C1 is true, or if both C2 and C3 are true, then the stepdown occurs and this method - * returns true. If the conditions for successful stepdown aren't met yet, but waiting for more - * time to pass could make it succeed, returns false. If the whole stepdown attempt should be - * abandoned (for example because the time limit expired or because we've already stepped down), - * throws an exception. + * If C1 is true, or if both C2 and C3 are true, then the transition succeeds and this + * method returns true. If the conditions for successful stepdown aren't met yet, but waiting + * for more time to pass could make it succeed, returns false. If the whole stepdown attempt + * should be abandoned (for example because the time limit expired or because we've already + * stepped down), throws an exception. * TODO(spencer): Unify with the finishUnconditionalStepDown() method. */ - bool attemptStepDown( + bool tryToStartStepDown( long long termAtStart, Date_t now, Date_t waitUntil, Date_t stepDownUntil, bool force); /** * Returns whether it is safe for a stepdown attempt to complete, ignoring the 'force' argument. * This is essentially checking conditions C2 and C3 as described in the comment to - * attemptStepDown(). + * tryToStartStepDown(). */ bool isSafeToStepDown(); @@ -758,20 +756,16 @@ private: * | | | | | | | * | | | | | | | * v | | v v v | - * kAttemptingStepDown----------->kSteppingDown | - * | | | - * | | | - * | | | - * --------------------------------------------- - * + * kAttemptingStepDown----------->kSteppingDown------| */ enum class LeaderMode { kNotLeader, // This node is not currently a leader. kLeaderElect, // This node has been elected leader, but can't yet accept writes. kMaster, // This node reports ismaster:true and can accept writes. - kSteppingDown, // This node is in the middle of a (hb/force reconfig) stepdown that - // must complete. - kAttemptingStepDown, // This node is in the middle of a stepdown (cmd) that might fail. + kSteppingDown, // This node is in the middle of a hb, force reconfig or stepdown + // command that must complete. + kAttemptingStepDown, // This node is in the middle of a cmd initiated step down that might + // fail. }; enum UnelectableReason { @@ -868,7 +862,8 @@ private: /** * Returns whether a stepdown attempt should be allowed to proceed. See the comment for - * attemptStepDown() for more details on the rules of when stepdown attempts succeed or fail. + * tryToStartStepDown() for more details on the rules of when stepdown attempts succeed + * or fail. */ bool _canCompleteStepDownAttempt(Date_t now, Date_t waitUntil, bool force); diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index 39408cb1394..678b3d87148 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -4244,9 +4244,10 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsWhenNotPrimary) { setSelfMemberState(MemberState::RS_SECONDARY); - ASSERT_THROWS_CODE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false), - DBException, - ErrorCodes::PrimarySteppedDown); + ASSERT_THROWS_CODE( + getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false), + DBException, + ErrorCodes::PrimarySteppedDown); } TEST_F(TopoCoordTest, StepDownAttemptFailsWhenAlreadySteppingDown) { @@ -4273,9 +4274,10 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsWhenAlreadySteppingDown) { makeSelfPrimary(); getTopoCoord().prepareForUnconditionalStepDown(); - ASSERT_THROWS_CODE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false), - DBException, - ErrorCodes::PrimarySteppedDown); + ASSERT_THROWS_CODE( + getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false), + DBException, + ErrorCodes::PrimarySteppedDown); } TEST_F(TopoCoordTest, StepDownAttemptFailsForDifferentTerm) { @@ -4303,7 +4305,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsForDifferentTerm) { ASSERT_OK(getTopoCoord().prepareForStepDownAttempt().getStatus()); ASSERT_THROWS_CODE( - getTopoCoord().attemptStepDown(term - 1, curTime, futureTime, futureTime, false), + getTopoCoord().tryToStartStepDown(term - 1, curTime, futureTime, futureTime, false), DBException, ErrorCodes::PrimarySteppedDown); } @@ -4333,7 +4335,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfPastStepDownUntil) { ASSERT_OK(getTopoCoord().prepareForStepDownAttempt().getStatus()); ASSERT_THROWS_CODE_AND_WHAT( - getTopoCoord().attemptStepDown(term, curTime, futureTime, curTime, false), + getTopoCoord().tryToStartStepDown(term, curTime, futureTime, curTime, false), DBException, ErrorCodes::ExceededTimeLimit, "By the time we were ready to step down, we were already past the time we were supposed to " @@ -4369,7 +4371,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfPastWaitUntil) { << ". Please use the replSetStepDown command with the argument " << "{force: true} to force node to step down."; ASSERT_THROWS_CODE_AND_WHAT( - getTopoCoord().attemptStepDown(term, curTime, curTime, futureTime, false), + getTopoCoord().tryToStartStepDown(term, curTime, curTime, futureTime, false), DBException, ErrorCodes::ExceededTimeLimit, expectedWhat); @@ -4405,7 +4407,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfNoSecondariesCaughtUp) { heartbeatFromMember( HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term)); - ASSERT_FALSE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false)); + ASSERT_FALSE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false)); } TEST_F(TopoCoordTest, StepDownAttemptFailsIfNoSecondariesCaughtUpForceIsTrueButNotPastWaitUntil) { @@ -4438,7 +4440,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfNoSecondariesCaughtUpForceIsTrueButN heartbeatFromMember( HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term)); - ASSERT_FALSE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, true)); + ASSERT_FALSE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, true)); } TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfNoSecondariesCaughtUpForceIsTrueAndPastWaitUntil) { @@ -4471,7 +4473,7 @@ TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfNoSecondariesCaughtUpForceIsTrueA heartbeatFromMember( HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term)); - ASSERT_TRUE(getTopoCoord().attemptStepDown(term, curTime, curTime, futureTime, true)); + ASSERT_TRUE(getTopoCoord().tryToStartStepDown(term, curTime, curTime, futureTime, true)); } TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfSecondariesCaughtUp) { @@ -4504,7 +4506,7 @@ TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfSecondariesCaughtUp) { heartbeatFromMember( HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term)); - ASSERT_TRUE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false)); + ASSERT_TRUE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false)); } TEST_F(TopoCoordTest, StepDownAttemptFailsIfSecondaryCaughtUpButNotElectable) { @@ -4541,7 +4543,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfSecondaryCaughtUpButNotElectable) { heartbeatFromMember( HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term)); - ASSERT_FALSE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false)); + ASSERT_FALSE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false)); } TEST_F(TopoCoordTest, |