summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSuganthi Mani <suganthi.mani@mongodb.com>2019-06-13 05:28:51 -0400
committerSuganthi Mani <suganthi.mani@mongodb.com>2019-07-15 11:35:13 -0400
commitcc1a75e4a6d8de8478e7253da7bd6376052d57a6 (patch)
treed7b8e0ac1ab43bdfa8de260cb26300d1a86c74a9
parentd6bd2c5885215c29d723f02d8607f2c6d662aacc (diff)
downloadmongo-cc1a75e4a6d8de8478e7253da7bd6376052d57a6.tar.gz
SERVER-41355 Step down calls yieldLocksForPreparedTransactions without
holding repl mutex.
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp25
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp16
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp4
-rw-r--r--src/mongo/db/repl/topology_coordinator.h37
-rw-r--r--src/mongo/db/repl/topology_coordinator_v1_test.cpp30
5 files changed, 68 insertions, 44 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 05bd9f78ff2..b6ced090602 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2049,7 +2049,10 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
ThreadWaiter waiter(lastAppliedOpTime, &waiterWriteConcern, &condVar);
WaiterGuard guard(lk, &_replicationWaiterList, &waiter);
- while (!_topCoord->attemptStepDown(
+ // If attemptStepDown() succeeds, we are guaranteed that no concurrent step up or
+ // step down can happen afterwards. So, it's safe to release the mutex before
+ // yieldLocksForPreparedTransactions().
+ while (!_topCoord->tryToStartStepDown(
termAtStart, _replExecutor->now(), waitUntil, stepDownUntil, force)) {
// The stepdown attempt failed. We now release the RSTL to allow secondaries to read the
@@ -2085,18 +2088,23 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
// We ignore the case where waitForConditionOrInterruptUntil returns
// stdx::cv_status::timeout because in that case coming back around the loop and calling
- // attemptStepDown again will cause attemptStepDown to return ExceededTimeLimit with
- // the proper error message.
+ // tryToStartStepDown again will cause tryToStartStepDown to return ExceededTimeLimit
+ // with the proper error message.
opCtx->waitForConditionOrInterruptUntil(
condVar, lk, std::min(stepDownUntil, waitUntil));
}
}
- // Stepdown success!
+ // Prepare for unconditional stepdown success!
+ // We need to release the mutex before yielding locks for prepared transactions, which might
+ // check out sessions, to avoid deadlocks with checked-out sessions accessing this mutex.
+ lk.unlock();
- // Yield locks for prepared transactions.
yieldLocksForPreparedTransactions(opCtx);
+
+ lk.lock();
_updateAndLogStatsOnStepDown(&arsd);
+ _topCoord->finishUnconditionalStepDown();
onExitGuard.dismiss();
updateMemberState();
@@ -2664,7 +2672,14 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
if (_topCoord->isSteppingDownUnconditionally()) {
invariant(opCtx->lockState()->isRSTLExclusive());
log() << "stepping down from primary, because we received a new config";
+ // We need to release the mutex before yielding locks for prepared transactions, which
+ // might check out sessions, to avoid deadlocks with checked-out sessions accessing
+ // this mutex.
+ lk.unlock();
+
yieldLocksForPreparedTransactions(opCtx);
+
+ lk.lock();
_updateAndLogStatsOnStepDown(&arsd.get());
} else {
// Release the rstl lock as the node might have stepped down due to
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index a4349f6070c..53324e0e3f2 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -414,11 +414,16 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
return;
}
- // Yield locks for prepared transactions.
+ // We need to release the mutex before yielding locks for prepared transactions, which might
+ // check out sessions, to avoid deadlocks with checked-out sessions accessing this mutex.
+ lk.unlock();
+
yieldLocksForPreparedTransactions(opCtx.get());
- _updateAndLogStatsOnStepDown(&arsd);
+ lk.lock();
+ _updateAndLogStatsOnStepDown(&arsd);
_topCoord->finishUnconditionalStepDown();
+
const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get());
if (_pendingTermUpdateDuringStepDown) {
TopologyCoordinator::UpdateTermResult result;
@@ -630,7 +635,14 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
if (_topCoord->isSteppingDownUnconditionally()) {
invariant(opCtx->lockState()->isRSTLExclusive());
log() << "stepping down from primary, because we received a new config via heartbeat";
+ // We need to release the mutex before yielding locks for prepared transactions, which
+ // might check out sessions, to avoid deadlocks with checked-out sessions accessing
+ // this mutex.
+ lk.unlock();
+
yieldLocksForPreparedTransactions(opCtx.get());
+
+ lk.lock();
_updateAndLogStatsOnStepDown(&arsd.get());
} else {
// Release the rstl lock as the node might have stepped down due to
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 989510782c6..5aeb5f11741 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -2219,7 +2219,7 @@ void TopologyCoordinator::processLoseElection() {
_role = Role::kFollower;
}
-bool TopologyCoordinator::attemptStepDown(
+bool TopologyCoordinator::tryToStartStepDown(
long long termAtStart, Date_t now, Date_t waitUntil, Date_t stepDownUntil, bool force) {
if (_role != Role::kLeader || _leaderMode == LeaderMode::kSteppingDown ||
@@ -2255,7 +2255,7 @@ bool TopologyCoordinator::attemptStepDown(
// Stepdown attempt success!
_stepDownUntil = stepDownUntil;
- _stepDownSelfAndReplaceWith(-1);
+ prepareForUnconditionalStepDown();
return true;
}
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index 1d6975a9943..d73cbd469f9 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -542,9 +542,8 @@ public:
StatusWith<StepDownAttemptAbortFn> prepareForStepDownAttempt();
/**
- * Tries to transition the coordinator from the leader role to the follower role.
- *
- * A step down succeeds based on the following conditions:
+ * Tries to transition the coordinator's leader mode from kAttemptingStepDown to
+ * kSteppingDown only if we are able to meet the below requirements for stepdown.
*
* C1. 'force' is true and now > waitUntil
*
@@ -553,21 +552,20 @@ public:
*
* C3. There exists at least one electable secondary node in the majority set M.
*
- *
- * If C1 is true, or if both C2 and C3 are true, then the stepdown occurs and this method
- * returns true. If the conditions for successful stepdown aren't met yet, but waiting for more
- * time to pass could make it succeed, returns false. If the whole stepdown attempt should be
- * abandoned (for example because the time limit expired or because we've already stepped down),
- * throws an exception.
+ * If C1 is true, or if both C2 and C3 are true, then the transition succeeds and this
+ * method returns true. If the conditions for successful stepdown aren't met yet, but waiting
+ * for more time to pass could make it succeed, returns false. If the whole stepdown attempt
+ * should be abandoned (for example because the time limit expired or because we've already
+ * stepped down), throws an exception.
* TODO(spencer): Unify with the finishUnconditionalStepDown() method.
*/
- bool attemptStepDown(
+ bool tryToStartStepDown(
long long termAtStart, Date_t now, Date_t waitUntil, Date_t stepDownUntil, bool force);
/**
* Returns whether it is safe for a stepdown attempt to complete, ignoring the 'force' argument.
* This is essentially checking conditions C2 and C3 as described in the comment to
- * attemptStepDown().
+ * tryToStartStepDown().
*/
bool isSafeToStepDown();
@@ -758,20 +756,16 @@ private:
* | | | | | | |
* | | | | | | |
* v | | v v v |
- * kAttemptingStepDown----------->kSteppingDown |
- * | | |
- * | | |
- * | | |
- * ---------------------------------------------
- *
+ * kAttemptingStepDown----------->kSteppingDown------|
*/
enum class LeaderMode {
kNotLeader, // This node is not currently a leader.
kLeaderElect, // This node has been elected leader, but can't yet accept writes.
kMaster, // This node reports ismaster:true and can accept writes.
- kSteppingDown, // This node is in the middle of a (hb/force reconfig) stepdown that
- // must complete.
- kAttemptingStepDown, // This node is in the middle of a stepdown (cmd) that might fail.
+ kSteppingDown, // This node is in the middle of a hb, force reconfig or stepdown
+ // command that must complete.
+ kAttemptingStepDown, // This node is in the middle of a cmd initiated step down that might
+ // fail.
};
enum UnelectableReason {
@@ -868,7 +862,8 @@ private:
/**
* Returns whether a stepdown attempt should be allowed to proceed. See the comment for
- * attemptStepDown() for more details on the rules of when stepdown attempts succeed or fail.
+ * tryToStartStepDown() for more details on the rules of when stepdown attempts succeed
+ * or fail.
*/
bool _canCompleteStepDownAttempt(Date_t now, Date_t waitUntil, bool force);
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index 39408cb1394..678b3d87148 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -4244,9 +4244,10 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsWhenNotPrimary) {
setSelfMemberState(MemberState::RS_SECONDARY);
- ASSERT_THROWS_CODE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false),
- DBException,
- ErrorCodes::PrimarySteppedDown);
+ ASSERT_THROWS_CODE(
+ getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false),
+ DBException,
+ ErrorCodes::PrimarySteppedDown);
}
TEST_F(TopoCoordTest, StepDownAttemptFailsWhenAlreadySteppingDown) {
@@ -4273,9 +4274,10 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsWhenAlreadySteppingDown) {
makeSelfPrimary();
getTopoCoord().prepareForUnconditionalStepDown();
- ASSERT_THROWS_CODE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false),
- DBException,
- ErrorCodes::PrimarySteppedDown);
+ ASSERT_THROWS_CODE(
+ getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false),
+ DBException,
+ ErrorCodes::PrimarySteppedDown);
}
TEST_F(TopoCoordTest, StepDownAttemptFailsForDifferentTerm) {
@@ -4303,7 +4305,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsForDifferentTerm) {
ASSERT_OK(getTopoCoord().prepareForStepDownAttempt().getStatus());
ASSERT_THROWS_CODE(
- getTopoCoord().attemptStepDown(term - 1, curTime, futureTime, futureTime, false),
+ getTopoCoord().tryToStartStepDown(term - 1, curTime, futureTime, futureTime, false),
DBException,
ErrorCodes::PrimarySteppedDown);
}
@@ -4333,7 +4335,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfPastStepDownUntil) {
ASSERT_OK(getTopoCoord().prepareForStepDownAttempt().getStatus());
ASSERT_THROWS_CODE_AND_WHAT(
- getTopoCoord().attemptStepDown(term, curTime, futureTime, curTime, false),
+ getTopoCoord().tryToStartStepDown(term, curTime, futureTime, curTime, false),
DBException,
ErrorCodes::ExceededTimeLimit,
"By the time we were ready to step down, we were already past the time we were supposed to "
@@ -4369,7 +4371,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfPastWaitUntil) {
<< ". Please use the replSetStepDown command with the argument "
<< "{force: true} to force node to step down.";
ASSERT_THROWS_CODE_AND_WHAT(
- getTopoCoord().attemptStepDown(term, curTime, curTime, futureTime, false),
+ getTopoCoord().tryToStartStepDown(term, curTime, curTime, futureTime, false),
DBException,
ErrorCodes::ExceededTimeLimit,
expectedWhat);
@@ -4405,7 +4407,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfNoSecondariesCaughtUp) {
heartbeatFromMember(
HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term));
- ASSERT_FALSE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false));
+ ASSERT_FALSE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false));
}
TEST_F(TopoCoordTest, StepDownAttemptFailsIfNoSecondariesCaughtUpForceIsTrueButNotPastWaitUntil) {
@@ -4438,7 +4440,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfNoSecondariesCaughtUpForceIsTrueButN
heartbeatFromMember(
HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term));
- ASSERT_FALSE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, true));
+ ASSERT_FALSE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, true));
}
TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfNoSecondariesCaughtUpForceIsTrueAndPastWaitUntil) {
@@ -4471,7 +4473,7 @@ TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfNoSecondariesCaughtUpForceIsTrueA
heartbeatFromMember(
HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term));
- ASSERT_TRUE(getTopoCoord().attemptStepDown(term, curTime, curTime, futureTime, true));
+ ASSERT_TRUE(getTopoCoord().tryToStartStepDown(term, curTime, curTime, futureTime, true));
}
TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfSecondariesCaughtUp) {
@@ -4504,7 +4506,7 @@ TEST_F(TopoCoordTest, StepDownAttemptSucceedsIfSecondariesCaughtUp) {
heartbeatFromMember(
HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term));
- ASSERT_TRUE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false));
+ ASSERT_TRUE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false));
}
TEST_F(TopoCoordTest, StepDownAttemptFailsIfSecondaryCaughtUpButNotElectable) {
@@ -4541,7 +4543,7 @@ TEST_F(TopoCoordTest, StepDownAttemptFailsIfSecondaryCaughtUpButNotElectable) {
heartbeatFromMember(
HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(Timestamp(4, 0), term));
- ASSERT_FALSE(getTopoCoord().attemptStepDown(term, curTime, futureTime, futureTime, false));
+ ASSERT_FALSE(getTopoCoord().tryToStartStepDown(term, curTime, futureTime, futureTime, false));
}
TEST_F(TopoCoordTest,