author    Suganthi Mani <suganthi.mani@mongodb.com>  2019-05-19 21:53:05 -0400
committer Suganthi Mani <suganthi.mani@mongodb.com>  2019-05-19 21:53:05 -0400
commit    674276b0149d3b77b51ac46adc53b11c47a26519 (patch)
tree      8bd4b6b66901c35b6514918d1c70032f8dc17f42 /src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
parent    8e6ad096f8a8b81e1be01d012920f52332650d6f (diff)
Revert "SERVER-37574 Force reconfig should kill operations that conflict state"
This reverts commit 8e6ad096f8a8b81e1be01d012920f52332650d6f.
Diffstat (limited to 'src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp')
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 96
1 file changed, 36 insertions(+), 60 deletions(-)
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index f818427ae7c..f5f7650eb71 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -38,6 +38,7 @@
 #include <algorithm>
 
 #include "mongo/base/status.h"
+#include "mongo/db/concurrency/replication_state_transition_lock_guard.h"
 #include "mongo/db/index_builds_coordinator.h"
 #include "mongo/db/kill_sessions_local.h"
 #include "mongo/db/logical_clock.h"
@@ -401,20 +402,24 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     auto opCtx = cc().makeOperationContext();
 
+    ReplicationStateTransitionLockGuard rstlLock(
+        opCtx.get(), MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly());
+
     // kill all write operations which are no longer safe to run on step down. Also, operations that
-    // have taken global lock in S mode and operations blocked on prepare conflict will be killed to
-    // avoid 3-way deadlock between read, prepared transaction and step down thread.
-    AutoGetRstlForStepUpStepDown arsd(this, opCtx.get());
-    stdx::unique_lock<stdx::mutex> lk(_mutex);
+    // have taken global lock in S mode will be killed to avoid 3-way deadlock between read,
+    // prepared transaction and step down thread.
+    KillOpContainer koc(this, opCtx.get());
+    koc.startKillOpThread();
 
-    // This node has already stepped down due to reconfig.
-    if (!_topCoord->isSteppingDownUnconditionally()) {
-        return;
+    {
+        auto rstlOnErrorGuard = makeGuard([&koc] { koc.stopAndWaitForKillOpThread(); });
+        rstlLock.waitForLockUntil(Date_t::max());
     }
 
     // Yield locks for prepared transactions.
     yieldLocksForPreparedTransactions(opCtx.get());
-    _updateAndLogStatsOnStepDown(&arsd);
+
+    stdx::unique_lock<stdx::mutex> lk(_mutex);
     _topCoord->finishUnconditionalStepDown();
     const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get());
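
Note on the restored (+) code above: the shape is to enqueue the RSTL request without blocking, start a dedicated thread that kills conflicting operations, then wait for the lock inside a scope whose guard stops that thread even if the wait throws. A minimal self-contained sketch of that shape follows; ScopeGuard and KillOpThread are simplified stand-ins for illustration, not MongoDB's makeGuard or KillOpContainer.

    #include <atomic>
    #include <chrono>
    #include <thread>

    template <typename F>
    struct ScopeGuard {
        F onExit;
        ~ScopeGuard() { onExit(); }  // fires on normal exit and on exception unwind
    };
    template <typename F>
    ScopeGuard(F) -> ScopeGuard<F>;

    struct KillOpThread {
        std::atomic<bool> stop{false};
        std::thread worker;

        void start() {
            worker = std::thread([this] {
                // The real thread repeatedly kills operations that conflict
                // with the pending state transition; here it just spins.
                while (!stop.load())
                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
            });
        }
        void stopAndWait() {
            stop.store(true);
            if (worker.joinable())
                worker.join();
        }
    };

    void stepDownFinishShape() {
        KillOpThread killer;
        killer.start();
        {
            // Whether the lock wait succeeds or throws, the guard joins the
            // killer thread before control leaves this scope.
            ScopeGuard guard{[&killer] { killer.stopAndWait(); }};
            // ... wait here for the heavyweight state-transition lock ...
        }
        // Lock acquired (or an exception propagated); killer thread is joined.
    }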
@@ -428,16 +433,10 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     }
     lk.unlock();
     _performPostMemberStateUpdateAction(action);
+    _updateAndLogStatsOnStepDown(&koc);
     _replExecutor->signalEvent(finishedEvent);
 }
 
-bool ReplicationCoordinatorImpl::_shouldStepDownOnReconfig(WithLock,
-                                                           const ReplSetConfig& newConfig,
-                                                           StatusWith<int> myIndex) {
-    return _memberState.primary() &&
-        !(myIndex.isOK() && newConfig.getMemberAt(myIndex.getValue()).isElectable());
-}
-
 void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSetConfig& newConfig) {
     if (_inShutdown) {
         return;
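
The helper deleted above is worth reading closely: it condensed the reconfig step-down rule into one predicate, namely that a primary must step down unless the new config still lists it and keeps it electable. A standalone sketch of that rule, with MemberConfig and ReplSetConfig reduced to hypothetical minimal stand-ins:

    #include <cstddef>
    #include <optional>
    #include <vector>

    struct MemberConfig {
        bool arbiterOnly = false;
        int priority = 1;
        // Simplified electability test; the real check has more conditions.
        bool isElectable() const { return !arbiterOnly && priority > 0; }
    };

    struct ReplSetConfig {
        std::vector<MemberConfig> members;
        const MemberConfig& getMemberAt(std::size_t i) const { return members[i]; }
    };

    // A primary steps down on reconfig unless it is still present (myIndex has
    // a value) and remains electable under the new configuration.
    bool shouldStepDownOnReconfig(bool isPrimary,
                                  const ReplSetConfig& newConfig,
                                  std::optional<std::size_t> myIndex) {
        return isPrimary &&
            !(myIndex.has_value() && newConfig.getMemberAt(*myIndex).isElectable());
    }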
@@ -591,59 +590,37 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
         return;
     }
 
-    // Do not conduct an election during a reconfig, as the node may not be electable post-reconfig.
-    // If there is an election in-progress, there can be at most one. No new election can happen as
-    // we have already set our ReplicationCoordinatorImpl::_rsConfigState state to
-    // "kConfigReconfiguring" which prevents new elections from happening.
-    {
-        stdx::lock_guard<stdx::mutex> lk(_mutex);
-        if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
-            LOG_FOR_HEARTBEATS(0)
-                << "Waiting for election to complete before finishing reconfig to version "
-                << newConfig.getConfigVersion();
-            // Wait for the election to complete and the node's Role to be set to follower.
-            _replExecutor
-                ->onEvent(electionFinishedEvent,
-                          [=](const executor::TaskExecutor::CallbackArgs& cbData) {
-                              _heartbeatReconfigFinish(cbData, newConfig, myIndex);
-                          })
-                .status_with_transitional_ignore();
-            return;
-        }
-    }
-
     auto opCtx = cc().makeOperationContext();
-
-    boost::optional<AutoGetRstlForStepUpStepDown> arsd;
-    stdx::unique_lock<stdx::mutex> lk(_mutex);
-    if (_shouldStepDownOnReconfig(lk, newConfig, myIndex)) {
-        _topCoord->prepareForUnconditionalStepDown();
+    boost::optional<ReplicationStateTransitionLockGuard> transitionGuard;
+    stdx::unique_lock<stdx::mutex> lk{_mutex};
+    if (_memberState.primary()) {
+        // If we are primary, we need the RSTL in mode X to step down. If we somehow
+        // transition out of primary while waiting for the RSTL, there's no harm in holding
+        // it.
         lk.unlock();
-
-        // Primary node will be either unelectable or removed after the configuration change.
-        // So, finish the reconfig under RSTL, so that the step down occurs safely.
-        arsd.emplace(this, opCtx.get());
-
+        transitionGuard.emplace(opCtx.get(), MODE_X);
         lk.lock();
-        if (_topCoord->isSteppingDownUnconditionally()) {
-            invariant(opCtx->lockState()->isRSTLExclusive());
-            log() << "stepping down from primary, because we received a new config via heartbeat";
-            yieldLocksForPreparedTransactions(opCtx.get());
-            _updateAndLogStatsOnStepDown(&arsd.get());
-        } else {
-            // Release the rstl lock as the node might have stepped down due to
-            // other unconditional step down code paths like learning new term via heartbeat &
-            // liveness timeout. And, no new election can happen as we have already set our
-            // ReplicationCoordinatorImpl::_rsConfigState state to "kConfigReconfiguring" which
-            // prevents new elections from happening. So, its safe to release the RSTL lock.
-            arsd.reset();
-        }
     }
 
     invariant(_rsConfigState == kConfigHBReconfiguring);
     invariant(!_rsConfig.isInitialized() ||
               _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+
+    // Do not conduct an election during a reconfig, as the node may not be electable post-reconfig.
+    if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
+        LOG_FOR_HEARTBEATS(0)
+            << "Waiting for election to complete before finishing reconfig to version "
+            << newConfig.getConfigVersion();
+        // Wait for the election to complete and the node's Role to be set to follower.
+        _replExecutor
+            ->onEvent(electionFinishedEvent,
+                      [=](const executor::TaskExecutor::CallbackArgs& cbData) {
+                          _heartbeatReconfigFinish(cbData, newConfig, myIndex);
+                      })
+            .status_with_transitional_ignore();
+        return;
+    }
+
     if (!myIndex.isOK()) {
         switch (myIndex.getStatus().code()) {
             case ErrorCodes::NodeNotFound:
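
Note that both placements of the election check (removed and re-added above) rely on the same non-blocking idiom: if an election is in flight, re-schedule this same callback on the election-finished event and return, rather than parking an executor thread. A rough sketch of that idiom, with Event standing in for the real TaskExecutor event handle:

    #include <functional>
    #include <utility>
    #include <vector>

    // Stand-in for the executor event: queued callbacks run when it is signaled.
    struct Event {
        std::vector<std::function<void()>> waiters;
        void onEvent(std::function<void()> cb) { waiters.push_back(std::move(cb)); }
        void signal() {
            auto pending = std::move(waiters);
            waiters.clear();
            for (auto& cb : pending)
                cb();
        }
    };

    struct Coordinator {
        Event* electionInProgress = nullptr;  // non-null while an election runs

        void heartbeatReconfigFinish(int newVersion) {
            if (electionInProgress) {
                // Defer: run this same function again once the election ends.
                electionInProgress->onEvent(
                    [this, newVersion] { heartbeatReconfigFinish(newVersion); });
                return;
            }
            // ... install the new config version here ...
        }
    };

When the election completes, the coordinator clears electionInProgress and signals the event, which re-enters heartbeatReconfigFinish with no election pending.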
@@ -669,7 +646,6 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
     const int myIndexValue = myIndex.getStatus().isOK() ? myIndex.getValue() : -1;
     const PostMemberStateUpdateAction action =
         _setCurrentRSConfig(lk, opCtx.get(), newConfig, myIndexValue);
-
     lk.unlock();
     _performPostMemberStateUpdateAction(action);
 }
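
One last note, on the lk.unlock()/lk.lock() dance around transitionGuard.emplace() above: it enforces the usual ordering rule of never waiting on a heavyweight, long-held lock while holding a lightweight mutex, then re-checking state after re-locking. A generic sketch of that shape, with plain standard-library mutexes standing in for _mutex and the RSTL:

    #include <mutex>
    #include <optional>

    std::mutex lightMutex;  // stand-in for _mutex
    std::mutex rstl;        // stand-in for the heavyweight RSTL

    void reconfigFinishShape(bool isPrimary) {
        std::unique_lock<std::mutex> lk(lightMutex);
        std::optional<std::unique_lock<std::mutex>> transitionGuard;
        if (isPrimary) {
            // Drop the light mutex before blocking on the heavyweight lock;
            // waiting while holding it invites lock-order deadlock.
            lk.unlock();
            transitionGuard.emplace(rstl);
            lk.lock();
            // State may have changed while unlocked; callers must re-check it.
        }
        // ... finish the reconfig under lk (plus the RSTL if we were primary) ...
    }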