summary | refs | log | tree | commit | diff
path: root/src/mongo
diff options
context:
space:
mode:
author: Adi Zaimi <adizaimi@yahoo.com> 2022-01-24 19:20:35 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-01-24 20:44:26 +0000
commit: 757f44e1a1cc496f8030e00249aaf43163a85677 (patch)
tree: ba0e24fde45bfb565e1110f7f247bb7583bc11fb /src/mongo
parent: e33cee0a9be983e8bd349b8adfbaac09d55f56ac (diff)
download: mongo-757f44e1a1cc496f8030e00249aaf43163a85677.tar.gz
SERVER-56756 fassert in stepup/down when RSTL timeout is above threshold
Diffstat (limited to 'src/mongo')
-rw-r--r-- src/mongo/db/repl/repl_server_parameters.idl 11
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.cpp 37
2 files changed, 40 insertions, 8 deletions
diff --git a/src/mongo/db/repl/repl_server_parameters.idl b/src/mongo/db/repl/repl_server_parameters.idl
index 93e34020947..367351fe0d1 100644
--- a/src/mongo/db/repl/repl_server_parameters.idl
+++ b/src/mongo/db/repl/repl_server_parameters.idl
@@ -596,6 +596,17 @@ server_parameters:
validator:
gte: 1
+ fassertOnLockTimeoutForStepUpDown:
+ description: >-
+ Time limit threshold to fassert if getting RSTL times out when executing a stepdown or stepup command.
+ Set to 0 to disable.
+ set_at: [ startup, runtime ]
+ cpp_vartype: AtomicWord<int>
+ cpp_varname: fassertOnLockTimeoutForStepUpDown
+ default: 15
+ validator:
+ gte: 0
+
feature_flags:
featureFlagRetryableFindAndModify:
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 55b52fcf4c0..0021211d17e 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2504,14 +2504,36 @@ ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::AutoGetRstlForStepUpSt
// The state transition should never be rollback within this class.
invariant(_stateTransition != ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback);
- // Enqueues RSTL in X mode.
- _rstlLock.emplace(_opCtx, MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly());
-
- ON_BLOCK_EXIT([&] { _stopAndWaitForKillOpThread(); });
- _startKillOpThread();
+ int rstlTimeout = fassertOnLockTimeoutForStepUpDown.load();
+ Date_t start{Date_t::now()};
+ if (rstlTimeout > 0 && deadline - start > Seconds(rstlTimeout)) {
+ deadline = start + Seconds(rstlTimeout); // cap deadline
+ }
- // Wait for RSTL to be acquired.
- _rstlLock->waitForLockUntil(deadline);
+ try {
+ // Enqueues RSTL in X mode.
+ _rstlLock.emplace(_opCtx, MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly());
+
+ ON_BLOCK_EXIT([&] { _stopAndWaitForKillOpThread(); });
+ _startKillOpThread();
+
+ // Wait for RSTL to be acquired.
+ _rstlLock->waitForLockUntil(deadline);
+
+ } catch (const ExceptionFor<ErrorCodes::LockTimeout>&) {
+ if (rstlTimeout > 0 && Date_t::now() - start >= Seconds(rstlTimeout)) {
+ auto lockerInfo =
+ opCtx->lockState()->getLockerInfo(CurOp::get(opCtx)->getLockStatsBase());
+ BSONObjBuilder lockRep;
+ lockerInfo->stats.report(&lockRep);
+ LOGV2_FATAL(5675600,
+ "Time out exceeded waiting for RSTL, stepUp/stepDown is not possible "
+ "thus calling abort() to allow cluster to progress.",
+ "lockRep"_attr = lockRep.obj());
+ }
+ // Rethrow to keep processing as before at a higher layer.
+ throw;
+ }
};
void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::_startKillOpThread() {
@@ -2616,7 +2638,6 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
const bool force,
const Milliseconds& waitTime,
const Milliseconds& stepdownTime) {
-
const Date_t startTime = _replExecutor->now();
const Date_t stepDownUntil = startTime + stepdownTime;
const Date_t waitUntil = startTime + waitTime;