diff options
author | Adi Zaimi <adizaimi@yahoo.com> | 2022-01-24 19:20:35 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-24 20:44:26 +0000 |
commit | 757f44e1a1cc496f8030e00249aaf43163a85677 (patch) | |
tree | ba0e24fde45bfb565e1110f7f247bb7583bc11fb /src/mongo | |
parent | e33cee0a9be983e8bd349b8adfbaac09d55f56ac (diff) | |
download | mongo-757f44e1a1cc496f8030e00249aaf43163a85677.tar.gz |
SERVER-56756 fassert in stepup/down when RSTL timeout is above threshold
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/repl_server_parameters.idl | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 37 |
2 files changed, 40 insertions, 8 deletions
diff --git a/src/mongo/db/repl/repl_server_parameters.idl b/src/mongo/db/repl/repl_server_parameters.idl index 93e34020947..367351fe0d1 100644 --- a/src/mongo/db/repl/repl_server_parameters.idl +++ b/src/mongo/db/repl/repl_server_parameters.idl @@ -596,6 +596,17 @@ server_parameters: validator: gte: 1 + fassertOnLockTimeoutForStepUpDown: + description: >- + Time limit threshold to fassert if getting RSTL times out when executing a stepdown or stepup command. + Set to 0 to disable. + set_at: [ startup, runtime ] + cpp_vartype: AtomicWord<int> + cpp_varname: fassertOnLockTimeoutForStepUpDown + default: 15 + validator: + gte: 0 + feature_flags: featureFlagRetryableFindAndModify: diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 55b52fcf4c0..0021211d17e 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -2504,14 +2504,36 @@ ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::AutoGetRstlForStepUpSt // The state transition should never be rollback within this class. invariant(_stateTransition != ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback); - // Enqueues RSTL in X mode. - _rstlLock.emplace(_opCtx, MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly()); - - ON_BLOCK_EXIT([&] { _stopAndWaitForKillOpThread(); }); - _startKillOpThread(); + int rstlTimeout = fassertOnLockTimeoutForStepUpDown.load(); + Date_t start{Date_t::now()}; + if (rstlTimeout > 0 && deadline - start > Seconds(rstlTimeout)) { + deadline = start + Seconds(rstlTimeout); // cap deadline + } - // Wait for RSTL to be acquired. - _rstlLock->waitForLockUntil(deadline); + try { + // Enqueues RSTL in X mode. + _rstlLock.emplace(_opCtx, MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly()); + + ON_BLOCK_EXIT([&] { _stopAndWaitForKillOpThread(); }); + _startKillOpThread(); + + // Wait for RSTL to be acquired. + _rstlLock->waitForLockUntil(deadline); + + } catch (const ExceptionFor<ErrorCodes::LockTimeout>&) { + if (rstlTimeout > 0 && Date_t::now() - start >= Seconds(rstlTimeout)) { + auto lockerInfo = + opCtx->lockState()->getLockerInfo(CurOp::get(opCtx)->getLockStatsBase()); + BSONObjBuilder lockRep; + lockerInfo->stats.report(&lockRep); + LOGV2_FATAL(5675600, + "Time out exceeded waiting for RSTL, stepUp/stepDown is not possible " + "thus calling abort() to allow cluster to progress.", + "lockRep"_attr = lockRep.obj()); + } + // Rethrow to keep processing as before at a higher layer. + throw; + } }; void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::_startKillOpThread() { @@ -2616,7 +2638,6 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, const bool force, const Milliseconds& waitTime, const Milliseconds& stepdownTime) { - const Date_t startTime = _replExecutor->now(); const Date_t stepDownUntil = startTime + stepdownTime; const Date_t waitUntil = startTime + waitTime; |