From 73ab1b397b1292dd791775e9f2f73fb0b3b4699b Mon Sep 17 00:00:00 2001 From: Wenbin Zhu Date: Fri, 3 Sep 2021 23:06:13 +0000 Subject: SERVER-59226 Fix deadlock between uninterruptible profiling operation and stepdown thread. (cherry picked from commit 43479818bd01f27ee25b6e992045529d2ac0185a) --- src/mongo/db/introspect.cpp | 28 ++++++++++++++++------------ src/mongo/db/operation_context.cpp | 16 +++++++++++++++- src/mongo/db/operation_context.h | 11 +++++++++++ 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp index bc3232f31a2..f63cffbd86c 100644 --- a/src/mongo/db/introspect.cpp +++ b/src/mongo/db/introspect.cpp @@ -89,21 +89,25 @@ void profile(OperationContext* opCtx, NetworkOp op) { // replication lag. As such, they should be excluded from Flow Control. opCtx->setShouldParticipateInFlowControl(false); + // Set the opCtx to be only interruptible for replication state changes. This is needed + // because for some operations that are already marked as killed due to errors such as + // operation time exceeding maxTimeMS, we still want to output the profiler entry. Thus + // in these cases we do not interrupt lock acquisition even though the opCtx has already + // been killed. In the meantime we need to make sure replication state changes can still + // interrupt lock acquisition, otherwise there could be deadlocks when the state change + // thread is waiting for the session checked out by this opCtx while holding RSTL lock. + // However when maxLockTimeout is set, we want it to be always interruptible. + if (!opCtx->lockState()->hasMaxLockTimeout()) { + opCtx->setIgnoreInterruptsExceptForReplStateChange(true); + } + // IX lock acquisitions beyond this block will not be related to writes to system.profile. - ON_BLOCK_EXIT( - [opCtx, origFlowControl] { opCtx->setShouldParticipateInFlowControl(origFlowControl); }); + ON_BLOCK_EXIT([opCtx, origFlowControl] { + opCtx->setIgnoreInterruptsExceptForReplStateChange(false); + opCtx->setShouldParticipateInFlowControl(origFlowControl); + }); try { - - // Even if the operation we are profiling was interrupted, we still want to output the - // profiler entry. This lock guard will prevent lock acquisitions from throwing exceptions - // before we finish writing the entry. However, our maximum lock timeout overrides - // uninterruptibility. - boost::optional noInterrupt; - if (!opCtx->lockState()->hasMaxLockTimeout()) { - noInterrupt.emplace(opCtx->lockState()); - } - const auto dbProfilingNS = NamespaceString(dbName, "system.profile"); AutoGetCollection autoColl(opCtx, dbProfilingNS, MODE_IX); Database* const db = autoColl.getDb(); diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp index 8817793d618..b8cd5935669 100644 --- a/src/mongo/db/operation_context.cpp +++ b/src/mongo/db/operation_context.cpp @@ -225,6 +225,14 @@ bool opShouldFail(Client* client, const BSONObj& failPointInfo) { } // namespace Status OperationContext::checkForInterruptNoAssert() noexcept { + const auto killStatus = getKillStatus(); + + if (_ignoreInterruptsExceptForReplStateChange && + killStatus != ErrorCodes::InterruptedDueToReplStateChange && + !_killRequestedForReplStateChange.loadRelaxed()) { + return Status::OK(); + } + // TODO: Remove the MONGO_likely(hasClientAndServiceContext) once all operation contexts are // constructed with clients. const auto hasClientAndServiceContext = getClient() && getServiceContext(); @@ -257,7 +265,6 @@ Status OperationContext::checkForInterruptNoAssert() noexcept { }, [&](auto&& data) { return opShouldFail(getClient(), data); }); - const auto killStatus = getKillStatus(); if (killStatus != ErrorCodes::OK) { if (killStatus == ErrorCodes::TransactionExceededLifetimeLimitSeconds) return Status( @@ -356,6 +363,13 @@ void OperationContext::markKilled(ErrorCodes::Error killCode) { LOGV2(20883, "Interrupted operation as its client disconnected", "opId"_attr = getOpID()); } + // Record that a kill was requested on this operationContext due to replication state change + // since it is possible to call markKilled() multiple times but only the first killCode will + // be preserved. + if (killCode == ErrorCodes::InterruptedDueToReplStateChange) { + _killRequestedForReplStateChange.store(true); + } + if (auto status = ErrorCodes::OK; _killCode.compareAndSwap(&status, killCode)) { _baton->notify(); } diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h index 4eed5b9c445..a95cc673387 100644 --- a/src/mongo/db/operation_context.h +++ b/src/mongo/db/operation_context.h @@ -427,6 +427,14 @@ public: return _alwaysInterruptAtStepDownOrUp.load(); } + /** + * Sets that this operation should ignore interruption except for replication state change. Can + * only be called by the thread executing this on behalf of this OperationContext. + */ + void setIgnoreInterruptsExceptForReplStateChange(bool target) { + _ignoreInterruptsExceptForReplStateChange = target; + } + /** * Clears metadata associated with a multi-document transaction. */ @@ -620,11 +628,14 @@ private: bool _shouldParticipateInFlowControl = true; bool _inMultiDocumentTransaction = false; bool _isStartingMultiDocumentTransaction = false; + bool _ignoreInterruptsExceptForReplStateChange = false; // If true, this OpCtx will get interrupted during replica set stepUp and stepDown, regardless // of what locks it's taken. AtomicWord _alwaysInterruptAtStepDownOrUp{false}; + AtomicWord _killRequestedForReplStateChange{false}; + // If populated, this is an owned singleton BSONObj whose only field, 'comment', is a copy of // the 'comment' field from the input command object. boost::optional _comment; -- cgit v1.2.1