From 73ab1b397b1292dd791775e9f2f73fb0b3b4699b Mon Sep 17 00:00:00 2001
From: Wenbin Zhu <wenbin.zhu@mongodb.com>
Date: Fri, 3 Sep 2021 23:06:13 +0000
Subject: SERVER-59226 Fix deadlock between uninterruptible profiling operation
 and stepdown thread.

(cherry picked from commit 43479818bd01f27ee25b6e992045529d2ac0185a)
---
 src/mongo/db/introspect.cpp        | 28 ++++++++++++++++------------
 src/mongo/db/operation_context.cpp | 16 +++++++++++++++-
 src/mongo/db/operation_context.h   | 11 +++++++++++
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
index bc3232f31a2..f63cffbd86c 100644
--- a/src/mongo/db/introspect.cpp
+++ b/src/mongo/db/introspect.cpp
@@ -89,21 +89,25 @@ void profile(OperationContext* opCtx, NetworkOp op) {
     // replication lag. As such, they should be excluded from Flow Control.
     opCtx->setShouldParticipateInFlowControl(false);
 
+    // Set the opCtx to be only interruptible for replication state changes. This is needed
+    // because for some operations that are already marked as killed due to errors such as
+    // operation time exceeding maxTimeMS, we still want to output the profiler entry. Thus
+    // in these cases we do not interrupt lock acquisition even though the opCtx has already
+    // been killed. In the meantime we need to make sure replication state changes can still
+    // interrupt lock acquisition, otherwise there could be deadlocks when the state change
+    // thread is waiting for the session checked out by this opCtx while holding RSTL lock.
+    // However when maxLockTimeout is set, we want it to be always interruptible.
+    if (!opCtx->lockState()->hasMaxLockTimeout()) {
+        opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
+    }
+
     // IX lock acquisitions beyond this block will not be related to writes to system.profile.
-    ON_BLOCK_EXIT(
-        [opCtx, origFlowControl] { opCtx->setShouldParticipateInFlowControl(origFlowControl); });
+    ON_BLOCK_EXIT([opCtx, origFlowControl] {
+        opCtx->setIgnoreInterruptsExceptForReplStateChange(false);
+        opCtx->setShouldParticipateInFlowControl(origFlowControl);
+    });
 
     try {
-
-        // Even if the operation we are profiling was interrupted, we still want to output the
-        // profiler entry.  This lock guard will prevent lock acquisitions from throwing exceptions
-        // before we finish writing the entry. However, our maximum lock timeout overrides
-        // uninterruptibility.
-        boost::optional<UninterruptibleLockGuard> noInterrupt;
-        if (!opCtx->lockState()->hasMaxLockTimeout()) {
-            noInterrupt.emplace(opCtx->lockState());
-        }
-
         const auto dbProfilingNS = NamespaceString(dbName, "system.profile");
         AutoGetCollection autoColl(opCtx, dbProfilingNS, MODE_IX);
         Database* const db = autoColl.getDb();
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 8817793d618..b8cd5935669 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -225,6 +225,14 @@ bool opShouldFail(Client* client, const BSONObj& failPointInfo) {
 }  // namespace
 
 Status OperationContext::checkForInterruptNoAssert() noexcept {
+    const auto killStatus = getKillStatus();
+
+    if (_ignoreInterruptsExceptForReplStateChange &&
+        killStatus != ErrorCodes::InterruptedDueToReplStateChange &&
+        !_killRequestedForReplStateChange.loadRelaxed()) {
+        return Status::OK();
+    }
+
     // TODO: Remove the MONGO_likely(hasClientAndServiceContext) once all operation contexts are
     // constructed with clients.
     const auto hasClientAndServiceContext = getClient() && getServiceContext();
@@ -257,7 +265,6 @@ Status OperationContext::checkForInterruptNoAssert() noexcept {
         },
         [&](auto&& data) { return opShouldFail(getClient(), data); });
 
-    const auto killStatus = getKillStatus();
     if (killStatus != ErrorCodes::OK) {
         if (killStatus == ErrorCodes::TransactionExceededLifetimeLimitSeconds)
             return Status(
@@ -356,6 +363,13 @@ void OperationContext::markKilled(ErrorCodes::Error killCode) {
         LOGV2(20883, "Interrupted operation as its client disconnected", "opId"_attr = getOpID());
     }
 
+    // Record that a kill was requested on this operationContext due to replication state change
+    // since it is possible to call markKilled() multiple times but only the first killCode will
+    // be preserved.
+    if (killCode == ErrorCodes::InterruptedDueToReplStateChange) {
+        _killRequestedForReplStateChange.store(true);
+    }
+
     if (auto status = ErrorCodes::OK; _killCode.compareAndSwap(&status, killCode)) {
         _baton->notify();
     }
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 4eed5b9c445..a95cc673387 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -427,6 +427,14 @@ public:
         return _alwaysInterruptAtStepDownOrUp.load();
     }
 
+    /**
+     * Sets that this operation should ignore interruption except for replication state change. Can
+     * only be called by the thread executing this on behalf of this OperationContext.
+     */
+    void setIgnoreInterruptsExceptForReplStateChange(bool target) {
+        _ignoreInterruptsExceptForReplStateChange = target;
+    }
+
     /**
      * Clears metadata associated with a multi-document transaction.
      */
@@ -620,11 +628,14 @@ private:
     bool _shouldParticipateInFlowControl = true;
     bool _inMultiDocumentTransaction = false;
     bool _isStartingMultiDocumentTransaction = false;
+    bool _ignoreInterruptsExceptForReplStateChange = false;
 
     // If true, this OpCtx will get interrupted during replica set stepUp and stepDown, regardless
     // of what locks it's taken.
     AtomicWord<bool> _alwaysInterruptAtStepDownOrUp{false};
 
+    AtomicWord<bool> _killRequestedForReplStateChange{false};
+
     // If populated, this is an owned singleton BSONObj whose only field, 'comment', is a copy of
     // the 'comment' field from the input command object.
     boost::optional<BSONObj> _comment;
-- 
cgit v1.2.1