SERVER-59226 Fix deadlock between uninterruptible profiling operation and stepdown thread.

(cherry picked from commit 43479818bd01f27ee25b6e992045529d2ac0185a)
author: Wenbin Zhu <wenbin.zhu@mongodb.com> 2021-09-03 23:06:13 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-21 15:56:08 +0000
commit: 46d180e4f463f96b7d5f64c73e43204fd47f5fd4 (patch)
tree: 87dbfff6f95546dc0f99dcf3d49da174ca202cb0
parent: 5500c2b6e8a0fad7e779d003feb0d3c9f92f1c77 (diff)
download: mongo-46d180e4f463f96b7d5f64c73e43204fd47f5fd4.tar.gz
3 files changed, 40 insertions, 9 deletions
diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
index a23d80ecbcc..ec7a744420b 100644
--- a/src/mongo/db/introspect.cpp
+++ b/src/mongo/db/introspect.cpp
@@ -119,20 +119,26 @@ void profile(OperationContext* opCtx, NetworkOp op) {
     bool acquireDbXLock = false;
 
     try {
-        // Even if the operation we are profiling was interrupted, we still want to output the
-        // profiler entry.  This lock guard will prevent lock acquisitions from throwing exceptions
-        // before we finish writing the entry. However, our maximum lock timeout overrides
-        // uninterruptibility.
-        boost::optional<UninterruptibleLockGuard> noInterrupt;
+        // Set the opCtx to be only interruptible for replication state changes. This is needed
+        // because for some operations that are already marked as killed due to errors such as
+        // operation time exceeding maxTimeMS, we still want to output the profiler entry. Thus
+        // in these cases we do not interrupt lock acquisition even though the opCtx has already
+        // been killed. In the meantime we need to make sure replication state changes can still
+        // interrupt lock acquisition, otherwise there could be deadlocks when the state change
+        // thread is waiting for the session checked out by this opCtx while holding RSTL lock.
+        // However when maxLockTimeout is set, we want it to be always interruptible.
         if (!opCtx->lockState()->hasMaxLockTimeout()) {
-            noInterrupt.emplace(opCtx->lockState());
+            opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
         }
 
+        // IX lock acquisitions beyond this block will not be related to writes to system.profile.
+        ON_BLOCK_EXIT([opCtx] { opCtx->setIgnoreInterruptsExceptForReplStateChange(false); });
+
         while (true) {
             std::unique_ptr<AutoGetDb> autoGetDb;
             if (acquireDbXLock) {
-                // We should not attempt to acquire an X lock while in "noInterrupt" scope.
-                noInterrupt.reset();
+                // We should not attempt to acquire an X lock while opCtx ignores interrupts.
+                opCtx->setIgnoreInterruptsExceptForReplStateChange(false);
 
                 autoGetDb.reset(new AutoGetDb(opCtx, dbName, MODE_X));
                 if (autoGetDb->getDb()) {
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 02e8b511503..9f16861e985 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -202,6 +202,14 @@ bool opShouldFail(Client* client, const BSONObj& failPointInfo) {
 }  // namespace
 
 Status OperationContext::checkForInterruptNoAssert() noexcept {
+    const auto killStatus = getKillStatus();
+
+    if (_ignoreInterruptsExceptForReplStateChange &&
+        killStatus != ErrorCodes::InterruptedDueToReplStateChange &&
+        !_killRequestedForReplStateChange.loadRelaxed()) {
+        return Status::OK();
+    }
+
     // TODO: Remove the MONGO_likely(hasClientAndServiceContext) once all operation contexts are
     // constructed with clients.
     const auto hasClientAndServiceContext = getClient() && getServiceContext();
@@ -234,7 +242,6 @@ Status OperationContext::checkForInterruptNoAssert() noexcept {
         }
     }
 
-    const auto killStatus = getKillStatus();
     if (killStatus != ErrorCodes::OK) {
         return Status(killStatus, "operation was interrupted");
     }
@@ -327,6 +334,13 @@ void OperationContext::markKilled(ErrorCodes::Error killCode) {
         log() << "operation was interrupted because a client disconnected";
     }
 
+    // Record that a kill was requested on this operationContext due to replication state change
+    // since it is possible to call markKilled() multiple times but only the first killCode will
+    // be preserved.
+    if (killCode == ErrorCodes::InterruptedDueToReplStateChange) {
+        _killRequestedForReplStateChange.store(true);
+    }
+
     if (auto status = ErrorCodes::OK; _killCode.compareAndSwap(&status, killCode)) {
         _baton->notify();
     }
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 38dae63a4c9..718c0c43af4 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -376,6 +376,14 @@ public:
     }
 
     /**
+     * Sets that this operation should ignore interruption except for replication state change. Can
+     * only be called by the thread executing this on behalf of this OperationContext.
+     */
+    void setIgnoreInterruptsExceptForReplStateChange(bool target) {
+        _ignoreInterruptsExceptForReplStateChange = target;
+    }
+
+    /**
      * Clears metadata associated with a multi-document transaction.
      */
     void resetMultiDocumentTransactionState() {
@@ -531,10 +539,13 @@ private:
     bool _writesAreReplicated = true;
     bool _shouldParticipateInFlowControl = true;
     bool _inMultiDocumentTransaction = false;
+    bool _ignoreInterruptsExceptForReplStateChange = false;
 
     // If true, this OpCtx will get interrupted during replica set stepUp and stepDown, regardless
     // of what locks it's taken.
     AtomicWord<bool> _alwaysInterruptAtStepDownOrUp{false};
+
+    AtomicWord<bool> _killRequestedForReplStateChange{false};
 };
 
 namespace repl {
author	Wenbin Zhu <wenbin.zhu@mongodb.com>	2021-09-03 23:06:13 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-10-21 15:56:08 +0000
commit	46d180e4f463f96b7d5f64c73e43204fd47f5fd4 (patch)
tree	87dbfff6f95546dc0f99dcf3d49da174ca202cb0
parent	5500c2b6e8a0fad7e779d003feb0d3c9f92f1c77 (diff)
download	mongo-46d180e4f463f96b7d5f64c73e43204fd47f5fd4.tar.gz