SERVER-59226 Fix deadlock between uninterruptible profiling operation and stepdown thread.

(cherry picked from commit 43479818bd01f27ee25b6e992045529d2ac0185a)
author: Wenbin Zhu <wenbin.zhu@mongodb.com> 2021-09-03 23:06:13 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-11-01 20:27:30 +0000
commit: d1a9e68d581f8e3162bc790bdb30986b8d030d86 (patch)
tree: 51e20934ba9c53716fd7daf4ac9926c8dd31d67e
parent: aa68c654c00bf87283f34607405d7b72823a92ee (diff)
download: mongo-d1a9e68d581f8e3162bc790bdb30986b8d030d86.tar.gz
3 files changed, 47 insertions, 9 deletions
diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
index 8bda2543426..78ae238a239 100644
--- a/src/mongo/db/introspect.cpp
+++ b/src/mongo/db/introspect.cpp
@@ -50,9 +50,9 @@
 
 namespace mongo {
 
-using std::unique_ptr;
 using std::endl;
 using std::string;
+using std::unique_ptr;
 
 namespace {
 
@@ -117,20 +117,32 @@ void profile(OperationContext* opCtx, NetworkOp op) {
 
     const string dbName(nsToDatabase(CurOp::get(opCtx)->getNS()));
 
+    // True if we need to acquire an X lock on the database in order to create the system.profile
+    // collection.
+    bool acquireDbXLock = false;
+
     try {
-        // Even if the operation we are profiling was interrupted, we still want to output the
-        // profiler entry.  This lock guard will prevent lock acquisitions from throwing exceptions
-        // before we finish writing the entry. However, our maximum lock timeout overrides
-        // uninterruptibility.
-        boost::optional<UninterruptibleLockGuard> noInterrupt;
+        // Set the opCtx to be only interruptible for replication state changes. This is needed
+        // because for some operations that are already marked as killed due to errors such as
+        // operation time exceeding maxTimeMS, we still want to output the profiler entry. Thus
+        // in these cases we do not interrupt lock acquisition even though the opCtx has already
+        // been killed. In the meantime we need to make sure replication state changes can still
+        // interrupt lock acquisition, otherwise there could be deadlocks when the state change
+        // thread is waiting for the session checked out by this opCtx while holding RSTL lock.
+        // However when maxLockTimeout is set, we want it to be always interruptible.
         if (!opCtx->lockState()->hasMaxLockTimeout()) {
-            noInterrupt.emplace(opCtx->lockState());
+            opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
         }
 
-        bool acquireDbXLock = false;
+        // IX lock acquisitions beyond this block will not be related to writes to system.profile.
+        ON_BLOCK_EXIT([opCtx] { opCtx->setIgnoreInterruptsExceptForReplStateChange(false); });
+
         while (true) {
             std::unique_ptr<AutoGetDb> autoGetDb;
             if (acquireDbXLock) {
+                // We should not attempt to acquire an X lock while opCtx ignores interrupts.
+                opCtx->setIgnoreInterruptsExceptForReplStateChange(false);
+
                 autoGetDb.reset(new AutoGetDb(opCtx, dbName, MODE_X));
                 if (autoGetDb->getDb()) {
                     createProfileCollection(opCtx, autoGetDb->getDb()).transitional_ignore();
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 02a3b1226d5..3775b73a82f 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -196,6 +196,14 @@ bool opShouldFail(Client* client, const BSONObj& failPointInfo) {
 }  // namespace
 
 Status OperationContext::checkForInterruptNoAssert() {
+    const auto killStatus = getKillStatus();
+
+    if (_ignoreInterruptsExceptForReplStateChange &&
+        killStatus != ErrorCodes::InterruptedDueToReplStateChange &&
+        !_killRequestedForReplStateChange.loadRelaxed()) {
+        return Status::OK();
+    }
+
     // TODO: Remove the MONGO_likely(getClient()) once all operation contexts are constructed with
     // clients.
     if (MONGO_likely(getClient() && getServiceContext()) &&
@@ -215,7 +223,6 @@ Status OperationContext::checkForInterruptNoAssert() {
         }
     }
 
-    const auto killStatus = getKillStatus();
     if (killStatus != ErrorCodes::OK) {
         return Status(killStatus, "operation was interrupted");
     }
@@ -361,6 +368,14 @@ void OperationContext::markKilled(ErrorCodes::Error killCode) {
         });
         lkWaitMutex = stdx::unique_lock<stdx::mutex>{*_waitMutex};
     }
+
+    // Record that a kill was requested on this operationContext due to replication state change
+    // since it is possible to call markKilled() multiple times but only the first killCode will
+    // be preserved.
+    if (killCode == ErrorCodes::InterruptedDueToReplStateChange) {
+        _killRequestedForReplStateChange.store(true);
+    }
+
     _killCode.compareAndSwap(ErrorCodes::OK, killCode);
     if (lkWaitMutex && _numKillers == 0) {
         invariant(_waitCV);
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 4a9bc116da4..a2eacc1e700 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -432,6 +432,14 @@ public:
      */
     Microseconds getRemainingMaxTimeMicros() const;
 
+    /**
+     * Sets that this operation should ignore interruption except for replication state change. Can
+     * only be called by the thread executing this on behalf of this OperationContext.
+     */
+    void setIgnoreInterruptsExceptForReplStateChange(bool target) {
+        _ignoreInterruptsExceptForReplStateChange = target;
+    }
+
 private:
     /**
      * Returns true if this operation has a deadline and it has passed according to the fast clock
@@ -521,6 +529,9 @@ private:
     Timer _elapsedTime;
 
     bool _writesAreReplicated = true;
+
+    bool _ignoreInterruptsExceptForReplStateChange = false;
+    AtomicWord<bool> _killRequestedForReplStateChange{false};
 };
 
 namespace repl {
author	Wenbin Zhu <wenbin.zhu@mongodb.com>	2021-09-03 23:06:13 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-11-01 20:27:30 +0000
commit	d1a9e68d581f8e3162bc790bdb30986b8d030d86 (patch)
tree	51e20934ba9c53716fd7daf4ac9926c8dd31d67e
parent	aa68c654c00bf87283f34607405d7b72823a92ee (diff)
download	mongo-d1a9e68d581f8e3162bc790bdb30986b8d030d86.tar.gz