summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDianna Hohensee <dianna.hohensee@mongodb.com>2022-09-02 03:32:06 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-02 04:47:05 +0000
commit97cdfef9e4f74918176f1916bc53ada8f9f66868 (patch)
tree641eb38b0c182d564474a80c0cb9eac0e7b47e69
parent8de369efe3fb47e15000f6a8af722b53911392e6 (diff)
downloadmongo-97cdfef9e4f74918176f1916bc53ada8f9f66868.tar.gz
SERVER-69059 Create an InterruptibleLockGuard to place in ClientCursorPin to ensure read operations are interruptible
-rw-r--r--jstests/replsets/initial_sync_during_stepdown.js4
-rw-r--r--src/mongo/db/catalog/rename_collection.cpp4
-rw-r--r--src/mongo/db/clientcursor.cpp8
-rw-r--r--src/mongo/db/clientcursor.h14
-rw-r--r--src/mongo/db/commands/getmore_cmd.cpp39
-rw-r--r--src/mongo/db/concurrency/locker.h41
-rw-r--r--src/mongo/db/exhaust_cursor_currentop_integration_test.cpp2
7 files changed, 67 insertions, 45 deletions
diff --git a/jstests/replsets/initial_sync_during_stepdown.js b/jstests/replsets/initial_sync_during_stepdown.js
index 342d57d141d..fc3c00564e7 100644
--- a/jstests/replsets/initial_sync_during_stepdown.js
+++ b/jstests/replsets/initial_sync_during_stepdown.js
@@ -48,7 +48,7 @@ function setupTest({
jsTestLog("Enabling failpoint '" + failPoint + "' on primary (sync source).");
assert.commandWorked(primary.adminCommand({
configureFailPoint: failPoint,
- data: {nss: nss + nssSuffix, shouldCheckForInterrupt: true, shouldNotdropLock: true},
+ data: {nss: nss + nssSuffix, shouldCheckForInterrupt: true},
mode: "alwaysOn"
}));
@@ -164,7 +164,7 @@ assert.commandWorked(primaryColl.insert([{_id: 3}, {_id: 4}]));
// such that it doesn't drop locks when getmore cmd waits inside the fail point block.
assert.commandWorked(primary.adminCommand({
configureFailPoint: "waitWithPinnedCursorDuringGetMoreBatch",
- data: {nss: oplogNss, shouldCheckForInterrupt: true, shouldNotdropLock: true},
+ data: {nss: oplogNss, shouldCheckForInterrupt: true},
mode: "alwaysOn"
}));
diff --git a/src/mongo/db/catalog/rename_collection.cpp b/src/mongo/db/catalog/rename_collection.cpp
index 0f4a7fcea93..8d7e315cb25 100644
--- a/src/mongo/db/catalog/rename_collection.cpp
+++ b/src/mongo/db/catalog/rename_collection.cpp
@@ -206,10 +206,6 @@ Status renameCollectionDirectly(OperationContext* opCtx,
return status;
}
- // Rename is not resilient to interruption when the onRenameCollection OpObserver
- // takes an oplog collection lock.
- UninterruptibleLockGuard noInterrupt(opCtx->lockState());
-
// We have to override the provided 'dropTarget' setting for idempotency reasons to
// avoid unintentionally removing a collection on a secondary with the same name as
// the target.
diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp
index 860e5027fc1..2a471eb1288 100644
--- a/src/mongo/db/clientcursor.cpp
+++ b/src/mongo/db/clientcursor.cpp
@@ -164,7 +164,10 @@ GenericCursor ClientCursor::toGenericCursor() const {
ClientCursorPin::ClientCursorPin(OperationContext* opCtx,
ClientCursor* cursor,
CursorManager* cursorManager)
- : _opCtx(opCtx), _cursor(cursor), _cursorManager(cursorManager) {
+ : _opCtx(opCtx),
+ _cursor(cursor),
+ _cursorManager(cursorManager),
+ _interruptibleLockGuard(std::make_unique<InterruptibleLockGuard>(opCtx->lockState())) {
invariant(_cursor);
invariant(_cursor->_operationUsingCursor);
invariant(!_cursor->_disposed);
@@ -181,6 +184,7 @@ ClientCursorPin::ClientCursorPin(ClientCursorPin&& other)
: _opCtx(other._opCtx),
_cursor(other._cursor),
_cursorManager(other._cursorManager),
+ _interruptibleLockGuard(std::move(other._interruptibleLockGuard)),
_shouldSaveRecoveryUnit(other._shouldSaveRecoveryUnit) {
// The pinned cursor is being transferred to us from another pin. The 'other' pin must have a
// pinned cursor.
@@ -217,6 +221,8 @@ ClientCursorPin& ClientCursorPin::operator=(ClientCursorPin&& other) {
_cursorManager = other._cursorManager;
other._cursorManager = nullptr;
+ _interruptibleLockGuard = std::move(other._interruptibleLockGuard);
+
_shouldSaveRecoveryUnit = other._shouldSaveRecoveryUnit;
other._shouldSaveRecoveryUnit = false;
diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h
index 121566cbdb8..89239e6230d 100644
--- a/src/mongo/db/clientcursor.h
+++ b/src/mongo/db/clientcursor.h
@@ -541,6 +541,20 @@ private:
OperationContext* _opCtx = nullptr;
ClientCursor* _cursor = nullptr;
CursorManager* _cursorManager = nullptr;
+
+ // A pinned cursor takes ownership of storage resources (storage-level cursors owned by the
+ // PlanExecutor) without lock-manager locks. Such an operation must ensure interruptibility when
+ // later acquiring a lock in order to avoid deadlocking with replication rollback at the storage
+ // engine level. Rollback signals interrupt to active readers, acquires a global X lock and then
+ // waits for all storage cursors to be closed in order to proceed; while a pinned cursor
+ // operation holds storage-level cursors and then may try to acquire a lock.
+ //
+ // An operation holding a pinned cursor must never have an UninterruptibleLockGuard on the stack
+ // that causes lock acquisition to hang without checking for interrupt. This
+ // InterruptibleLockGuard ensures that operations holding a ClientCursorPin will eventually
+ // observe and obey interrupt signals in the locking layer.
+ std::unique_ptr<InterruptibleLockGuard> _interruptibleLockGuard;
+
bool _shouldSaveRecoveryUnit = false;
};
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 532d5996631..ec99c214b1f 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -533,34 +533,12 @@ public:
opCtx, nss, true));
}
- // If the 'waitAfterPinningCursorBeforeGetMoreBatch' fail point is enabled, set the
- // 'msg' field of this operation's CurOp to signal that we've hit this point and then
- // repeatedly release and re-acquire the collection readLock at regular intervals until
- // the failpoint is released. This is done in order to avoid deadlocks caused by the
- // pinned-cursor failpoints in this file (see SERVER-21997).
- std::function<void()> dropAndReacquireReadLockIfLocked =
- [&readLock, opCtx, &cursorPin]() {
- if (!readLock) {
- // This function is a no-op if 'readLock' is not held in the first place.
- return;
- }
-
- // Make sure an interrupted operation does not prevent us from reacquiring the
- // lock.
- UninterruptibleLockGuard noInterrupt(opCtx->lockState());
- readLock.reset();
- readLock.emplace(opCtx,
- cursorPin->getExecutor()->nss(),
- AutoGetCollectionViewMode::kViewsForbidden,
- Date_t::max(),
- cursorPin->getExecutor()->getSecondaryNamespaces());
- };
if (MONGO_unlikely(waitAfterPinningCursorBeforeGetMoreBatch.shouldFail())) {
CurOpFailpointHelpers::waitWhileFailPointEnabled(
&waitAfterPinningCursorBeforeGetMoreBatch,
opCtx,
"waitAfterPinningCursorBeforeGetMoreBatch",
- dropAndReacquireReadLockIfLocked,
+ []() {}, /*empty function*/
nss);
}
@@ -641,25 +619,12 @@ public:
awaitDataState(opCtx).shouldWaitForInserts = true;
}
- // We're about to begin running the PlanExecutor in order to fill the getMore batch. If
- // the 'waitWithPinnedCursorDuringGetMoreBatch' failpoint is active, set the 'msg' field
- // of this operation's CurOp to signal that we've hit this point and then spin until the
- // failpoint is released.
- std::function<void()> saveAndRestoreStateWithReadLockReacquisition =
- [exec, dropAndReacquireReadLockIfLocked, &readLock]() {
- exec->saveState();
- dropAndReacquireReadLockIfLocked();
- exec->restoreState(readLock ? &readLock->getCollection() : nullptr);
- };
-
waitWithPinnedCursorDuringGetMoreBatch.execute([&](const BSONObj& data) {
CurOpFailpointHelpers::waitWhileFailPointEnabled(
&waitWithPinnedCursorDuringGetMoreBatch,
opCtx,
"waitWithPinnedCursorDuringGetMoreBatch",
- data["shouldNotdropLock"].booleanSafe()
- ? []() {} /*empty function*/
- : saveAndRestoreStateWithReadLockReacquisition,
+ []() {}, /*empty function*/
nss);
});
diff --git a/src/mongo/db/concurrency/locker.h b/src/mongo/db/concurrency/locker.h
index fdb7d816da3..bd7a80fe19e 100644
--- a/src/mongo/db/concurrency/locker.h
+++ b/src/mongo/db/concurrency/locker.h
@@ -52,6 +52,7 @@ class Locker {
Locker& operator=(const Locker&) = delete;
friend class UninterruptibleLockGuard;
+ friend class InterruptibleLockGuard;
public:
virtual ~Locker() {}
@@ -567,6 +568,13 @@ protected:
int _uninterruptibleLocksRequested = 0;
/**
+ * The number of callers that are guarding against uninterruptible lock requests. An int,
+ * instead of a boolean, to support multiple simultaneous requests. When > 0, ensures that
+ * _uninterruptibleLocksRequested above is _not_ used.
+ */
+ int _keepInterruptibleRequests = 0;
+
+ /**
* The number of LockRequests to unlock at the end of this WUOW. This is used for locks
* participating in two-phase locking.
*/
@@ -601,6 +609,7 @@ public:
*/
explicit UninterruptibleLockGuard(Locker* locker) : _locker(locker) {
invariant(_locker);
+ invariant(_locker->_keepInterruptibleRequests == 0);
invariant(_locker->_uninterruptibleLocksRequested >= 0);
invariant(_locker->_uninterruptibleLocksRequested < std::numeric_limits<int>::max());
_locker->_uninterruptibleLocksRequested += 1;
@@ -616,6 +625,38 @@ private:
};
/**
+ * This RAII type ensures that there are no uninterruptible lock acquisitions while in scope. If an
+ * UninterruptibleLockGuard is held at a higher level, or taken at a lower level, an invariant
+ * failure will occur. This protects against UninterruptibleLockGuard uses on code paths that must
+ * be interruptible. Safe to nest InterruptibleLockGuard instances.
+ */
+class InterruptibleLockGuard {
+ InterruptibleLockGuard(const InterruptibleLockGuard& other) = delete;
+ InterruptibleLockGuard(InterruptibleLockGuard&& other) = delete;
+
+public:
+ /*
+ * Accepts a Locker, and increments the Locker's _keepInterruptibleRequests counter. Decrements
+ * the counter when destroyed.
+ */
+ explicit InterruptibleLockGuard(Locker* locker) : _locker(locker) {
+ invariant(_locker);
+ invariant(_locker->_uninterruptibleLocksRequested == 0);
+ invariant(_locker->_keepInterruptibleRequests >= 0);
+ invariant(_locker->_keepInterruptibleRequests < std::numeric_limits<int>::max());
+ _locker->_keepInterruptibleRequests += 1;
+ }
+
+ ~InterruptibleLockGuard() {
+ invariant(_locker->_keepInterruptibleRequests > 0);
+ _locker->_keepInterruptibleRequests -= 1;
+ }
+
+private:
+ Locker* const _locker;
+};
+
+/**
* RAII-style class to opt out of replication's use of the ParallelBatchWriterMode lock.
*/
class ShouldNotConflictWithSecondaryBatchApplicationBlock {
diff --git a/src/mongo/db/exhaust_cursor_currentop_integration_test.cpp b/src/mongo/db/exhaust_cursor_currentop_integration_test.cpp
index ca248706cf8..3bb0217b72b 100644
--- a/src/mongo/db/exhaust_cursor_currentop_integration_test.cpp
+++ b/src/mongo/db/exhaust_cursor_currentop_integration_test.cpp
@@ -76,7 +76,7 @@ void setWaitWithPinnedCursorDuringGetMoreBatchFailpoint(DBClientBase* conn, bool
auto cmdObj = BSON("configureFailPoint"
<< "waitWithPinnedCursorDuringGetMoreBatch"
<< "mode" << (enable ? "alwaysOn" : "off") << "data"
- << BSON("shouldNotdropLock" << true << "shouldContinueOnInterrupt" << true));
+ << BSON("shouldContinueOnInterrupt" << true));
auto reply = conn->runCommand(OpMsgRequest::fromDBAndBody("admin", cmdObj));
ASSERT_OK(getStatusFromCommandResult(reply->getCommandReply()));
}