SERVER-76777 Retry lock coll lock acquisition on self abort

author: Yu Jin Kang Park <yujin.kang@mongodb.com> 2023-05-11 14:46:52 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-05-11 16:01:16 +0000
commit: 837ff506b122b33005d9fe451a0918b497cefeb1 (patch)
tree: 3ff0aa2027671c2005f04a35db9e370459e0f7f3
parent: 2f667bface2924113e5812be0972874bb95e6436 (diff)
download: mongo-837ff506b122b33005d9fe451a0918b497cefeb1.tar.gz
5 files changed, 268 insertions, 44 deletions
diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js
new file mode 100644
index 00000000000..d9a37f17dd6
--- /dev/null
+++ b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js
@@ -0,0 +1,47 @@
+/**
+ * Tests dropping a collection (causing an external index build abort) does not deadlock with an
+ * internal self abort for single-phase index builds.
+ */
+(function() {
+"use strict";
+
+load('jstests/noPassthrough/libs/index_build.js');
+load("jstests/libs/fail_point_util.js");
+
+// A standalone configuration is key to running the index build single-phase.
+const conn = MongoRunner.runMongod();
+
+const dbName = 'test';
+const collName = 'coll';
+const db = conn.getDB(dbName);
+const coll = db.getCollection(collName);
+
+coll.drop();
+assert.commandWorked(coll.insert({a: [0, "a"]}));
+
+// Hang after the index build has checked if the build is already aborted, but before taking
+// collection locks for cleanup.
+const hangBeforeCleanup = configureFailPoint(db, 'hangIndexBuildBeforeAbortCleanUp');
+
+const hangAfterCollDropHasLocks =
+    configureFailPoint(db, 'hangAbortIndexBuildByBuildUUIDAfterLocks');
+
+const createIdx =
+    IndexBuildTest.startIndexBuild(conn, coll.getFullName(), {a: "2d"}, null, [13026]);
+
+hangBeforeCleanup.wait();
+
+const collDrop = startParallelShell(funWithArgs(function(dbName, collName) {
+                                        db.getSiblingDB(dbName).getCollection(collName).drop();
+                                    }, dbName, collName), conn.port);
+
+hangAfterCollDropHasLocks.wait();
+hangBeforeCleanup.off();
+hangAfterCollDropHasLocks.off();
+
+jsTestLog("Waiting for collection drop shell to return");
+collDrop();
+createIdx();
+
+MongoRunner.stopMongod(conn);
+})();
diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js
new file mode 100644
index 00000000000..0c6c8e35f74
--- /dev/null
+++ b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js
@@ -0,0 +1,88 @@
+/**
+ * Tests dropping a collection (causing an external index build abort) does not deadlock with an
+ * internal self abort for two-phase index builds.
+ *
+ * @tags: [
+ *   requires_replication,
+ * ]
+ */
+(function() {
+"use strict";
+
+load('jstests/noPassthrough/libs/index_build.js');
+load("jstests/libs/fail_point_util.js");
+
+const rst = new ReplSetTest({
+    nodes: [
+        {},
+        {
+            // Disallow elections on secondary.
+            rsConfig: {
+                priority: 0,
+            },
+        },
+    ]
+});
+rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const primaryDB = primary.getDB('test');
+const primaryColl = primaryDB.getCollection('test');
+
+primaryColl.drop();
+assert.commandWorked(primaryColl.insert({a: 1}));
+
+// Pause the index builds on the secondary, using the 'hangAfterStartingIndexBuild' failpoint.
+const failpointHangAfterInit = configureFailPoint(primaryDB, "hangAfterInitializingIndexBuild");
+const hangBeforeCleanup = configureFailPoint(primaryDB, 'hangIndexBuildBeforeAbortCleanUp');
+
+// Block secondary to avoid commitQuorum being fullfilled.
+IndexBuildTest.pauseIndexBuilds(rst.getSecondary());
+
+jsTestLog("Waiting for index build to start");
+const createIdx = IndexBuildTest.startIndexBuild(
+    primary, primaryColl.getFullName(), {a: 1}, null, [ErrorCodes.OutOfDiskSpace]);
+
+const buildUUID =
+    IndexBuildTest
+        .assertIndexesSoon(primaryColl, 2, ['_id_'], ['a_1'], {includeBuildUUIDs: true})['a_1']
+        .buildUUID;
+
+const failAfterVoteForCommitReadiness =
+    configureFailPoint(primaryDB,
+                       "failIndexBuildWithErrorInSecondDrain",
+                       {buildUUID: buildUUID, error: ErrorCodes.OutOfDiskSpace});
+
+// Continue index build after preparing the artificial failure.
+failpointHangAfterInit.off();
+
+// Wait for the index build to be in clean up path.
+hangBeforeCleanup.wait();
+
+const hangAfterCollDropHasLocks =
+    configureFailPoint(primaryDB, 'hangAbortIndexBuildByBuildUUIDAfterLocks');
+
+const collDrop = startParallelShell(funWithArgs(function(dbName, collName) {
+                                        jsTestLog("Dropping collection");
+                                        db.getSiblingDB(dbName).getCollection(collName).drop();
+                                    }, primaryDB.getName(), primaryColl.getName()), primary.port);
+
+hangAfterCollDropHasLocks.wait();
+hangBeforeCleanup.off();
+hangAfterCollDropHasLocks.off();
+
+jsTestLog("Waiting for the index build to be killed");
+// "Index build: joined after abort".
+checkLog.containsJson(primary, 20655, {
+    buildUUID: function(uuid) {
+        return uuid && uuid["uuid"]["$uuid"] === extractUUIDFromObject(buildUUID);
+    }
+});
+
+jsTestLog("Waiting for collection drop shell to return");
+collDrop();
+createIdx();
+
+rst.stopSet();
+})();
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index de5004df03d..17fe7380a5d 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -94,7 +94,10 @@ MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeWaitingUntilMajorityOpTime);
 MONGO_FAIL_POINT_DEFINE(hangBeforeUnregisteringAfterCommit);
 MONGO_FAIL_POINT_DEFINE(failSetUpResumeIndexBuild);
 MONGO_FAIL_POINT_DEFINE(failIndexBuildWithError);
+MONGO_FAIL_POINT_DEFINE(failIndexBuildWithErrorInSecondDrain);
 MONGO_FAIL_POINT_DEFINE(hangInRemoveIndexBuildEntryAfterCommitOrAbort);
+MONGO_FAIL_POINT_DEFINE(hangAbortIndexBuildByBuildUUIDAfterLocks);
+
 
 IndexBuildsCoordinator::IndexBuildsSSS::IndexBuildsSSS()
     : ServerStatusSection("indexBuilds"),
@@ -1409,6 +1412,8 @@ bool IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx,
         AutoGetCollection indexBuildEntryColl(
             opCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
 
+        hangAbortIndexBuildByBuildUUIDAfterLocks.pauseWhileSet();
+
         // If we are using two-phase index builds and are no longer primary after receiving an
         // abort, we cannot replicate an abortIndexBuild oplog entry. Continue holding the RSTL to
         // check the replication state and to prevent any state transitions from happening while
@@ -2157,7 +2162,8 @@ StatusWith<std::tuple<Lock::DBLock,
                       repl::ReplicationStateTransitionLockGuard>>
 IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opCtx,
                                                            ReplIndexBuildState* replState,
-                                                           bool retry) {
+                                                           bool retry,
+                                                           bool collLockTimeout) {
 
     while (true) {
         // Skip the check for sharding's critical section check as it can only be acquired during a
@@ -2168,8 +2174,20 @@ IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opC
                                             /*.skipRSTLLock=*/true};
         Lock::DBLock dbLock{opCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions};
 
-        CollectionNamespaceOrUUIDLock collLock{
-            opCtx, {replState->dbName, replState->collectionUUID}, MODE_X};
+        Date_t collLockDeadline = Date_t::max();
+        if (collLockTimeout) {
+            collLockDeadline = Date_t::now() + Milliseconds{100};
+        }
+
+        boost::optional<CollectionNamespaceOrUUIDLock> collLock;
+        try {
+            collLock.emplace(opCtx,
+                             NamespaceStringOrUUID{replState->dbName, replState->collectionUUID},
+                             MODE_X,
+                             collLockDeadline);
+        } catch (const ExceptionFor<ErrorCodes::LockTimeout>& ex) {
+            return ex.toStatus();
+        }
 
         // If we can't acquire the RSTL within a given time period, there is an active state
         // transition and we should release our locks and try again. We would otherwise introduce a
@@ -2200,7 +2218,7 @@ IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opC
             continue;
         }
 
-        return std::make_tuple(std::move(dbLock), std::move(collLock), std::move(rstl));
+        return std::make_tuple(std::move(dbLock), std::move(collLock.value()), std::move(rstl));
     }
 }
 
@@ -2648,18 +2666,38 @@ void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterNonShutdownFailure(
     // would prevent us from taking locks. Use a new OperationContext to abort the index build.
     runOnAlternateContext(
         opCtx, "self-abort", [this, replState, status](OperationContext* abortCtx) {
-            ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(abortCtx->lockState());
-            // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions caused by
-            // taking a strong collection lock. See SERVER-42621.
-            Lock::DBLockSkipOptions lockOptions{/*.skipFlowControlTicket=*/false,
-                                                /*.skipRSTLLock=*/true};
-            Lock::DBLock dbLock(abortCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions);
-
-            const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
-            CollectionNamespaceOrUUIDLock collLock(abortCtx, dbAndUUID, MODE_X);
-            AutoGetCollection indexBuildEntryColl(
-                abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
-            _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+            // TODO (SERVER-76935): Remove collection lock timeout and abort state check loop.
+            // To avoid potential deadlocks with concurrent external aborts, which hold the
+            // collection MODE_X lock while waiting for this thread to signal its exit, the
+            // collection lock is acquired with a timeout, and retried only if the build is not
+            // already aborted (externally).
+            while (!replState->isAborted()) {
+                try {
+                    ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(
+                        abortCtx->lockState());
+                    // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions
+                    // caused by taking a strong collection lock. See SERVER-42621.
+                    Lock::DBLockSkipOptions lockOptions{/*.skipFlowControlTicket=*/false,
+                                                        /*.skipRSTLLock=*/true};
+                    Lock::DBLock dbLock(
+                        abortCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions);
+
+                    const NamespaceStringOrUUID dbAndUUID(replState->dbName,
+                                                          replState->collectionUUID);
+                    CollectionNamespaceOrUUIDLock collLock(
+                        abortCtx, dbAndUUID, MODE_X, Date_t::now() + Milliseconds{100});
+                    AutoGetCollection indexBuildEntryColl(
+                        abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
+                    _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+                } catch (const ExceptionFor<ErrorCodes::LockTimeout>&) {
+                    LOGV2(7677700,
+                          "Unable to acquire collection lock within the timeout, a concurrent "
+                          "abort might be waiting for the builder thread to exit. Rechecking if "
+                          "self abort is still required.",
+                          "buildUUID"_attr = replState->buildUUID);
+                    continue;
+                }
+            }
         });
 }
 
@@ -2707,27 +2745,47 @@ void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterNonShutdownFailure(
                 ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(
                     abortCtx->lockState());
 
-                // Take RSTL to observe and prevent replication state from changing. This is done
-                // with the release/reacquire strategy to avoid deadlock with prepared txns.
-                auto [dbLock, collLock, rstl] = std::move(
-                    _acquireExclusiveLockWithRSTLRetry(abortCtx, replState.get()).getValue());
-
-                const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
-                auto replCoord = repl::ReplicationCoordinator::get(abortCtx);
-                if (!replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) {
-                    // Index builds may not fail on secondaries. If a primary replicated
-                    // an abortIndexBuild oplog entry, then this index build would have
-                    // received an IndexBuildAborted error code.
-                    fassert(51101,
-                            status.withContext(str::stream()
-                                               << "Index build: " << replState->buildUUID
-                                               << "; Database: "
-                                               << replState->dbName.toStringForErrorMsg()));
-                }
+                // TODO (SERVER-76935): Remove collection lock timeout and abort state check loop.
+                // To avoid potential deadlocks with concurrent external aborts, which hold the
+                // collection MODE_X lock while waiting for this thread to signal its exit, the
+                // collection lock is acquired with a timeout, and retried only if the build is not
+                // already aborted (externally).
+                while (!replState->isAborted()) {
+                    // Take RSTL to observe and prevent replication state from changing. This is
+                    // done with the release/reacquire strategy to avoid deadlock with prepared
+                    // txns.
+                    auto swLocks = _acquireExclusiveLockWithRSTLRetry(
+                        abortCtx, replState.get(), /*retry=*/true, /*collLockTimeout=*/true);
+                    if (!swLocks.isOK()) {
+                        LOGV2_DEBUG(
+                            7677701,
+                            1,
+                            "Index build: lock acquisition for self-abort failed, will retry.",
+                            "buildUUD"_attr = replState->buildUUID,
+                            "error"_attr = swLocks.getStatus());
+                        continue;
+                    }
 
-                AutoGetCollection indexBuildEntryColl(
-                    abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
-                _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+                    auto [dbLock, collLock, rstl] = std::move(swLocks.getValue());
+
+                    const NamespaceStringOrUUID dbAndUUID(replState->dbName,
+                                                          replState->collectionUUID);
+                    auto replCoord = repl::ReplicationCoordinator::get(abortCtx);
+                    if (!replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) {
+                        // Index builds may not fail on secondaries. If a primary replicated
+                        // an abortIndexBuild oplog entry, then this index build would have
+                        // received an IndexBuildAborted error code.
+                        fassert(51101,
+                                status.withContext(str::stream()
+                                                   << "Index build: " << replState->buildUUID
+                                                   << "; Database: "
+                                                   << replState->dbName.toStringForErrorMsg()));
+                    }
+
+                    AutoGetCollection indexBuildEntryColl(
+                        abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
+                    _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+                }
             }
         });
 }
@@ -2791,14 +2849,25 @@ void IndexBuildsCoordinator::_runIndexBuildInner(
     // If the index build has already been cleaned-up because it encountered an error, there is no
     // work to do. If feature flag IndexBuildGracefulErrorHandling is not enabled, the most routine
     // case is for this to be due to a self-abort caused by constraint checking during the commit
-    // phase. When the flag is enabled, constraint violations cause the index build to abort
-    // immediately on primaries, and an async external abort is requested.
+    // phase. When the flag is enabled, constraint violations cause the index build to fail
+    // immediately, but is not yet set to aborted, so an external async abort will be requested
+    // later on. It is also possible the build was concurrently aborted, between the detection of
+    // the failure and this check here, in which case we exit early.
     if (replState->isAborted()) {
         if (ErrorCodes::isTenantMigrationError(replState->getAbortStatus()))
             uassertStatusOK(replState->getAbortStatus());
         uassertStatusOK(status);
     }
 
+    // TODO (SERVER-76935): Remove collection lock timeout.
+    // It is also possible for the concurrent abort to happen after the check. This is an issue as
+    // external aborters hold the collection MODE_X lock while waiting for this thread to signal the
+    // promise, but if this thread proceeds beyond this check first it will try to acquire the
+    // collection lock before signaling the promise, potentially creating a deadlock. This is worked
+    // around by adding a timeout to the collection lock in the self-abort path, and rechecking if
+    // the build was aborted externally on timeout.
+
+
     // We do not hold a collection lock here, but we are protected against the collection being
     // dropped while the index build is still registered for the collection -- until abortIndexBuild
     // is called. The collection can be renamed, but it is OK for the name to be stale just for
@@ -3134,6 +3203,16 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites(
     const IndexBuildOptions& indexBuildOptions) {
     indexBuildsSSS.drainSideWritesTablePreCommit.addAndFetch(1);
     const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
+
+    failIndexBuildWithErrorInSecondDrain.executeIf(
+        [](const BSONObj& data) {
+            uasserted(data["error"].safeNumberInt(),
+                      "failIndexBuildWithErrorInSecondDrain failpoint triggered");
+        },
+        [&](const BSONObj& data) {
+            return UUID::parse(data["buildUUID"]) == replState->buildUUID;
+        });
+
     // Perform the second drain while stopping writes on the collection.
     {
         // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions. See
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index 96e0fffd661..449155c728d 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -611,7 +611,11 @@ protected:
      * exception of the RSTL. The RSTL will be acquired last, with a timeout. On timeout, all locks
      * are released. If 'retry' is true, keeps until successful RSTL acquisition, and the returned
      * StatusWith will always be OK and contain the locks. If false, it returns with the error after
-     * a single try.
+     * a single try. If 'collLockTimeout' is set to true, collection lock acquisition is done with
+     * a timeout. The function returns LockTimeout immediately upon collection lock acquisition
+     * timeout.
+     *
+     * TODO (SERVER-76935): Remove collection timeout.
      *
      * This is intended to avoid a three-way deadlock between prepared transactions, stepdown, and
      * index build threads when trying to acquire an exclusive collection lock.
@@ -623,7 +627,8 @@ protected:
                           repl::ReplicationStateTransitionLockGuard>>
     _acquireExclusiveLockWithRSTLRetry(OperationContext* opCtx,
                                        ReplIndexBuildState* replState,
-                                       bool retry = true);
+                                       bool retry = true,
+                                       bool collLockTimeout = false);
 
 
     /**
diff --git a/src/mongo/db/repl_index_build_state.h b/src/mongo/db/repl_index_build_state.h
index 930fc7e04be..5389279ca3d 100644
--- a/src/mongo/db/repl_index_build_state.h
+++ b/src/mongo/db/repl_index_build_state.h
@@ -144,12 +144,17 @@ public:
          */
         kApplyCommitOplogEntry,
         /**
-         * Below state indicates that index build was successfully able to commit or abort. For
-         * kCommitted, the state is set immediately before it commits the index build. For
-         * kAborted, this state is set after the build is cleaned up and the abort oplog entry is
-         * replicated.
+         * Below state indicates that index build was successfully able to commit and is set
+         * immediately before it commits the index build.
          */
         kCommitted,
+        /**
+         * Below state indicates that index build was successfully able to abort. In case of self
+         * abort this state is set after the build is cleaned up and the abort oplog entry is
+         * replicated. In case of an external abort, this state is set before interrupting the
+         * builder thread, as a way of indicating that a self abort is not required. Cleanup and
+         * oplog entry replicating in this case is done after setting the state.
+         */
         kAborted,
         /**
          * Below state indicates that the index build thread has voted for an abort to the current
author	Yu Jin Kang Park <yujin.kang@mongodb.com>	2023-05-11 14:46:52 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-05-11 16:01:16 +0000
commit	837ff506b122b33005d9fe451a0918b497cefeb1 (patch)
tree	3ff0aa2027671c2005f04a35db9e370459e0f7f3
parent	2f667bface2924113e5812be0972874bb95e6436 (diff)
download	mongo-837ff506b122b33005d9fe451a0918b497cefeb1.tar.gz