author     Yu Jin Kang Park <yujin.kang@mongodb.com>  2023-05-11 14:46:52 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2023-05-11 16:01:16 +0000
commit     837ff506b122b33005d9fe451a0918b497cefeb1 (patch)
tree       3ff0aa2027671c2005f04a35db9e370459e0f7f3
parent     2f667bface2924113e5812be0972874bb95e6436 (diff)
download   mongo-837ff506b122b33005d9fe451a0918b497cefeb1.tar.gz
SERVER-76777 Retry coll lock acquisition on self abort
-rw-r--r--  jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js  47
-rw-r--r--  jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js  88
-rw-r--r--  src/mongo/db/index_builds_coordinator.cpp  155
-rw-r--r--  src/mongo/db/index_builds_coordinator.h  9
-rw-r--r--  src/mongo/db/repl_index_build_state.h  13
5 files changed, 268 insertions, 44 deletions
diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js
new file mode 100644
index 00000000000..d9a37f17dd6
--- /dev/null
+++ b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js
@@ -0,0 +1,47 @@
+/**
+ * Tests dropping a collection (causing an external index build abort) does not deadlock with an
+ * internal self abort for single-phase index builds.
+ */
+(function() {
+"use strict";
+
+load('jstests/noPassthrough/libs/index_build.js');
+load("jstests/libs/fail_point_util.js");
+
+// A standalone mongod is required so that the index build runs single-phase.
+const conn = MongoRunner.runMongod();
+
+const dbName = 'test';
+const collName = 'coll';
+const db = conn.getDB(dbName);
+const coll = db.getCollection(collName);
+
+coll.drop();
+assert.commandWorked(coll.insert({a: [0, "a"]}));
+
+// Hang after the index build has checked if the build is already aborted, but before taking
+// collection locks for cleanup.
+const hangBeforeCleanup = configureFailPoint(db, 'hangIndexBuildBeforeAbortCleanUp');
+
+const hangAfterCollDropHasLocks =
+ configureFailPoint(db, 'hangAbortIndexBuildByBuildUUIDAfterLocks');
+
+const createIdx =
+ IndexBuildTest.startIndexBuild(conn, coll.getFullName(), {a: "2d"}, null, [13026]);
+
+hangBeforeCleanup.wait();
+
+const collDrop = startParallelShell(funWithArgs(function(dbName, collName) {
+ db.getSiblingDB(dbName).getCollection(collName).drop();
+ }, dbName, collName), conn.port);
+
+hangAfterCollDropHasLocks.wait();
+hangBeforeCleanup.off();
+hangAfterCollDropHasLocks.off();
+
+jsTestLog("Waiting for collection drop shell to return");
+collDrop();
+createIdx();
+
+MongoRunner.stopMongod(conn);
+})();
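
Both tests coordinate the builder thread and the dropping shell through named
failpoints. For reference, a minimal sketch (not the actual server code) of the
server-side half of that handshake, using the same FailPoint calls the
coordinator diff below adds:

    // Sketch only: declare a failpoint; a thread parks on it while enabled.
    MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeAbortCleanUp);

    void exampleAbortCleanUp() {  // hypothetical function for illustration
        // The builder waits here while the jstest keeps the failpoint on; the
        // test turns it off once the concurrent drop holds its locks.
        hangIndexBuildBeforeAbortCleanUp.pauseWhileSet();
        // ... proceed with abort cleanup ...
    }

On the shell side, configureFailPoint() enables the failpoint, and the returned
handle's wait() and off() synchronize with it, as the test above does.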
diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js
new file mode 100644
index 00000000000..0c6c8e35f74
--- /dev/null
+++ b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js
@@ -0,0 +1,88 @@
+/**
+ * Tests dropping a collection (causing an external index build abort) does not deadlock with an
+ * internal self abort for two-phase index builds.
+ *
+ * @tags: [
+ * requires_replication,
+ * ]
+ */
+(function() {
+"use strict";
+
+load('jstests/noPassthrough/libs/index_build.js');
+load("jstests/libs/fail_point_util.js");
+
+const rst = new ReplSetTest({
+ nodes: [
+ {},
+ {
+ // Disallow elections on secondary.
+ rsConfig: {
+ priority: 0,
+ },
+ },
+ ]
+});
+rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const primaryDB = primary.getDB('test');
+const primaryColl = primaryDB.getCollection('test');
+
+primaryColl.drop();
+assert.commandWorked(primaryColl.insert({a: 1}));
+
+// Hang the index build on the primary after initialization, using the 'hangAfterInitializingIndexBuild' failpoint.
+const failpointHangAfterInit = configureFailPoint(primaryDB, "hangAfterInitializingIndexBuild");
+const hangBeforeCleanup = configureFailPoint(primaryDB, 'hangIndexBuildBeforeAbortCleanUp');
+
+// Block the secondary so that commitQuorum cannot be fulfilled.
+IndexBuildTest.pauseIndexBuilds(rst.getSecondary());
+
+jsTestLog("Waiting for index build to start");
+const createIdx = IndexBuildTest.startIndexBuild(
+ primary, primaryColl.getFullName(), {a: 1}, null, [ErrorCodes.OutOfDiskSpace]);
+
+const buildUUID =
+ IndexBuildTest
+ .assertIndexesSoon(primaryColl, 2, ['_id_'], ['a_1'], {includeBuildUUIDs: true})['a_1']
+ .buildUUID;
+
+const failAfterVoteForCommitReadiness =
+ configureFailPoint(primaryDB,
+ "failIndexBuildWithErrorInSecondDrain",
+ {buildUUID: buildUUID, error: ErrorCodes.OutOfDiskSpace});
+
+// Continue index build after preparing the artificial failure.
+failpointHangAfterInit.off();
+
+// Wait for the index build to be in clean up path.
+hangBeforeCleanup.wait();
+
+const hangAfterCollDropHasLocks =
+ configureFailPoint(primaryDB, 'hangAbortIndexBuildByBuildUUIDAfterLocks');
+
+const collDrop = startParallelShell(funWithArgs(function(dbName, collName) {
+ jsTestLog("Dropping collection");
+ db.getSiblingDB(dbName).getCollection(collName).drop();
+ }, primaryDB.getName(), primaryColl.getName()), primary.port);
+
+hangAfterCollDropHasLocks.wait();
+hangBeforeCleanup.off();
+hangAfterCollDropHasLocks.off();
+
+jsTestLog("Waiting for the index build to be killed");
+// "Index build: joined after abort".
+checkLog.containsJson(primary, 20655, {
+ buildUUID: function(uuid) {
+ return uuid && uuid["uuid"]["$uuid"] === extractUUIDFromObject(buildUUID);
+ }
+});
+
+jsTestLog("Waiting for collection drop shell to return");
+collDrop();
+createIdx();
+
+rst.stopSet();
+})();
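
The deadlock these tests reproduce is between two aborters: the external abort
(the collection drop) takes the collection MODE_X lock and then waits for the
builder thread to exit, while the builder's self abort needs that same MODE_X
lock for cleanup. A schematic sketch of the two threads before this patch, with
hypothetical helper names standing in for the real calls:

    // Thread A: external abort (collection drop).
    lockCollection(MODE_X);            // acquires the collection X lock
    waitForBuilderThreadToExit();      // blocks until the builder signals

    // Thread B: index builder, self abort after a failure.
    lockCollection(MODE_X);            // blocks forever: A holds the lock and
                                       // waits for B, so neither can progress.

The coordinator changes below break this cycle by giving thread B a timed lock
acquisition and re-checking the build's abort state on every timeout.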
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index de5004df03d..17fe7380a5d 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -94,7 +94,10 @@ MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeWaitingUntilMajorityOpTime);
MONGO_FAIL_POINT_DEFINE(hangBeforeUnregisteringAfterCommit);
MONGO_FAIL_POINT_DEFINE(failSetUpResumeIndexBuild);
MONGO_FAIL_POINT_DEFINE(failIndexBuildWithError);
+MONGO_FAIL_POINT_DEFINE(failIndexBuildWithErrorInSecondDrain);
MONGO_FAIL_POINT_DEFINE(hangInRemoveIndexBuildEntryAfterCommitOrAbort);
+MONGO_FAIL_POINT_DEFINE(hangAbortIndexBuildByBuildUUIDAfterLocks);
+
IndexBuildsCoordinator::IndexBuildsSSS::IndexBuildsSSS()
: ServerStatusSection("indexBuilds"),
@@ -1409,6 +1412,8 @@ bool IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx,
AutoGetCollection indexBuildEntryColl(
opCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
+ hangAbortIndexBuildByBuildUUIDAfterLocks.pauseWhileSet();
+
// If we are using two-phase index builds and are no longer primary after receiving an
// abort, we cannot replicate an abortIndexBuild oplog entry. Continue holding the RSTL to
// check the replication state and to prevent any state transitions from happening while
@@ -2157,7 +2162,8 @@ StatusWith<std::tuple<Lock::DBLock,
repl::ReplicationStateTransitionLockGuard>>
IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opCtx,
ReplIndexBuildState* replState,
- bool retry) {
+ bool retry,
+ bool collLockTimeout) {
while (true) {
// Skip the check for sharding's critical section check as it can only be acquired during a
@@ -2168,8 +2174,20 @@ IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opC
/*.skipRSTLLock=*/true};
Lock::DBLock dbLock{opCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions};
- CollectionNamespaceOrUUIDLock collLock{
- opCtx, {replState->dbName, replState->collectionUUID}, MODE_X};
+ Date_t collLockDeadline = Date_t::max();
+ if (collLockTimeout) {
+ collLockDeadline = Date_t::now() + Milliseconds{100};
+ }
+
+ boost::optional<CollectionNamespaceOrUUIDLock> collLock;
+ try {
+ collLock.emplace(opCtx,
+ NamespaceStringOrUUID{replState->dbName, replState->collectionUUID},
+ MODE_X,
+ collLockDeadline);
+ } catch (const ExceptionFor<ErrorCodes::LockTimeout>& ex) {
+ return ex.toStatus();
+ }
// If we can't acquire the RSTL within a given time period, there is an active state
// transition and we should release our locks and try again. We would otherwise introduce a
@@ -2200,7 +2218,7 @@ IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opC
continue;
}
- return std::make_tuple(std::move(dbLock), std::move(collLock), std::move(rstl));
+ return std::make_tuple(std::move(dbLock), std::move(collLock.value()), std::move(rstl));
}
}
@@ -2648,18 +2666,38 @@ void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterNonShutdownFailure(
// would prevent us from taking locks. Use a new OperationContext to abort the index build.
runOnAlternateContext(
opCtx, "self-abort", [this, replState, status](OperationContext* abortCtx) {
- ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(abortCtx->lockState());
- // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions caused by
- // taking a strong collection lock. See SERVER-42621.
- Lock::DBLockSkipOptions lockOptions{/*.skipFlowControlTicket=*/false,
- /*.skipRSTLLock=*/true};
- Lock::DBLock dbLock(abortCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions);
-
- const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
- CollectionNamespaceOrUUIDLock collLock(abortCtx, dbAndUUID, MODE_X);
- AutoGetCollection indexBuildEntryColl(
- abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
- _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+ // TODO (SERVER-76935): Remove collection lock timeout and abort state check loop.
+ // To avoid potential deadlocks with concurrent external aborts, which hold the
+ // collection MODE_X lock while waiting for this thread to signal its exit, the
+ // collection lock is acquired with a timeout, and retried only if the build is not
+ // already aborted (externally).
+ while (!replState->isAborted()) {
+ try {
+ ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(
+ abortCtx->lockState());
+ // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions
+ // caused by taking a strong collection lock. See SERVER-42621.
+ Lock::DBLockSkipOptions lockOptions{/*.skipFlowControlTicket=*/false,
+ /*.skipRSTLLock=*/true};
+ Lock::DBLock dbLock(
+ abortCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions);
+
+ const NamespaceStringOrUUID dbAndUUID(replState->dbName,
+ replState->collectionUUID);
+ CollectionNamespaceOrUUIDLock collLock(
+ abortCtx, dbAndUUID, MODE_X, Date_t::now() + Milliseconds{100});
+ AutoGetCollection indexBuildEntryColl(
+ abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
+ _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+ } catch (const ExceptionFor<ErrorCodes::LockTimeout>&) {
+ LOGV2(7677700,
+                      "Unable to acquire collection lock within the timeout; a concurrent "
+ "abort might be waiting for the builder thread to exit. Rechecking if "
+ "self abort is still required.",
+ "buildUUID"_attr = replState->buildUUID);
+ continue;
+ }
+ }
});
}
@@ -2707,27 +2745,47 @@ void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterNonShutdownFailure(
ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(
abortCtx->lockState());
- // Take RSTL to observe and prevent replication state from changing. This is done
- // with the release/reacquire strategy to avoid deadlock with prepared txns.
- auto [dbLock, collLock, rstl] = std::move(
- _acquireExclusiveLockWithRSTLRetry(abortCtx, replState.get()).getValue());
-
- const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
- auto replCoord = repl::ReplicationCoordinator::get(abortCtx);
- if (!replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) {
- // Index builds may not fail on secondaries. If a primary replicated
- // an abortIndexBuild oplog entry, then this index build would have
- // received an IndexBuildAborted error code.
- fassert(51101,
- status.withContext(str::stream()
- << "Index build: " << replState->buildUUID
- << "; Database: "
- << replState->dbName.toStringForErrorMsg()));
- }
+ // TODO (SERVER-76935): Remove collection lock timeout and abort state check loop.
+ // To avoid potential deadlocks with concurrent external aborts, which hold the
+ // collection MODE_X lock while waiting for this thread to signal its exit, the
+ // collection lock is acquired with a timeout, and retried only if the build is not
+ // already aborted (externally).
+ while (!replState->isAborted()) {
+ // Take RSTL to observe and prevent replication state from changing. This is
+ // done with the release/reacquire strategy to avoid deadlock with prepared
+ // txns.
+ auto swLocks = _acquireExclusiveLockWithRSTLRetry(
+ abortCtx, replState.get(), /*retry=*/true, /*collLockTimeout=*/true);
+ if (!swLocks.isOK()) {
+ LOGV2_DEBUG(
+ 7677701,
+ 1,
+ "Index build: lock acquisition for self-abort failed, will retry.",
+                    "buildUUID"_attr = replState->buildUUID,
+ "error"_attr = swLocks.getStatus());
+ continue;
+ }
- AutoGetCollection indexBuildEntryColl(
- abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
- _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+ auto [dbLock, collLock, rstl] = std::move(swLocks.getValue());
+
+ const NamespaceStringOrUUID dbAndUUID(replState->dbName,
+ replState->collectionUUID);
+ auto replCoord = repl::ReplicationCoordinator::get(abortCtx);
+ if (!replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) {
+ // Index builds may not fail on secondaries. If a primary replicated
+ // an abortIndexBuild oplog entry, then this index build would have
+ // received an IndexBuildAborted error code.
+ fassert(51101,
+ status.withContext(str::stream()
+ << "Index build: " << replState->buildUUID
+ << "; Database: "
+ << replState->dbName.toStringForErrorMsg()));
+ }
+
+ AutoGetCollection indexBuildEntryColl(
+ abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX);
+ _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status);
+ }
}
});
}
@@ -2791,14 +2849,25 @@ void IndexBuildsCoordinator::_runIndexBuildInner(
// If the index build has already been cleaned-up because it encountered an error, there is no
// work to do. If feature flag IndexBuildGracefulErrorHandling is not enabled, the most routine
// case is for this to be due to a self-abort caused by constraint checking during the commit
- // phase. When the flag is enabled, constraint violations cause the index build to abort
- // immediately on primaries, and an async external abort is requested.
+ // phase. When the flag is enabled, constraint violations cause the index build to fail
+    // immediately, but the build is not yet marked as aborted, so an external async abort will be
+    // requested later on. It is also possible the build was concurrently aborted between the
+    // detection of the failure and this check, in which case we exit early.
if (replState->isAborted()) {
if (ErrorCodes::isTenantMigrationError(replState->getAbortStatus()))
uassertStatusOK(replState->getAbortStatus());
uassertStatusOK(status);
}
+ // TODO (SERVER-76935): Remove collection lock timeout.
+    // It is also possible for a concurrent abort to happen after the check above. This is an issue
+    // because external aborters hold the collection MODE_X lock while waiting for this thread to
+    // signal the promise; if this thread proceeds past the check first, it will try to acquire the
+    // collection lock before signaling the promise, potentially creating a deadlock. This is
+    // worked around by acquiring the collection lock with a timeout in the self-abort path, and
+    // rechecking whether the build was aborted externally whenever the timeout expires.
+
// We do not hold a collection lock here, but we are protected against the collection being
// dropped while the index build is still registered for the collection -- until abortIndexBuild
// is called. The collection can be renamed, but it is OK for the name to be stale just for
@@ -3134,6 +3203,16 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites(
const IndexBuildOptions& indexBuildOptions) {
indexBuildsSSS.drainSideWritesTablePreCommit.addAndFetch(1);
const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
+
+ failIndexBuildWithErrorInSecondDrain.executeIf(
+ [](const BSONObj& data) {
+ uasserted(data["error"].safeNumberInt(),
+ "failIndexBuildWithErrorInSecondDrain failpoint triggered");
+ },
+ [&](const BSONObj& data) {
+ return UUID::parse(data["buildUUID"]) == replState->buildUUID;
+ });
+
// Perform the second drain while stopping writes on the collection.
{
// Skip RSTL to avoid deadlocks with prepare conflicts and state transitions. See
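
Condensed, the new self-abort strategy in both the single-phase and two-phase
cleanup paths is the loop below. This is a simplified sketch with a
hypothetical completeSelfAbort() wrapper; the real code also reacquires the
database IX lock, and the RSTL in the two-phase path, on every iteration:

    // Sketch only: retry a timed collection X lock until either the self
    // abort completes or an external abort wins the race and marks the
    // build as aborted, in which case this thread simply exits.
    while (!replState->isAborted()) {
        try {
            CollectionNamespaceOrUUIDLock collLock(
                abortCtx, dbAndUUID, MODE_X, Date_t::now() + Milliseconds{100});
            completeSelfAbort();  // transitions the build to kAborted
        } catch (const ExceptionFor<ErrorCodes::LockTimeout>&) {
            // An external aborter may hold the lock while waiting for this
            // thread to exit; loop and re-check isAborted() instead of
            // blocking indefinitely.
        }
    }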
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index 96e0fffd661..449155c728d 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -611,7 +611,11 @@ protected:
* exception of the RSTL. The RSTL will be acquired last, with a timeout. On timeout, all locks
* are released. If 'retry' is true, keeps retrying until successful RSTL acquisition, and the returned
* StatusWith will always be OK and contain the locks. If false, it returns with the error after
- * a single try.
+ * a single try. If 'collLockTimeout' is true, the collection lock is acquired with a timeout,
+ * and the function returns a LockTimeout error immediately if that acquisition times out.
+ *
+ * TODO (SERVER-76935): Remove collection lock timeout.
*
* This is intended to avoid a three-way deadlock between prepared transactions, stepdown, and
* index build threads when trying to acquire an exclusive collection lock.
@@ -623,7 +627,8 @@ protected:
repl::ReplicationStateTransitionLockGuard>>
_acquireExclusiveLockWithRSTLRetry(OperationContext* opCtx,
ReplIndexBuildState* replState,
- bool retry = true);
+ bool retry = true,
+ bool collLockTimeout = false);
/**
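
A call-site sketch for the new parameter, mirroring the shape of the two-phase
self-abort path in the coordinator diff above:

    // Sketch only: opt into the timed collection lock and treat LockTimeout
    // as a signal to re-check the build's abort state rather than block.
    auto swLocks = _acquireExclusiveLockWithRSTLRetry(
        abortCtx, replState.get(), /*retry=*/true, /*collLockTimeout=*/true);
    if (!swLocks.isOK()) {
        // LockTimeout: an external abort may be in progress; the caller
        // re-checks replState->isAborted() and retries.
        return;
    }
    auto [dbLock, collLock, rstl] = std::move(swLocks.getValue());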
diff --git a/src/mongo/db/repl_index_build_state.h b/src/mongo/db/repl_index_build_state.h
index 930fc7e04be..5389279ca3d 100644
--- a/src/mongo/db/repl_index_build_state.h
+++ b/src/mongo/db/repl_index_build_state.h
@@ -144,12 +144,17 @@ public:
*/
kApplyCommitOplogEntry,
/**
- * Below state indicates that index build was successfully able to commit or abort. For
- * kCommitted, the state is set immediately before it commits the index build. For
- * kAborted, this state is set after the build is cleaned up and the abort oplog entry is
- * replicated.
+     * Below state indicates that the index build was successfully able to commit. It is set
+     * immediately before the index build commits.
*/
kCommitted,
+ /**
+     * Below state indicates that the index build was successfully able to abort. In case of a
+     * self abort, this state is set after the build is cleaned up and the abort oplog entry is
+     * replicated. In case of an external abort, this state is set before interrupting the
+     * builder thread, as a way of indicating that a self abort is not required. In this case,
+     * cleanup and oplog entry replication are done after setting the state.
+ */
kAborted,
/**
* Below state indicates that the index build thread has voted for an abort to the current