diff options
author | Yu Jin Kang Park <yujin.kang@mongodb.com> | 2023-05-11 14:46:52 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-05-11 16:01:16 +0000 |
commit | 837ff506b122b33005d9fe451a0918b497cefeb1 (patch) | |
tree | 3ff0aa2027671c2005f04a35db9e370459e0f7f3 | |
parent | 2f667bface2924113e5812be0972874bb95e6436 (diff) | |
download | mongo-837ff506b122b33005d9fe451a0918b497cefeb1.tar.gz |
SERVER-76777 Retry lock coll lock acquisition on self abort
5 files changed, 268 insertions, 44 deletions
diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js new file mode 100644 index 00000000000..d9a37f17dd6 --- /dev/null +++ b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_single_phase.js @@ -0,0 +1,47 @@ +/** + * Tests dropping a collection (causing an external index build abort) does not deadlock with an + * internal self abort for single-phase index builds. + */ +(function() { +"use strict"; + +load('jstests/noPassthrough/libs/index_build.js'); +load("jstests/libs/fail_point_util.js"); + +// A standalone configuration is key to running the index build single-phase. +const conn = MongoRunner.runMongod(); + +const dbName = 'test'; +const collName = 'coll'; +const db = conn.getDB(dbName); +const coll = db.getCollection(collName); + +coll.drop(); +assert.commandWorked(coll.insert({a: [0, "a"]})); + +// Hang after the index build has checked if the build is already aborted, but before taking +// collection locks for cleanup. +const hangBeforeCleanup = configureFailPoint(db, 'hangIndexBuildBeforeAbortCleanUp'); + +const hangAfterCollDropHasLocks = + configureFailPoint(db, 'hangAbortIndexBuildByBuildUUIDAfterLocks'); + +const createIdx = + IndexBuildTest.startIndexBuild(conn, coll.getFullName(), {a: "2d"}, null, [13026]); + +hangBeforeCleanup.wait(); + +const collDrop = startParallelShell(funWithArgs(function(dbName, collName) { + db.getSiblingDB(dbName).getCollection(collName).drop(); + }, dbName, collName), conn.port); + +hangAfterCollDropHasLocks.wait(); +hangBeforeCleanup.off(); +hangAfterCollDropHasLocks.off(); + +jsTestLog("Waiting for collection drop shell to return"); +collDrop(); +createIdx(); + +MongoRunner.stopMongod(conn); +})(); diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js new file mode 100644 index 00000000000..0c6c8e35f74 --- /dev/null +++ b/jstests/noPassthrough/index_build_external_and_internal_abort_do_not_deadlock_two_phase.js @@ -0,0 +1,88 @@ +/** + * Tests dropping a collection (causing an external index build abort) does not deadlock with an + * internal self abort for two-phase index builds. + * + * @tags: [ + * requires_replication, + * ] + */ +(function() { +"use strict"; + +load('jstests/noPassthrough/libs/index_build.js'); +load("jstests/libs/fail_point_util.js"); + +const rst = new ReplSetTest({ + nodes: [ + {}, + { + // Disallow elections on secondary. + rsConfig: { + priority: 0, + }, + }, + ] +}); +rst.startSet(); +rst.initiate(); + +const primary = rst.getPrimary(); +const primaryDB = primary.getDB('test'); +const primaryColl = primaryDB.getCollection('test'); + +primaryColl.drop(); +assert.commandWorked(primaryColl.insert({a: 1})); + +// Pause the index builds on the secondary, using the 'hangAfterStartingIndexBuild' failpoint. +const failpointHangAfterInit = configureFailPoint(primaryDB, "hangAfterInitializingIndexBuild"); +const hangBeforeCleanup = configureFailPoint(primaryDB, 'hangIndexBuildBeforeAbortCleanUp'); + +// Block secondary to avoid commitQuorum being fullfilled. +IndexBuildTest.pauseIndexBuilds(rst.getSecondary()); + +jsTestLog("Waiting for index build to start"); +const createIdx = IndexBuildTest.startIndexBuild( + primary, primaryColl.getFullName(), {a: 1}, null, [ErrorCodes.OutOfDiskSpace]); + +const buildUUID = + IndexBuildTest + .assertIndexesSoon(primaryColl, 2, ['_id_'], ['a_1'], {includeBuildUUIDs: true})['a_1'] + .buildUUID; + +const failAfterVoteForCommitReadiness = + configureFailPoint(primaryDB, + "failIndexBuildWithErrorInSecondDrain", + {buildUUID: buildUUID, error: ErrorCodes.OutOfDiskSpace}); + +// Continue index build after preparing the artificial failure. +failpointHangAfterInit.off(); + +// Wait for the index build to be in clean up path. +hangBeforeCleanup.wait(); + +const hangAfterCollDropHasLocks = + configureFailPoint(primaryDB, 'hangAbortIndexBuildByBuildUUIDAfterLocks'); + +const collDrop = startParallelShell(funWithArgs(function(dbName, collName) { + jsTestLog("Dropping collection"); + db.getSiblingDB(dbName).getCollection(collName).drop(); + }, primaryDB.getName(), primaryColl.getName()), primary.port); + +hangAfterCollDropHasLocks.wait(); +hangBeforeCleanup.off(); +hangAfterCollDropHasLocks.off(); + +jsTestLog("Waiting for the index build to be killed"); +// "Index build: joined after abort". +checkLog.containsJson(primary, 20655, { + buildUUID: function(uuid) { + return uuid && uuid["uuid"]["$uuid"] === extractUUIDFromObject(buildUUID); + } +}); + +jsTestLog("Waiting for collection drop shell to return"); +collDrop(); +createIdx(); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index de5004df03d..17fe7380a5d 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -94,7 +94,10 @@ MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeWaitingUntilMajorityOpTime); MONGO_FAIL_POINT_DEFINE(hangBeforeUnregisteringAfterCommit); MONGO_FAIL_POINT_DEFINE(failSetUpResumeIndexBuild); MONGO_FAIL_POINT_DEFINE(failIndexBuildWithError); +MONGO_FAIL_POINT_DEFINE(failIndexBuildWithErrorInSecondDrain); MONGO_FAIL_POINT_DEFINE(hangInRemoveIndexBuildEntryAfterCommitOrAbort); +MONGO_FAIL_POINT_DEFINE(hangAbortIndexBuildByBuildUUIDAfterLocks); + IndexBuildsCoordinator::IndexBuildsSSS::IndexBuildsSSS() : ServerStatusSection("indexBuilds"), @@ -1409,6 +1412,8 @@ bool IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx, AutoGetCollection indexBuildEntryColl( opCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX); + hangAbortIndexBuildByBuildUUIDAfterLocks.pauseWhileSet(); + // If we are using two-phase index builds and are no longer primary after receiving an // abort, we cannot replicate an abortIndexBuild oplog entry. Continue holding the RSTL to // check the replication state and to prevent any state transitions from happening while @@ -2157,7 +2162,8 @@ StatusWith<std::tuple<Lock::DBLock, repl::ReplicationStateTransitionLockGuard>> IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opCtx, ReplIndexBuildState* replState, - bool retry) { + bool retry, + bool collLockTimeout) { while (true) { // Skip the check for sharding's critical section check as it can only be acquired during a @@ -2168,8 +2174,20 @@ IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opC /*.skipRSTLLock=*/true}; Lock::DBLock dbLock{opCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions}; - CollectionNamespaceOrUUIDLock collLock{ - opCtx, {replState->dbName, replState->collectionUUID}, MODE_X}; + Date_t collLockDeadline = Date_t::max(); + if (collLockTimeout) { + collLockDeadline = Date_t::now() + Milliseconds{100}; + } + + boost::optional<CollectionNamespaceOrUUIDLock> collLock; + try { + collLock.emplace(opCtx, + NamespaceStringOrUUID{replState->dbName, replState->collectionUUID}, + MODE_X, + collLockDeadline); + } catch (const ExceptionFor<ErrorCodes::LockTimeout>& ex) { + return ex.toStatus(); + } // If we can't acquire the RSTL within a given time period, there is an active state // transition and we should release our locks and try again. We would otherwise introduce a @@ -2200,7 +2218,7 @@ IndexBuildsCoordinator::_acquireExclusiveLockWithRSTLRetry(OperationContext* opC continue; } - return std::make_tuple(std::move(dbLock), std::move(collLock), std::move(rstl)); + return std::make_tuple(std::move(dbLock), std::move(collLock.value()), std::move(rstl)); } } @@ -2648,18 +2666,38 @@ void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterNonShutdownFailure( // would prevent us from taking locks. Use a new OperationContext to abort the index build. runOnAlternateContext( opCtx, "self-abort", [this, replState, status](OperationContext* abortCtx) { - ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(abortCtx->lockState()); - // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions caused by - // taking a strong collection lock. See SERVER-42621. - Lock::DBLockSkipOptions lockOptions{/*.skipFlowControlTicket=*/false, - /*.skipRSTLLock=*/true}; - Lock::DBLock dbLock(abortCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions); - - const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); - CollectionNamespaceOrUUIDLock collLock(abortCtx, dbAndUUID, MODE_X); - AutoGetCollection indexBuildEntryColl( - abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX); - _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status); + // TODO (SERVER-76935): Remove collection lock timeout and abort state check loop. + // To avoid potential deadlocks with concurrent external aborts, which hold the + // collection MODE_X lock while waiting for this thread to signal its exit, the + // collection lock is acquired with a timeout, and retried only if the build is not + // already aborted (externally). + while (!replState->isAborted()) { + try { + ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict( + abortCtx->lockState()); + // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions + // caused by taking a strong collection lock. See SERVER-42621. + Lock::DBLockSkipOptions lockOptions{/*.skipFlowControlTicket=*/false, + /*.skipRSTLLock=*/true}; + Lock::DBLock dbLock( + abortCtx, replState->dbName, MODE_IX, Date_t::max(), lockOptions); + + const NamespaceStringOrUUID dbAndUUID(replState->dbName, + replState->collectionUUID); + CollectionNamespaceOrUUIDLock collLock( + abortCtx, dbAndUUID, MODE_X, Date_t::now() + Milliseconds{100}); + AutoGetCollection indexBuildEntryColl( + abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX); + _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status); + } catch (const ExceptionFor<ErrorCodes::LockTimeout>&) { + LOGV2(7677700, + "Unable to acquire collection lock within the timeout, a concurrent " + "abort might be waiting for the builder thread to exit. Rechecking if " + "self abort is still required.", + "buildUUID"_attr = replState->buildUUID); + continue; + } + } }); } @@ -2707,27 +2745,47 @@ void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterNonShutdownFailure( ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict( abortCtx->lockState()); - // Take RSTL to observe and prevent replication state from changing. This is done - // with the release/reacquire strategy to avoid deadlock with prepared txns. - auto [dbLock, collLock, rstl] = std::move( - _acquireExclusiveLockWithRSTLRetry(abortCtx, replState.get()).getValue()); - - const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); - auto replCoord = repl::ReplicationCoordinator::get(abortCtx); - if (!replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) { - // Index builds may not fail on secondaries. If a primary replicated - // an abortIndexBuild oplog entry, then this index build would have - // received an IndexBuildAborted error code. - fassert(51101, - status.withContext(str::stream() - << "Index build: " << replState->buildUUID - << "; Database: " - << replState->dbName.toStringForErrorMsg())); - } + // TODO (SERVER-76935): Remove collection lock timeout and abort state check loop. + // To avoid potential deadlocks with concurrent external aborts, which hold the + // collection MODE_X lock while waiting for this thread to signal its exit, the + // collection lock is acquired with a timeout, and retried only if the build is not + // already aborted (externally). + while (!replState->isAborted()) { + // Take RSTL to observe and prevent replication state from changing. This is + // done with the release/reacquire strategy to avoid deadlock with prepared + // txns. + auto swLocks = _acquireExclusiveLockWithRSTLRetry( + abortCtx, replState.get(), /*retry=*/true, /*collLockTimeout=*/true); + if (!swLocks.isOK()) { + LOGV2_DEBUG( + 7677701, + 1, + "Index build: lock acquisition for self-abort failed, will retry.", + "buildUUD"_attr = replState->buildUUID, + "error"_attr = swLocks.getStatus()); + continue; + } - AutoGetCollection indexBuildEntryColl( - abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX); - _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status); + auto [dbLock, collLock, rstl] = std::move(swLocks.getValue()); + + const NamespaceStringOrUUID dbAndUUID(replState->dbName, + replState->collectionUUID); + auto replCoord = repl::ReplicationCoordinator::get(abortCtx); + if (!replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) { + // Index builds may not fail on secondaries. If a primary replicated + // an abortIndexBuild oplog entry, then this index build would have + // received an IndexBuildAborted error code. + fassert(51101, + status.withContext(str::stream() + << "Index build: " << replState->buildUUID + << "; Database: " + << replState->dbName.toStringForErrorMsg())); + } + + AutoGetCollection indexBuildEntryColl( + abortCtx, NamespaceString::kIndexBuildEntryNamespace, MODE_IX); + _completeSelfAbort(abortCtx, replState, *indexBuildEntryColl, status); + } } }); } @@ -2791,14 +2849,25 @@ void IndexBuildsCoordinator::_runIndexBuildInner( // If the index build has already been cleaned-up because it encountered an error, there is no // work to do. If feature flag IndexBuildGracefulErrorHandling is not enabled, the most routine // case is for this to be due to a self-abort caused by constraint checking during the commit - // phase. When the flag is enabled, constraint violations cause the index build to abort - // immediately on primaries, and an async external abort is requested. + // phase. When the flag is enabled, constraint violations cause the index build to fail + // immediately, but is not yet set to aborted, so an external async abort will be requested + // later on. It is also possible the build was concurrently aborted, between the detection of + // the failure and this check here, in which case we exit early. if (replState->isAborted()) { if (ErrorCodes::isTenantMigrationError(replState->getAbortStatus())) uassertStatusOK(replState->getAbortStatus()); uassertStatusOK(status); } + // TODO (SERVER-76935): Remove collection lock timeout. + // It is also possible for the concurrent abort to happen after the check. This is an issue as + // external aborters hold the collection MODE_X lock while waiting for this thread to signal the + // promise, but if this thread proceeds beyond this check first it will try to acquire the + // collection lock before signaling the promise, potentially creating a deadlock. This is worked + // around by adding a timeout to the collection lock in the self-abort path, and rechecking if + // the build was aborted externally on timeout. + + // We do not hold a collection lock here, but we are protected against the collection being // dropped while the index build is still registered for the collection -- until abortIndexBuild // is called. The collection can be renamed, but it is OK for the name to be stale just for @@ -3134,6 +3203,16 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites( const IndexBuildOptions& indexBuildOptions) { indexBuildsSSS.drainSideWritesTablePreCommit.addAndFetch(1); const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); + + failIndexBuildWithErrorInSecondDrain.executeIf( + [](const BSONObj& data) { + uasserted(data["error"].safeNumberInt(), + "failIndexBuildWithErrorInSecondDrain failpoint triggered"); + }, + [&](const BSONObj& data) { + return UUID::parse(data["buildUUID"]) == replState->buildUUID; + }); + // Perform the second drain while stopping writes on the collection. { // Skip RSTL to avoid deadlocks with prepare conflicts and state transitions. See diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h index 96e0fffd661..449155c728d 100644 --- a/src/mongo/db/index_builds_coordinator.h +++ b/src/mongo/db/index_builds_coordinator.h @@ -611,7 +611,11 @@ protected: * exception of the RSTL. The RSTL will be acquired last, with a timeout. On timeout, all locks * are released. If 'retry' is true, keeps until successful RSTL acquisition, and the returned * StatusWith will always be OK and contain the locks. If false, it returns with the error after - * a single try. + * a single try. If 'collLockTimeout' is set to true, collection lock acquisition is done with + * a timeout. The function returns LockTimeout immediately upon collection lock acquisition + * timeout. + * + * TODO (SERVER-76935): Remove collection timeout. * * This is intended to avoid a three-way deadlock between prepared transactions, stepdown, and * index build threads when trying to acquire an exclusive collection lock. @@ -623,7 +627,8 @@ protected: repl::ReplicationStateTransitionLockGuard>> _acquireExclusiveLockWithRSTLRetry(OperationContext* opCtx, ReplIndexBuildState* replState, - bool retry = true); + bool retry = true, + bool collLockTimeout = false); /** diff --git a/src/mongo/db/repl_index_build_state.h b/src/mongo/db/repl_index_build_state.h index 930fc7e04be..5389279ca3d 100644 --- a/src/mongo/db/repl_index_build_state.h +++ b/src/mongo/db/repl_index_build_state.h @@ -144,12 +144,17 @@ public: */ kApplyCommitOplogEntry, /** - * Below state indicates that index build was successfully able to commit or abort. For - * kCommitted, the state is set immediately before it commits the index build. For - * kAborted, this state is set after the build is cleaned up and the abort oplog entry is - * replicated. + * Below state indicates that index build was successfully able to commit and is set + * immediately before it commits the index build. */ kCommitted, + /** + * Below state indicates that index build was successfully able to abort. In case of self + * abort this state is set after the build is cleaned up and the abort oplog entry is + * replicated. In case of an external abort, this state is set before interrupting the + * builder thread, as a way of indicating that a self abort is not required. Cleanup and + * oplog entry replicating in this case is done after setting the state. + */ kAborted, /** * Below state indicates that the index build thread has voted for an abort to the current |