diff options
author | Josef Ahmad <josef.ahmad@mongodb.com> | 2023-04-04 12:15:17 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-04-04 13:07:44 +0000 |
commit | 806b58d5fac1b17d848b3c7c997b67b68440b2ba (patch) | |
tree | 59bd4047a8b07b7491549ad57b1598b9f763d176 | |
parent | b7a17172c80bf156595ad3e9d92ea9ec900c03e2 (diff) | |
download | mongo-806b58d5fac1b17d848b3c7c997b67b68440b2ba.tar.gz |
SERVER-75308 Fix race between external and internal index build aborts
-rw-r--r-- | jstests/noPassthrough/index_build_external_and_internal_abort.js | 70 | ||||
-rw-r--r-- | src/mongo/db/index_builds_coordinator.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/index_builds_coordinator_mongod.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/repl_index_build_state.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/repl_index_build_state.h | 6 |
5 files changed, 102 insertions, 3 deletions
diff --git a/jstests/noPassthrough/index_build_external_and_internal_abort.js b/jstests/noPassthrough/index_build_external_and_internal_abort.js new file mode 100644 index 00000000000..b3b3e671abd --- /dev/null +++ b/jstests/noPassthrough/index_build_external_and_internal_abort.js @@ -0,0 +1,70 @@ +/** + * Validate a scenario where an external index build abort (e.g. collection drop) races with an + * internal index build abort (e.g. build failed due to invalid keys). + * + * @tags: [ + * featureFlagIndexBuildGracefulErrorHandling, + * requires_replication, + * ] + */ +(function() { +"use strict"; + +load("jstests/libs/feature_flag_util.js"); +load('jstests/noPassthrough/libs/index_build.js'); + +const rst = new ReplSetTest({ + nodes: [ + {}, + ] +}); +rst.startSet(); +rst.initiate(); + +const primary = rst.getPrimary(); +const testDB = primary.getDB('test'); +const coll = testDB.getCollection('test'); + +assert.commandWorked(coll.insert({point: {x: -15.0, y: "abc"}})); + +let indexBuilderThreadFP = + configureFailPoint(testDB, 'hangIndexBuildBeforeTransitioningReplStateTokAwaitPrimaryAbort'); +let connThreadFP = configureFailPoint(testDB, 'hangInRemoveIndexBuildEntryAfterCommitOrAbort'); + +// Will fail with error code 13026: "geo values must be 'legacy coordinate pairs' for 2d indexes" +const waitForIndexBuild = + IndexBuildTest.startIndexBuild(primary, coll.getFullName(), {point: "2d"}, {}, 13026); + +// Wait for the index builder to detect the malformed geo value and to hang before transitioning +// the index build state to kAwaitPrimaryAbort. +indexBuilderThreadFP.wait(); + +// Drop the underlying collection. +const awaitDropCollection = + startParallelShell(funWithArgs(function(collName) { + assert.commandWorked(db.runCommand({drop: collName})); + }, coll.getName()), primary.port); + +// Wait for the 'drop' command to hang while tearing down the index build, just after setting the +// index build state to kAborted. +connThreadFP.wait(); + +// Resume the index builder thread, which would now try to abort an index that's already in kAbort +// state. +indexBuilderThreadFP.off(); + +// Wait for the log to confirm the index builder won't attempt to abort the build, because it's +// already in aborted state. +checkLog.containsJson(primary, 7530800); + +// Resume the collection drop and wait for its completion. +connThreadFP.off(); +awaitDropCollection(); + +waitForIndexBuild(); + +// The collection does not exist. +assert.eq(testDB.getCollectionNames().indexOf(coll.getName()), -1, "collection still exists."); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 2212b975414..cedf0227954 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -92,6 +92,7 @@ MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeWaitingUntilMajorityOpTime); MONGO_FAIL_POINT_DEFINE(hangBeforeUnregisteringAfterCommit); MONGO_FAIL_POINT_DEFINE(failSetUpResumeIndexBuild); MONGO_FAIL_POINT_DEFINE(failIndexBuildWithError); +MONGO_FAIL_POINT_DEFINE(hangInRemoveIndexBuildEntryAfterCommitOrAbort); IndexBuildsCoordinator::IndexBuildsSSS::IndexBuildsSSS() : ServerStatusSection("indexBuilds"), @@ -191,6 +192,8 @@ void removeIndexBuildEntryAfterCommitOrAbort(OperationContext* opCtx, return; } + hangInRemoveIndexBuildEntryAfterCommitOrAbort.pauseWhileSet(); + auto replCoord = repl::ReplicationCoordinator::get(opCtx); if (!replCoord->canAcceptWritesFor(opCtx, dbAndUUID)) { return; diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp index 48d582dafb8..3e4aa686c64 100644 --- a/src/mongo/db/index_builds_coordinator_mongod.cpp +++ b/src/mongo/db/index_builds_coordinator_mongod.cpp @@ -70,6 +70,7 @@ MONGO_FAIL_POINT_DEFINE(hangBeforeInitializingIndexBuild); MONGO_FAIL_POINT_DEFINE(hangIndexBuildAfterSignalPrimaryForCommitReadiness); MONGO_FAIL_POINT_DEFINE(hangBeforeRunningIndexBuild); MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeSignalingPrimaryForAbort); +MONGO_FAIL_POINT_DEFINE(hangIndexBuildBeforeTransitioningReplStateTokAwaitPrimaryAbort); const StringData kMaxNumActiveUserIndexBuildsServerParameterName = "maxNumActiveUserIndexBuilds"_sd; @@ -681,13 +682,28 @@ bool IndexBuildsCoordinatorMongod::_signalIfCommitQuorumNotEnabled( void IndexBuildsCoordinatorMongod::_signalPrimaryForAbortAndWaitForExternalAbort( OperationContext* opCtx, ReplIndexBuildState* replState, const Status& abortStatus) { + + hangIndexBuildBeforeTransitioningReplStateTokAwaitPrimaryAbort.pauseWhileSet(opCtx); + LOGV2(7419402, "Index build: signaling primary to abort index build", "buildUUID"_attr = replState->buildUUID, logAttrs(replState->dbName), "collectionUUID"_attr = replState->collectionUUID, "reason"_attr = abortStatus); - replState->requestAbortFromPrimary(abortStatus); + const auto transitionedToWaitForAbort = replState->requestAbortFromPrimary(abortStatus); + + if (!transitionedToWaitForAbort) { + // The index build has likely been aborted externally (e.g. its underlying collection was + // dropped), and it's in the midst of tearing down. There's nothing else to do here. + LOGV2(7530800, + "Index build: the build is already in aborted state; not signaling primary to abort", + "buildUUID"_attr = replState->buildUUID, + "db"_attr = replState->dbName, + "collectionUUID"_attr = replState->collectionUUID, + "reason"_attr = abortStatus); + return; + } hangIndexBuildBeforeSignalingPrimaryForAbort.pauseWhileSet(opCtx); diff --git a/src/mongo/db/repl_index_build_state.cpp b/src/mongo/db/repl_index_build_state.cpp index ebc1d0e72f0..29bd6d22fab 100644 --- a/src/mongo/db/repl_index_build_state.cpp +++ b/src/mongo/db/repl_index_build_state.cpp @@ -229,7 +229,7 @@ void ReplIndexBuildState::commit(OperationContext* opCtx) { }); } -void ReplIndexBuildState::requestAbortFromPrimary(const Status& abortStatus) { +bool ReplIndexBuildState::requestAbortFromPrimary(const Status& abortStatus) { invariant(protocol == IndexBuildProtocol::kTwoPhase); stdx::lock_guard lk(_mutex); @@ -245,8 +245,14 @@ void ReplIndexBuildState::requestAbortFromPrimary(const Status& abortStatus) { "buildUUID"_attr = buildUUID); } + if (_indexBuildState.isAborted()) { + return false; + } + _indexBuildState.setState( IndexBuildState::kAwaitPrimaryAbort, false /* skipCheck */, boost::none, abortStatus); + + return true; } Timestamp ReplIndexBuildState::getCommitTimestamp() const { diff --git a/src/mongo/db/repl_index_build_state.h b/src/mongo/db/repl_index_build_state.h index eff7b0c976e..a1b4a1fe2bc 100644 --- a/src/mongo/db/repl_index_build_state.h +++ b/src/mongo/db/repl_index_build_state.h @@ -309,8 +309,12 @@ public: /** * Only for two-phase index builds. Requests the primary to abort the build, and transitions * into a waiting state. + * + * Returns true if the thread has transitioned into the waiting state. + * Returns false if the build is already in abort state. This can happen if the build detected + * an error while an external operation (e.g. a collection drop) is concurrently aborting it. */ - void requestAbortFromPrimary(const Status& abortStatus); + bool requestAbortFromPrimary(const Status& abortStatus); /** * Returns timestamp for committing this index build. |