diff options
author | Josef Ahmad <josef.ahmad@mongodb.com> | 2023-05-17 15:27:32 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-05-17 16:28:35 +0000 |
commit | 846ed8250b4c3322e52c2e51c4a6992c6ce7ba34 (patch) | |
tree | 97afaae7f21b62cfa9c24cd94e7d5b18ed24b6c4 /src/mongo | |
parent | b46ba042fdce866cb42bca247febecde398fd1c4 (diff) | |
download | mongo-846ed8250b4c3322e52c2e51c4a6992c6ce7ba34.tar.gz |
SERVER-77083 Index build stepup async task should tolerate stepdowns
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/index_builds_coordinator.cpp | 22 |
1 files changed, 16 insertions, 6 deletions
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 4961d533f68..d8c737757ba 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -98,6 +98,7 @@ MONGO_FAIL_POINT_DEFINE(failIndexBuildWithErrorInSecondDrain); MONGO_FAIL_POINT_DEFINE(hangInRemoveIndexBuildEntryAfterCommitOrAbort); MONGO_FAIL_POINT_DEFINE(hangIndexBuildOnSetupBeforeTakingLocks); MONGO_FAIL_POINT_DEFINE(hangAbortIndexBuildByBuildUUIDAfterLocks); +MONGO_FAIL_POINT_DEFINE(hangOnStepUpAsyncTaskBeforeCheckingCommitQuorum); IndexBuildsCoordinator::IndexBuildsSSS::IndexBuildsSSS() : ServerStatusSection("indexBuilds"), @@ -1703,6 +1704,8 @@ void IndexBuildsCoordinator::_onStepUpAsyncTaskFn(OperationContext* opCtx) { // This reads from system.indexBuilds collection to see if commit quorum got // satisfied. try { + hangOnStepUpAsyncTaskBeforeCheckingCommitQuorum.pauseWhileSet(); + if (_signalIfCommitQuorumIsSatisfied(opCtx, replState)) { // The index build has been signalled to commit. As retrying skipped records // during step-up is done to prevent waiting until commit time, if the build @@ -1711,12 +1714,20 @@ void IndexBuildsCoordinator::_onStepUpAsyncTaskFn(OperationContext* opCtx) { return; } } catch (DBException& ex) { + // If the operation context is interrupted (shutdown, stepdown, killOp), stop + // the verification process and exit. + opCtx->checkForInterrupt(); + fassert(31440, ex.toStatus()); } } try { - // Only checks if key generation is valid, does not actually insert. + // Unlike the primary, secondaries cannot fail immediately when detecting key + // generation errors; they instead temporarily store them in the 'skipped records' + // table, to validate them on commit. As an optimisation to potentially detect + // errors earlier, check the table on step-up. Unlike during commit, we only check + // key generation here, we do not actually insert the keys. uassertStatusOK(_indexBuildsManager.retrySkippedRecords( opCtx, replState->buildUUID, @@ -1724,13 +1735,12 @@ void IndexBuildsCoordinator::_onStepUpAsyncTaskFn(OperationContext* opCtx) { IndexBuildsManager::RetrySkippedRecordMode::kKeyGeneration)); } catch (const DBException& ex) { - // Shutdown or replication state change might happen while iterating the index - // builds. In both cases, the opCtx is interrupted, in which case we want to stop - // the verification process and exit. This might also be the case for a killOp. + // If the operation context is interrupted (shutdown, stepdown, killOp), stop the + // verification process and exit. opCtx->checkForInterrupt(); - // All other errors must be due to key generation. We can abort the build early as - // it would eventually fail anyways during the commit phase retry. + // All other errors must be due to key generation. Abort the build now, instead of + // failing later during the commit phase retry. auto status = ex.toStatus().withContext("Skipped records retry failed on step-up"); abortIndexBuildByBuildUUID( opCtx, replState->buildUUID, IndexBuildAction::kPrimaryAbort, status.reason()); |