diff options
author | Louis Williams <louis.williams@mongodb.com> | 2020-04-10 10:02:34 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-10 14:16:22 +0000 |
commit | 3d929ed533a72446353b18b5d60770aed33b58f1 (patch) | |
tree | 09bd3562511aef88eb6aa1ef3b6c45b8d73c16e9 /src | |
parent | 76d4548a751a56c8faf1887114685b540203a650 (diff) | |
download | mongo-3d929ed533a72446353b18b5d60770aed33b58f1.tar.gz |
SERVER-46560 Make abort index build deterministic
This redesigns user index build abort to have the following behavior:
- Take a collection X lock to stop the index build from making progress
- If we are no longer primary, return an error
- Check whether we can abort the index build (i.e. it is not already committing
or aborting)
- Delete the index catalog entry and write the abortIndexBuild oplog entry in a WUOW
- Interrupt the index builder thread
- Wait for the thread to exit
- Release locks
Diffstat (limited to 'src')
26 files changed, 882 insertions, 1321 deletions
diff --git a/src/mongo/db/catalog/drop_collection.cpp b/src/mongo/db/catalog/drop_collection.cpp index a0deed258c7..6df38942003 100644 --- a/src/mongo/db/catalog/drop_collection.cpp +++ b/src/mongo/db/catalog/drop_collection.cpp @@ -159,21 +159,21 @@ Status _abortIndexBuildsAndDropCollection(OperationContext* opCtx, const int numIndexes = coll->getIndexCatalog()->numIndexesTotal(opCtx); while (true) { - // Send the abort signal to any active index builds on the collection. - indexBuildsCoord->abortCollectionIndexBuildsNoWait( - opCtx, - collectionUUID, - str::stream() << "Collection " << coll->ns() << "(" << collectionUUID - << ") is being dropped"); - - // Now that the abort signals were sent out to the active index builders for this - // collection, we need to release the lock temporarily to allow those index builders to - // process the abort signal. Holding a lock here will cause the index builders to block - // indefinitely. + // Save a copy of the namespace before yielding our locks. + const NamespaceString collectionNs = coll->ns(); + + // Release locks before aborting index builds. The helper will acquire locks on our behalf. collLock = boost::none; autoDb = boost::none; - indexBuildsCoord->awaitNoIndexBuildInProgressForCollection(opCtx, collectionUUID); + // Send the abort signal to any active index builds on the collection. This waits until all + // aborted index builds complete. + indexBuildsCoord->abortCollectionIndexBuilds(opCtx, + collectionNs, + collectionUUID, + str::stream() + << "Collection " << collectionNs << "(" + << collectionUUID << ") is being dropped"); // Take an exclusive lock to finish the collection drop. autoDb.emplace(opCtx, startingNss.db(), MODE_IX); @@ -195,24 +195,6 @@ Status _abortIndexBuildsAndDropCollection(OperationContext* opCtx, if (!abortAgain) { break; } - - // We only need to hold an intent lock to send an abort signal to the active index - // builders on this collection. 
- collLock = boost::none; - autoDb = boost::none; - - autoDb.emplace(opCtx, startingNss.db(), MODE_IX); - collLock.emplace(opCtx, dbAndUUID, MODE_IX); - - // Abandon the snapshot as the index catalog will compare the in-memory state to the - // disk state, which may have changed when we released the collection lock temporarily. - opCtx->recoveryUnit()->abandonSnapshot(); - - coll = CollectionCatalog::get(opCtx).lookupCollectionByUUID(opCtx, collectionUUID); - status = _checkNssAndReplState(opCtx, coll); - if (!status.isOK()) { - return status; - } } // It's possible for the given collection to be drop pending after obtaining the locks again, if diff --git a/src/mongo/db/catalog/drop_database.cpp b/src/mongo/db/catalog/drop_database.cpp index 5e0f097bc3f..8a50570b765 100644 --- a/src/mongo/db/catalog/drop_database.cpp +++ b/src/mongo/db/catalog/drop_database.cpp @@ -176,10 +176,6 @@ Status _dropDatabase(OperationContext* opCtx, const std::string& dbName, bool ab // We need to keep aborting all the active index builders for this database until there // are none left when we retrieve the exclusive database lock again. while (indexBuildsCoord->inProgForDb(dbName)) { - // Sends the abort signal to all the active index builders for this database. - indexBuildsCoord->abortDatabaseIndexBuildsNoWait( - opCtx, dbName, "dropDatabase command"); - // Create a scope guard to reset the drop-pending state on the database to false if // there is a replica state change that kills this operation while the locks were // yielded. @@ -192,12 +188,12 @@ Status _dropDatabase(OperationContext* opCtx, const std::string& dbName, bool ab dropPendingGuard.dismiss(); }); - // Now that the abort signals were sent out to the active index builders for this - // database, we need to release the lock temporarily to allow those index builders - // to process the abort signal. Holding a lock here will cause the index builders to - // block indefinitely. + // Drop locks. 
The drop helper will acquire locks on our behalf. autoDB = boost::none; - indexBuildsCoord->awaitNoBgOpInProgForDb(opCtx, dbName); + + // Sends the abort signal to all the active index builders for this database. Waits + // for aborted index builds to complete. + indexBuildsCoord->abortDatabaseIndexBuilds(opCtx, dbName, "dropDatabase command"); if (MONGO_unlikely(dropDatabaseHangAfterWaitingForIndexBuilds.shouldFail())) { LOGV2(4612300, diff --git a/src/mongo/db/catalog/drop_indexes.cpp b/src/mongo/db/catalog/drop_indexes.cpp index be8dcad5a21..a3bf4802de9 100644 --- a/src/mongo/db/catalog/drop_indexes.cpp +++ b/src/mongo/db/catalog/drop_indexes.cpp @@ -153,13 +153,13 @@ StatusWith<std::vector<std::string>> getIndexNames(OperationContext* opCtx, /** * Attempts to abort a single index builder that is responsible for all the index names passed in. */ -std::vector<UUID> abortIndexBuildByIndexNamesNoWait(OperationContext* opCtx, - Collection* collection, - std::vector<std::string> indexNames) { +std::vector<UUID> abortIndexBuildByIndexNames(OperationContext* opCtx, + UUID collectionUUID, + std::vector<std::string> indexNames) { boost::optional<UUID> buildUUID = - IndexBuildsCoordinator::get(opCtx)->abortIndexBuildByIndexNamesNoWait( - opCtx, collection->uuid(), indexNames, std::string("dropIndexes command")); + IndexBuildsCoordinator::get(opCtx)->abortIndexBuildByIndexNames( + opCtx, collectionUUID, indexNames, std::string("dropIndexes command")); if (buildUUID) { return {*buildUUID}; } @@ -210,20 +210,19 @@ Status dropIndexByDescriptor(OperationContext* opCtx, * otherwise this attempts to abort a single index builder building the given index names. 
*/ std::vector<UUID> abortActiveIndexBuilders(OperationContext* opCtx, - Collection* collection, + const NamespaceString& collectionNs, + CollectionUUID collectionUUID, const std::vector<std::string>& indexNames) { - invariant(opCtx->lockState()->isCollectionLockedForMode(collection->ns(), MODE_IX)); - if (indexNames.empty()) { return {}; } if (indexNames.front() == "*") { - return IndexBuildsCoordinator::get(opCtx)->abortCollectionIndexBuildsNoWait( - opCtx, collection->uuid(), "dropIndexes command"); + return IndexBuildsCoordinator::get(opCtx)->abortCollectionIndexBuilds( + opCtx, collectionNs, collectionUUID, "dropIndexes command"); } - return abortIndexBuildByIndexNamesNoWait(opCtx, collection, indexNames); + return abortIndexBuildByIndexNames(opCtx, collectionUUID, indexNames); } Status dropReadyIndexes(OperationContext* opCtx, @@ -333,20 +332,18 @@ Status dropIndexes(OperationContext* opCtx, indexNames = swIndexNames.getValue(); - // Send the abort signal to any index builders that match the users request. - abortedIndexBuilders = abortActiveIndexBuilders(opCtx, collection, indexNames); + // Copy the namespace and UUID before dropping locks. + auto collUUID = collection->uuid(); + auto collNs = collection->ns(); - // Now that the abort signals were sent to the intended index builders, release our lock - // temporarily to allow the index builders to process the abort signal. Holding a lock here - // will cause the index builders to block indefinitely. + // Release locks before aborting index builds. The helper will acquire locks on our behalf. autoColl = boost::none; - if (abortedIndexBuilders.size() == 1) { - indexBuildsCoord->awaitIndexBuildFinished(opCtx, abortedIndexBuilders.front()); - } else if (abortedIndexBuilders.size() > 1) { - // Only the "*" wildcard can abort multiple index builders. 
- invariant(isWildcard); - indexBuildsCoord->awaitNoIndexBuildInProgressForCollection(opCtx, collectionUUID); - } + + // Send the abort signal to any index builders that match the users request. Waits until all + // aborted builders complete. + auto justAborted = abortActiveIndexBuilders(opCtx, collNs, collUUID, indexNames); + abortedIndexBuilders.insert( + abortedIndexBuilders.end(), justAborted.begin(), justAborted.end()); // Take an exclusive lock on the collection now to be able to perform index catalog writes // when removing ready indexes from disk. @@ -380,35 +377,11 @@ Status dropIndexes(OperationContext* opCtx, if (!abortAgain) { break; } - - // We only need to hold an intent lock to send abort signals to the active index - // builder(s) we intend to abort. - autoColl = boost::none; - autoColl.emplace(opCtx, dbAndUUID, MODE_IX); - - // Abandon the snapshot as the index catalog will compare the in-memory state to the - // disk state, which may have changed when we released the lock temporarily. - opCtx->recoveryUnit()->abandonSnapshot(); - - db = autoColl->getDb(); - collection = autoColl->getCollection(); - if (!collection) { - return Status(ErrorCodes::NamespaceNotFound, - str::stream() << "ns not found on database " << dbAndUUID.db() - << " with collection " << dbAndUUID.uuid()); - } - - status = checkReplState(opCtx, dbAndUUID); - if (!status.isOK()) { - return status; - } } // If the "*" wildcard was not specified, verify that all the index names belonging to the // index builder were aborted. if (!isWildcard && !abortedIndexBuilders.empty()) { - invariant(abortedIndexBuilders.size() == 1); - // This is necessary to check shard version. 
OldClientContext ctx(opCtx, collection->ns().ns()); diff --git a/src/mongo/db/catalog/index_builds_manager.cpp b/src/mongo/db/catalog/index_builds_manager.cpp index 1388bf5b4f8..dbb307c98af 100644 --- a/src/mongo/db/catalog/index_builds_manager.cpp +++ b/src/mongo/db/catalog/index_builds_manager.cpp @@ -37,6 +37,7 @@ #include "mongo/db/catalog/collection_catalog.h" #include "mongo/db/catalog/index_catalog.h" #include "mongo/db/catalog/index_timestamp_helper.h" +#include "mongo/db/catalog/uncommitted_collections.h" #include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/operation_context.h" #include "mongo/db/storage/write_unit_of_work.h" @@ -51,7 +52,7 @@ namespace { /** * Returns basic info on index builders. */ -std::string toSummary(const std::map<UUID, std::shared_ptr<MultiIndexBlock>>& builders) { +std::string toSummary(const std::map<UUID, std::unique_ptr<MultiIndexBlock>>& builders) { str::stream ss; ss << "Number of builders: " << builders.size() << ": ["; bool first = true; @@ -253,26 +254,25 @@ Status IndexBuildsManager::commitIndexBuild(OperationContext* opCtx, return status; } wunit.commit(); - - // Required call to clean up even though commit cleaned everything up. 
- builder->cleanUpAfterBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); - _unregisterIndexBuild(buildUUID); return Status::OK(); }); } -bool IndexBuildsManager::abortIndexBuild(const UUID& buildUUID, const std::string& reason) { - stdx::unique_lock<Latch> lk(_mutex); - - auto builderIt = _builders.find(buildUUID); - if (builderIt == _builders.end()) { +bool IndexBuildsManager::abortIndexBuild(OperationContext* opCtx, + Collection* collection, + const UUID& buildUUID, + OnCleanUpFn onCleanUpFn) { + auto builder = _getBuilder(buildUUID); + if (!builder.isOK()) { return false; } - std::shared_ptr<MultiIndexBlock> builder = builderIt->second; + // Since abortIndexBuild is special in that it can be called by threads other than the index + // builder, ensure the caller has an exclusive lock. + auto nss = collection->ns(); + UncommittedCollections::get(opCtx).invariantHasExclusiveAccessToCollection(opCtx, nss); - lk.unlock(); - builder->abort(reason); + builder.getValue()->abortIndexBuild(opCtx, collection, onCleanUpFn); return true; } @@ -291,25 +291,9 @@ bool IndexBuildsManager::abortIndexBuildWithoutCleanup(OperationContext* opCtx, "reason"_attr = reason); builder.getValue()->abortWithoutCleanup(opCtx); - builder.getValue()->cleanUpAfterBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); - _unregisterIndexBuild(buildUUID); - return true; } -void IndexBuildsManager::tearDownIndexBuild(OperationContext* opCtx, - Collection* collection, - const UUID& buildUUID, - OnCleanUpFn onCleanUpFn) { - auto builder = _getBuilder(buildUUID); - if (!builder.isOK()) { - return; - } - - builder.getValue()->cleanUpAfterBuild(opCtx, collection, onCleanUpFn); - _unregisterIndexBuild(buildUUID); -} - bool IndexBuildsManager::isBackgroundBuilding(const UUID& buildUUID) { auto builder = invariant(_getBuilder(buildUUID)); return builder->isBackgroundBuilding(); @@ -322,11 +306,11 @@ void IndexBuildsManager::verifyNoIndexBuilds_forTestOnly() { void 
IndexBuildsManager::_registerIndexBuild(UUID buildUUID) { stdx::unique_lock<Latch> lk(_mutex); - std::shared_ptr<MultiIndexBlock> mib = std::make_shared<MultiIndexBlock>(); - invariant(_builders.insert(std::make_pair(buildUUID, mib)).second); + auto mib = std::make_unique<MultiIndexBlock>(); + invariant(_builders.insert(std::make_pair(buildUUID, std::move(mib))).second); } -void IndexBuildsManager::_unregisterIndexBuild(const UUID& buildUUID) { +void IndexBuildsManager::unregisterIndexBuild(const UUID& buildUUID) { stdx::unique_lock<Latch> lk(_mutex); auto builderIt = _builders.find(buildUUID); @@ -336,14 +320,13 @@ void IndexBuildsManager::_unregisterIndexBuild(const UUID& buildUUID) { _builders.erase(builderIt); } -StatusWith<std::shared_ptr<MultiIndexBlock>> IndexBuildsManager::_getBuilder( - const UUID& buildUUID) { +StatusWith<MultiIndexBlock*> IndexBuildsManager::_getBuilder(const UUID& buildUUID) { stdx::unique_lock<Latch> lk(_mutex); auto builderIt = _builders.find(buildUUID); if (builderIt == _builders.end()) { return {ErrorCodes::NoSuchKey, str::stream() << "No index build with UUID: " << buildUUID}; } - return builderIt->second; + return builderIt->second.get(); } } // namespace mongo diff --git a/src/mongo/db/catalog/index_builds_manager.h b/src/mongo/db/catalog/index_builds_manager.h index 0b6becb6689..0686bd6af81 100644 --- a/src/mongo/db/catalog/index_builds_manager.h +++ b/src/mongo/db/catalog/index_builds_manager.h @@ -87,6 +87,11 @@ public: SetupOptions options = {}); /** + * Unregisters the builder associated with the given buildUUID from the _builders map. + */ + void unregisterIndexBuild(const UUID& buildUUID); + + /** * Runs the scanning/insertion phase of the index build.. */ Status startBuildingIndex(OperationContext* opCtx, @@ -138,12 +143,13 @@ public: OnCommitFn onCommitFn); /** - * Signals the index build to be aborted and returns without waiting for completion. 
- * - * Returns true if a build existed to be signaled, as opposed to having already finished and - * been cleared away, or not having yet started.. + * Deletes the index entry from the durable catalog. */ - bool abortIndexBuild(const UUID& buildUUID, const std::string& reason); + using OnCleanUpFn = MultiIndexBlock::OnCleanUpFn; + bool abortIndexBuild(OperationContext* opCtx, + Collection* collection, + const UUID& buildUUID, + OnCleanUpFn onCleanUpFn); /** * Signals the index build to be aborted without being cleaned up and returns without waiting @@ -158,15 +164,6 @@ public: const std::string& reason); /** - * Cleans up the index build state and unregisters it from the manager. - */ - using OnCleanUpFn = MultiIndexBlock::OnCleanUpFn; - void tearDownIndexBuild(OperationContext* opCtx, - Collection* collection, - const UUID& buildUUID, - OnCleanUpFn onCleanUpFn); - - /** * Returns true if the index build supports background writes while building an index. This is * true for the kHybrid method. */ @@ -184,21 +181,16 @@ private: void _registerIndexBuild(UUID buildUUID); /** - * Unregisters the builder associcated with the given buildUUID from the _builders map. - */ - void _unregisterIndexBuild(const UUID& buildUUID); - - /** - * Returns a shared pointer to the builder. Returns a bad status if the builder does not exist. + * Returns a pointer to the builder. Returns a bad status if the builder does not exist. */ - StatusWith<std::shared_ptr<MultiIndexBlock>> _getBuilder(const UUID& buildUUID); + StatusWith<MultiIndexBlock*> _getBuilder(const UUID& buildUUID); // Protects the map data structures below. mutable Mutex _mutex = MONGO_MAKE_LATCH("IndexBuildsManager::_mutex"); // Map of index builders by build UUID. Allows access to the builders so that actions can be // taken on and information passed to and from index builds. 
- std::map<UUID, std::shared_ptr<MultiIndexBlock>> _builders; + std::map<UUID, std::unique_ptr<MultiIndexBlock>> _builders; }; } // namespace mongo diff --git a/src/mongo/db/catalog/index_builds_manager_test.cpp b/src/mongo/db/catalog/index_builds_manager_test.cpp index 34f1e26d212..aa43f898082 100644 --- a/src/mongo/db/catalog/index_builds_manager_test.cpp +++ b/src/mongo/db/catalog/index_builds_manager_test.cpp @@ -91,12 +91,12 @@ TEST_F(IndexBuildsManagerTest, IndexBuildsManagerSetUpAndTearDown) { _buildUUID, MultiIndexBlock::kNoopOnInitFn)); - _indexBuildsManager.tearDownIndexBuild(operationContext(), - autoColl.getCollection(), - _buildUUID, - MultiIndexBlock::kNoopOnCleanUpFn); + _indexBuildsManager.abortIndexBuild(operationContext(), + autoColl.getCollection(), + _buildUUID, + MultiIndexBlock::kNoopOnCleanUpFn); + _indexBuildsManager.unregisterIndexBuild(_buildUUID); } - } // namespace } // namespace mongo diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp index 8d78c98481f..06e3158dfb7 100644 --- a/src/mongo/db/catalog/multi_index_block.cpp +++ b/src/mongo/db/catalog/multi_index_block.cpp @@ -78,41 +78,16 @@ MultiIndexBlock::~MultiIndexBlock() { MultiIndexBlock::OnCleanUpFn MultiIndexBlock::kNoopOnCleanUpFn = []() {}; -void MultiIndexBlock::cleanUpAfterBuild(OperationContext* opCtx, - Collection* collection, - OnCleanUpFn onCleanUp) { +void MultiIndexBlock::abortIndexBuild(OperationContext* opCtx, + Collection* collection, + OnCleanUpFn onCleanUp) noexcept { if (_collectionUUID) { // init() was previously called with a collection pointer, so ensure that the same // collection is being provided for clean up and the interface in not being abused. 
invariant(_collectionUUID.get() == collection->uuid()); } - if (_indexes.empty()) { - _buildIsCleanedUp = true; - return; - } - - if (!_needToCleanup) { - CollectionQueryInfo::get(collection).clearQueryCache(); - - // The temp tables cannot be dropped in commit() because commit() can be called multiple - // times on write conflict errors and the drop does not rollback in WUOWs. - - // Make lock acquisition uninterruptible. - UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - - // Lock if it's not already locked, to ensure storage engine cannot be destructed out from - // underneath us. - boost::optional<Lock::GlobalLock> lk; - if (!opCtx->lockState()->isWriteLocked()) { - lk.emplace(opCtx, MODE_IS); - } - - for (auto& index : _indexes) { - index.block->deleteTemporaryTables(opCtx); - } - - _buildIsCleanedUp = true; + if (_buildIsCleanedUp) { return; } @@ -200,15 +175,6 @@ StatusWith<std::vector<BSONObj>> MultiIndexBlock::init(OperationContext* opCtx, invariant(opCtx->lockState()->isCollectionLockedForMode(collection->ns(), MODE_X), str::stream() << "Collection " << collection->ns() << " with UUID " << collection->uuid() << " is holding the incorrect lock"); - if (State::kAborted == _getState()) { - return {ErrorCodes::IndexBuildAborted, - str::stream() << "Index build aborted: " << _abortReason - << ". Cannot initialize index builder: " << collection->ns() << " (" - << collection->uuid() << "): " << indexSpecs.size() - << " index spec(s) provided. First index spec: " - << (indexSpecs.empty() ? 
BSONObj() : indexSpecs[0])}; - } - _collectionUUID = collection->uuid(); _buildIsCleanedUp = false; @@ -227,7 +193,7 @@ StatusWith<std::vector<BSONObj>> MultiIndexBlock::init(OperationContext* opCtx, for (auto& index : _indexes) { index.block->deleteTemporaryTables(opCtx); } - _indexes.clear(); + _buildIsCleanedUp = true; }); const auto& ns = collection->ns().ns(); @@ -338,9 +304,6 @@ StatusWith<std::vector<BSONObj>> MultiIndexBlock::init(OperationContext* opCtx, } wunit.commit(); - - _setState(State::kRunning); - return indexInfoObjs; // Avoid converting WCE to Status } catch (const WriteConflictException&) { @@ -372,6 +335,7 @@ void failPointHangDuringBuild(FailPoint* fp, StringData where, const BSONObj& do Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx, Collection* collection) { + invariant(!_buildIsCleanedUp); invariant(opCtx->lockState()->isNoop() || !opCtx->lockState()->inAWriteUnitOfWork()); // UUIDs are not guaranteed during startup because the check happens after indexes are rebuilt. 
@@ -524,11 +488,7 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx, } Status MultiIndexBlock::insert(OperationContext* opCtx, const BSONObj& doc, const RecordId& loc) { - if (State::kAborted == _getState()) { - return {ErrorCodes::IndexBuildAborted, - str::stream() << "Index build aborted: " << _abortReason}; - } - + invariant(!_buildIsCleanedUp); for (size_t i = 0; i < _indexes.size(); i++) { if (_indexes[i].filterExpression && !_indexes[i].filterExpression->matchesBSON(doc)) { continue; @@ -557,11 +517,7 @@ Status MultiIndexBlock::dumpInsertsFromBulk(OperationContext* opCtx) { Status MultiIndexBlock::dumpInsertsFromBulk(OperationContext* opCtx, std::set<RecordId>* dupRecords) { - if (State::kAborted == _getState()) { - return {ErrorCodes::IndexBuildAborted, - str::stream() << "Index build aborted: " << _abortReason}; - } - + invariant(!_buildIsCleanedUp); invariant(opCtx->lockState()->isNoop() || !opCtx->lockState()->inAWriteUnitOfWork()); for (size_t i = 0; i < _indexes.size(); i++) { // If 'dupRecords' is provided, it will be used to store all records that would result in @@ -620,15 +576,7 @@ Status MultiIndexBlock::drainBackgroundWrites( OperationContext* opCtx, RecoveryUnit::ReadSource readSource, IndexBuildInterceptor::DrainYieldPolicy drainYieldPolicy) { - if (State::kAborted == _getState()) { - return {ErrorCodes::IndexBuildAborted, - str::stream() << "Index build aborted: " << _abortReason - << ". Cannot complete drain phase for index build" - << (_collectionUUID ? (" on collection '" + - _collectionUUID.get().toString() + "'") - : ".")}; - } - + invariant(!_buildIsCleanedUp); invariant(!opCtx->lockState()->inAWriteUnitOfWork()); // Drain side-writes table for each index. This only drains what is visible. 
Assuming intent @@ -659,6 +607,7 @@ Status MultiIndexBlock::drainBackgroundWrites( } Status MultiIndexBlock::retrySkippedRecords(OperationContext* opCtx, Collection* collection) { + invariant(!_buildIsCleanedUp); for (auto&& index : _indexes) { auto interceptor = index.block->getEntry()->indexBuildInterceptor(); if (!interceptor) @@ -673,14 +622,7 @@ Status MultiIndexBlock::retrySkippedRecords(OperationContext* opCtx, Collection* } Status MultiIndexBlock::checkConstraints(OperationContext* opCtx) { - if (State::kAborted == _getState()) { - return {ErrorCodes::IndexBuildAborted, - str::stream() << "Index build aborted: " << _abortReason - << ". Cannot complete constraint checking for index build" - << (_collectionUUID ? (" on collection '" + - _collectionUUID.get().toString() + "'") - : ".")}; - } + invariant(!_buildIsCleanedUp); // For each index that may be unique, check that no recorded duplicates still exist. This can // only check what is visible on the index. Callers are responsible for ensuring all writes to @@ -699,8 +641,7 @@ Status MultiIndexBlock::checkConstraints(OperationContext* opCtx) { } void MultiIndexBlock::abortWithoutCleanup(OperationContext* opCtx) { - _setStateToAbortedIfNotCommitted("aborted without cleanup"_sd); - + invariant(!_buildIsCleanedUp); UninterruptibleLockGuard noInterrupt(opCtx->lockState()); // Lock if it's not already locked, to ensure storage engine cannot be destructed out from // underneath us. 
@@ -712,8 +653,7 @@ void MultiIndexBlock::abortWithoutCleanup(OperationContext* opCtx) { for (auto& index : _indexes) { index.block->deleteTemporaryTables(opCtx); } - _indexes.clear(); - _needToCleanup = false; + _buildIsCleanedUp = true; } MultiIndexBlock::OnCreateEachFn MultiIndexBlock::kNoopOnCreateEachFn = [](const BSONObj& spec) {}; @@ -723,6 +663,7 @@ Status MultiIndexBlock::commit(OperationContext* opCtx, Collection* collection, OnCreateEachFn onCreateEach, OnCommitFn onCommit) { + invariant(!_buildIsCleanedUp); invariant(opCtx->lockState()->isCollectionLockedForMode(collection->ns(), MODE_X), str::stream() << "Collection " << collection->ns() << " with UUID " << collection->uuid() << " is holding the incorrect lock"); @@ -732,14 +673,6 @@ Status MultiIndexBlock::commit(OperationContext* opCtx, invariant(_collectionUUID.get() == collection->uuid()); } - if (State::kAborted == _getState()) { - return { - ErrorCodes::IndexBuildAborted, - str::stream() << "Index build aborted: " << _abortReason - << ". Cannot commit index builder: " << collection->ns() - << (_collectionUUID ? (" (" + _collectionUUID->toString() + ")") : "")}; - } - // Do not interfere with writing multikey information when committing index builds. auto restartTracker = makeGuard( [this, opCtx] { MultikeyPathTracker::get(opCtx).startTrackingMultikeyPathInfo(); }); @@ -769,6 +702,11 @@ Status MultiIndexBlock::commit(OperationContext* opCtx, _indexes[i].block->getEntry()->setMultikey(opCtx, bulkBuilder->getMultikeyPaths()); } + // The commit() function can be called multiple times on write conflict errors. Dropping the + // temp tables cannot be rolled back, so do it only after the WUOW commits. 
+ opCtx->recoveryUnit()->onCommit( + [opCtx, i, this](auto commitTs) { _indexes[i].block->deleteTemporaryTables(opCtx); }); + if (opCtx->getServiceContext()->getStorageEngine()->supportsCheckpoints()) { // Add the new index ident to a list so that the validate cmd with {background:true} // can ignore the new index until it is regularly checkpoint'ed with the rest of the @@ -795,79 +733,20 @@ Status MultiIndexBlock::commit(OperationContext* opCtx, onCommit(); - // The state of this index build is set to Committed only when the WUOW commits. - // It is possible for abort() to be called after the check at the beginning of this function and - // before the WUOW is committed. If the WUOW commits, the final state of this index builder will - // be Committed. Otherwise, the index builder state will remain as Aborted and further attempts - // to commit this index build will fail. - opCtx->recoveryUnit()->onCommit( - [this](boost::optional<Timestamp> commitTime) { _setState(State::kCommitted); }); - - // On rollback sets MultiIndexBlock::_needToCleanup to true. 
- opCtx->recoveryUnit()->onRollback([this]() { _needToCleanup = true; }); - _needToCleanup = false; + opCtx->recoveryUnit()->onCommit([collection, this](boost::optional<Timestamp> commitTime) { + CollectionQueryInfo::get(collection).clearQueryCache(); + _buildIsCleanedUp = true; + }); return Status::OK(); } -bool MultiIndexBlock::isCommitted() const { - return State::kCommitted == _getState(); -} - -void MultiIndexBlock::abort(StringData reason) { - _setStateToAbortedIfNotCommitted(reason); -} - - bool MultiIndexBlock::isBackgroundBuilding() const { return _method == IndexBuildMethod::kHybrid; } void MultiIndexBlock::setIndexBuildMethod(IndexBuildMethod indexBuildMethod) { - invariant(_getState() == State::kUninitialized); _method = indexBuildMethod; } -MultiIndexBlock::State MultiIndexBlock::getState_forTest() const { - return _getState(); -} - -MultiIndexBlock::State MultiIndexBlock::_getState() const { - stdx::lock_guard<Latch> lock(_mutex); - return _state; -} - -void MultiIndexBlock::_setState(State newState) { - invariant(State::kAborted != newState); - stdx::lock_guard<Latch> lock(_mutex); - _state = newState; -} - -void MultiIndexBlock::_setStateToAbortedIfNotCommitted(StringData reason) { - stdx::lock_guard<Latch> lock(_mutex); - if (State::kCommitted == _state) { - return; - } - _state = State::kAborted; - _abortReason = reason.toString(); -} - -StringData toString(MultiIndexBlock::State state) { - switch (state) { - case MultiIndexBlock::State::kUninitialized: - return "Uninitialized"_sd; - case MultiIndexBlock::State::kRunning: - return "Running"_sd; - case MultiIndexBlock::State::kCommitted: - return "Committed"_sd; - case MultiIndexBlock::State::kAborted: - return "Aborted"_sd; - } - MONGO_UNREACHABLE; -} - -std::ostream& operator<<(std::ostream& os, const MultiIndexBlock::State& state) { - return os << toString(state); -} - } // namespace mongo diff --git a/src/mongo/db/catalog/multi_index_block.h b/src/mongo/db/catalog/multi_index_block.h index 
e600882ed00..9a17a6c1c58 100644 --- a/src/mongo/db/catalog/multi_index_block.h +++ b/src/mongo/db/catalog/multi_index_block.h @@ -79,26 +79,6 @@ public: ~MultiIndexBlock(); /** - * Ensures the index build state is cleared correctly after index build success or failure. - * - * Must be called before object destruction if init() has been called; and safe to call if - * init() has not been called. - * - * By only requiring this call after init(), we allow owners of the object to exit without - * further handling if they never use the object. - * - * `onCleanUp` will be called after all indexes have been removed from the catalog. - */ - using OnCleanUpFn = std::function<void()>; - void cleanUpAfterBuild(OperationContext* opCtx, Collection* collection, OnCleanUpFn onCleanUp); - - /** - * Not all index aborts need this function, in particular index builds that do not need - * to timestamp catalog writes. This is a no-op. - */ - static OnCleanUpFn kNoopOnCleanUpFn; - - /** * By default we enforce the 'unique' flag in specs when building an index by failing. * If this is called before init(), we will ignore unique violations. This has no effect if * no specs are unique. @@ -255,45 +235,34 @@ public: static OnCommitFn kNoopOnCommitFn; /** - * Returns true if this index builder was added to the index catalog successfully. - * In addition to having commit() return without errors, the enclosing WUOW has to be committed - * for the indexes to show up in the index catalog. - */ - bool isCommitted() const; - - /** - * Signals the index build to abort. - * - * In-progress inserts and commits will still run to completion. However, subsequent index build - * operations will fail an IndexBuildAborted error. + * Ensures the index build state is cleared correctly after index build failure. * - * Aborts the uncommitted index build and prevents further inserts or commit attempts from - * proceeding. On destruction, all traces of uncommitted index builds will be removed. 
- * - * If the index build has already been aborted (using abort() or abortWithoutCleanup()), - * this function does nothing. + * Must be called before object destruction if init() has been called; and safe to call if + * init() has not been called. * - * If this index build has been committed successfully, this function has no effect. + * By only requiring this call after init(), we allow owners of the object to exit without + * further handling if they never use the object. * - * May be called from any thread. + * `onCleanUp` will be called after all indexes have been removed from the catalog. + */ + using OnCleanUpFn = std::function<void()>; + void abortIndexBuild(OperationContext* opCtx, + Collection* collection, + OnCleanUpFn onCleanUp) noexcept; + + /** + * Not all index aborts need this function, in particular index builds that do not need + * to timestamp catalog writes. This is a no-op. */ - void abort(StringData reason); + static OnCleanUpFn kNoopOnCleanUpFn; /** * May be called at any time after construction but before a successful commit(). Suppresses - * the default behavior on destruction of removing all traces of uncommitted index builds. - * - * The most common use of this is if the indexes were already dropped via some other - * mechanism such as the whole collection being dropped. In that case, it would be invalid - * to try to remove the indexes again. Also, replication uses this to ensure that indexes - * that are being built on shutdown are resumed on startup. - * - * Do not use this unless you are really sure you need to. + * the default behavior on destruction of removing all traces of uncommitted index builds. Does + * not perform any storage engine writes. May delete internal tables, but this is not + * transactional. * - * Does not matter whether it is called inside of a WriteUnitOfWork. Will not be rolled - * back. - * - * Must be called from owning thread. + * This should only be used during shutdown or rollback. 
*/ void abortWithoutCleanup(OperationContext* opCtx); @@ -305,27 +274,6 @@ public: void setIndexBuildMethod(IndexBuildMethod indexBuildMethod); - /** - * State transitions: - * - * Uninitialized --> Running --> Committed - * | | ^ - * | | | - * \--------------+------> Aborted - * - * It is possible for abort() to skip intermediate states. For example, calling abort() when the - * index build has not been initialized will transition from Uninitialized directly to Aborted. - * - * In the case where we are in the midst of committing the WUOW for a successful commit() call, - * we may transition temporarily to Aborted before finally ending at Committed. See comments for - * MultiIndexBlock::abort(). - * - * For testing only. Callers should not have to query the state of the MultiIndexBlock directly. - */ - enum class State { kUninitialized, kRunning, kCommitted, kAborted }; - StringData toString(State state); - State getState_forTest() const; - private: struct IndexToBuild { std::unique_ptr<IndexBuildBlock> block; @@ -337,21 +285,6 @@ private: InsertDeleteOptions options; }; - /** - * Returns the current state. - */ - State _getState() const; - - /** - * Updates the current state to a non-Aborted state. - */ - void _setState(State newState); - - /** - * Updates the current state to Aborted with the given reason. - */ - void _setStateToAbortedIfNotCommitted(StringData reason); - // Is set during init() and ensures subsequent function calls act on the same Collection. boost::optional<UUID> _collectionUUID; @@ -363,8 +296,6 @@ private: bool _ignoreUnique = false; - bool _needToCleanup = true; - // Set to true when no work remains to be done, the object can safely destruct without leaving // incorrect state set anywhere. bool _buildIsCleanedUp = true; @@ -372,15 +303,5 @@ private: // A unique identifier associating this index build with a two-phase index build within a // replica set. 
boost::optional<UUID> _buildUUID; - - // Protects member variables of this class declared below. - mutable Mutex _mutex = MONGO_MAKE_LATCH("MultiIndexBlock::_mutex"); - - State _state = State::kUninitialized; - std::string _abortReason; }; - -// For unit tests that need to check MultiIndexBlock states. -// The ASSERT_*() macros use this function to print the value of 'state' when the predicate fails. -std::ostream& operator<<(std::ostream& os, const MultiIndexBlock::State& state); } // namespace mongo diff --git a/src/mongo/db/catalog/multi_index_block_test.cpp b/src/mongo/db/catalog/multi_index_block_test.cpp index 0b73737a991..3f0615b14aa 100644 --- a/src/mongo/db/catalog/multi_index_block_test.cpp +++ b/src/mongo/db/catalog/multi_index_block_test.cpp @@ -85,7 +85,6 @@ void MultiIndexBlockTest::tearDown() { auto service = getServiceContext(); repl::ReplicationCoordinator::set(service, {}); - _indexer->cleanUpAfterBuild(getOpCtx(), getCollection(), MultiIndexBlock::kNoopOnCleanUpFn); _indexer = {}; _collection = {}; @@ -109,17 +108,14 @@ MultiIndexBlock* MultiIndexBlockTest::getIndexer() const { TEST_F(MultiIndexBlockTest, CommitWithoutInsertingDocuments) { auto indexer = getIndexer(); - ASSERT_EQUALS(MultiIndexBlock::State::kUninitialized, indexer->getState_forTest()); auto specs = unittest::assertGet(indexer->init( getOpCtx(), getCollection(), std::vector<BSONObj>(), MultiIndexBlock::kNoopOnInitFn)); ASSERT_EQUALS(0U, specs.size()); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); ASSERT_OK(indexer->dumpInsertsFromBulk(getOpCtx())); ASSERT_OK(indexer->checkConstraints(getOpCtx())); - ASSERT_FALSE(indexer->isCommitted()); { WriteUnitOfWork wunit(getOpCtx()); ASSERT_OK(indexer->commit(getOpCtx(), @@ -128,24 +124,19 @@ TEST_F(MultiIndexBlockTest, CommitWithoutInsertingDocuments) { MultiIndexBlock::kNoopOnCommitFn)); wunit.commit(); } - ASSERT(indexer->isCommitted()); - ASSERT_EQUALS(MultiIndexBlock::State::kCommitted, 
indexer->getState_forTest()); } TEST_F(MultiIndexBlockTest, CommitAfterInsertingSingleDocument) { auto indexer = getIndexer(); - ASSERT_EQUALS(MultiIndexBlock::State::kUninitialized, indexer->getState_forTest()); auto specs = unittest::assertGet(indexer->init( getOpCtx(), getCollection(), std::vector<BSONObj>(), MultiIndexBlock::kNoopOnInitFn)); ASSERT_EQUALS(0U, specs.size()); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); ASSERT_OK(indexer->insert(getOpCtx(), {}, {})); ASSERT_OK(indexer->dumpInsertsFromBulk(getOpCtx())); ASSERT_OK(indexer->checkConstraints(getOpCtx())); - ASSERT_FALSE(indexer->isCommitted()); { WriteUnitOfWork wunit(getOpCtx()); ASSERT_OK(indexer->commit(getOpCtx(), @@ -154,11 +145,9 @@ TEST_F(MultiIndexBlockTest, CommitAfterInsertingSingleDocument) { MultiIndexBlock::kNoopOnCommitFn)); wunit.commit(); } - ASSERT(indexer->isCommitted()); // abort() should have no effect after the index build is committed. - indexer->abort("test"_sd); - ASSERT(indexer->isCommitted()); + indexer->abortIndexBuild(getOpCtx(), getCollection(), MultiIndexBlock::kNoopOnCleanUpFn); } TEST_F(MultiIndexBlockTest, AbortWithoutCleanupAfterInsertingSingleDocument) { @@ -168,94 +157,6 @@ TEST_F(MultiIndexBlockTest, AbortWithoutCleanupAfterInsertingSingleDocument) { ASSERT_EQUALS(0U, specs.size()); ASSERT_OK(indexer->insert(getOpCtx(), {}, {})); indexer->abortWithoutCleanup(getOpCtx()); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_FALSE(indexer->isCommitted()); -} - -TEST_F(MultiIndexBlockTest, InitFailsAfterAbort) { - auto indexer = getIndexer(); - ASSERT_EQUALS(MultiIndexBlock::State::kUninitialized, indexer->getState_forTest()); - - indexer->abort("test"_sd); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_EQUALS( - ErrorCodes::IndexBuildAborted, - indexer - ->init( - getOpCtx(), getCollection(), std::vector<BSONObj>(), MultiIndexBlock::kNoopOnInitFn) - 
.getStatus()); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_FALSE(indexer->isCommitted()); -} - -TEST_F(MultiIndexBlockTest, InsertingSingleDocumentFailsAfterAbort) { - auto indexer = getIndexer(); - ASSERT_EQUALS(MultiIndexBlock::State::kUninitialized, indexer->getState_forTest()); - - auto specs = unittest::assertGet(indexer->init( - getOpCtx(), getCollection(), std::vector<BSONObj>(), MultiIndexBlock::kNoopOnInitFn)); - ASSERT_EQUALS(0U, specs.size()); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); - - indexer->abort("test"_sd); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_EQUALS(ErrorCodes::IndexBuildAborted, - indexer->insert(getOpCtx(), BSON("_id" << 123 << "a" << 456), {})); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_FALSE(indexer->isCommitted()); -} - -TEST_F(MultiIndexBlockTest, DumpInsertsFromBulkFailsAfterAbort) { - auto indexer = getIndexer(); - ASSERT_EQUALS(MultiIndexBlock::State::kUninitialized, indexer->getState_forTest()); - - auto specs = unittest::assertGet(indexer->init( - getOpCtx(), getCollection(), std::vector<BSONObj>(), MultiIndexBlock::kNoopOnInitFn)); - ASSERT_EQUALS(0U, specs.size()); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); - - ASSERT_OK(indexer->insert(getOpCtx(), BSON("_id" << 123 << "a" << 456), {})); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); - - indexer->abort("test"_sd); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_EQUALS(ErrorCodes::IndexBuildAborted, indexer->dumpInsertsFromBulk(getOpCtx())); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_FALSE(indexer->isCommitted()); -} - -TEST_F(MultiIndexBlockTest, CommitFailsAfterAbort) { - auto indexer = getIndexer(); - 
ASSERT_EQUALS(MultiIndexBlock::State::kUninitialized, indexer->getState_forTest()); - - auto specs = unittest::assertGet(indexer->init( - getOpCtx(), getCollection(), std::vector<BSONObj>(), MultiIndexBlock::kNoopOnInitFn)); - ASSERT_EQUALS(0U, specs.size()); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); - - ASSERT_OK(indexer->insert(getOpCtx(), BSON("_id" << 123 << "a" << 456), {})); - ASSERT_EQUALS(MultiIndexBlock::State::kRunning, indexer->getState_forTest()); - - ASSERT_OK(indexer->dumpInsertsFromBulk(getOpCtx())); - - indexer->abort("test"_sd); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_EQUALS(ErrorCodes::IndexBuildAborted, - indexer->commit(getOpCtx(), - getCollection(), - MultiIndexBlock::kNoopOnCreateEachFn, - MultiIndexBlock::kNoopOnCommitFn)); - ASSERT_EQUALS(MultiIndexBlock::State::kAborted, indexer->getState_forTest()); - - ASSERT_FALSE(indexer->isCommitted()); } } // namespace diff --git a/src/mongo/db/cloner.cpp b/src/mongo/db/cloner.cpp index 1a45051af97..d7ec0c03de7 100644 --- a/src/mongo/db/cloner.cpp +++ b/src/mongo/db/cloner.cpp @@ -393,10 +393,6 @@ void Cloner::copyIndexes(OperationContext* opCtx, // implementations anyway as long as that is supported. MultiIndexBlock indexer; - // The code below throws, so ensure build cleanup occurs. - ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); }); - // Emit startIndexBuild and commitIndexBuild oplog entries if supported by the current FCV. auto opObserver = opCtx->getServiceContext()->getOpObserver(); auto fromMigrate = false; @@ -436,6 +432,11 @@ void Cloner::copyIndexes(OperationContext* opCtx, } auto indexInfoObjs = uassertStatusOK(indexer.init(opCtx, collection, indexesToBuild, onInitFn)); + + // The code below throws, so ensure build cleanup occurs. 
+ auto abortOnExit = makeGuard( + [&] { indexer.abortIndexBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); }); + uassertStatusOK(indexer.insertAllDocumentsInCollection(opCtx, collection)); uassertStatusOK(indexer.checkConstraints(opCtx)); @@ -463,6 +464,7 @@ void Cloner::copyIndexes(OperationContext* opCtx, } })); wunit.commit(); + abortOnExit.dismiss(); } bool Cloner::copyCollection(OperationContext* opCtx, diff --git a/src/mongo/db/commands/create_indexes.cpp b/src/mongo/db/commands/create_indexes.cpp index 6bfe04cafab..0ef519bb542 100644 --- a/src/mongo/db/commands/create_indexes.cpp +++ b/src/mongo/db/commands/create_indexes.cpp @@ -79,6 +79,7 @@ MONGO_FAIL_POINT_DEFINE(createIndexesWriteConflict); // collection is created. MONGO_FAIL_POINT_DEFINE(hangBeforeCreateIndexesCollectionCreate); MONGO_FAIL_POINT_DEFINE(hangBeforeIndexBuildAbortOnInterrupt); +MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildAbort); // This failpoint hangs between logging the index build UUID and starting the index build // through the IndexBuildsCoordinator. @@ -600,93 +601,80 @@ bool runCreateIndexesWithCoordinator(OperationContext* opCtx, opCtx, dbname, *collectionUUID, specs, buildUUID, protocol, indexBuildOptions)); auto deadline = opCtx->getDeadline(); - // Date_t::max() means no deadline. - if (deadline == Date_t::max()) { - LOGV2(20439, - "Waiting for index build to complete: {buildUUID}", - "buildUUID"_attr = buildUUID); - } else { - LOGV2(20440, - "Waiting for index build to complete: {buildUUID} (deadline: {deadline})", - "buildUUID"_attr = buildUUID, - "deadline"_attr = deadline); - } + LOGV2(20440, + "Waiting for index build to complete", + "buildUUID"_attr = buildUUID, + "deadline"_attr = deadline); // Throws on error. 
try { stats = buildIndexFuture.get(opCtx); } catch (const ExceptionForCat<ErrorCategory::Interruption>& interruptionEx) { LOGV2(20441, - "Index build interrupted: {buildUUID}: {interruptionEx}", + "Index build received interrupt signal", "buildUUID"_attr = buildUUID, - "interruptionEx"_attr = interruptionEx); + "signal"_attr = interruptionEx); hangBeforeIndexBuildAbortOnInterrupt.pauseWhileSet(); - boost::optional<Lock::GlobalLock> globalLock; if (IndexBuildProtocol::kTwoPhase == protocol) { // If this node is no longer a primary, the index build will continue to run in the // background and will complete when this node receives a commitIndexBuild oplog // entry from the new primary. if (ErrorCodes::InterruptedDueToReplStateChange == interruptionEx.code()) { LOGV2(20442, - "Index build continuing in background: {buildUUID}", + "Index build ignoring interrupt and continuing in background", "buildUUID"_attr = buildUUID); throw; } - - // If we are using two-phase index builds and are no longer primary after receiving - // an interrupt, we cannot replicate an abortIndexBuild oplog entry. Rely on the new - // primary to finish the index build. Acquire the global lock to check the - // replication state and to prevent any state transitions from happening while - // aborting the index build. - UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - globalLock.emplace(opCtx, MODE_IS); - if (!repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, ns)) { - uassertStatusOK( - {ErrorCodes::NotMaster, - str::stream() - << "Unable to abort index build because we are no longer primary: " - << buildUUID}); - } } // It is unclear whether the interruption originated from the current opCtx instance // for the createIndexes command or that the IndexBuildsCoordinator task was interrupted // independently of this command invocation. 
We'll defensively abort the index build // with the assumption that if the index build was already in the midst of tearing down, - // this be a no-op. - // Use a null abort timestamp because the index build will generate its own timestamp - // on cleanup. - indexBuildsCoord->abortIndexBuildOnError(opCtx, buildUUID, interruptionEx.toStatus()); - LOGV2(20443, "Index build aborted: {buildUUID}", "buildUUID"_attr = buildUUID); - + // this will be a no-op. + { + // The current OperationContext may be interrupted, which would prevent us from + // taking locks. Use a new OperationContext to abort the index build. + auto newClient = opCtx->getServiceContext()->makeClient("abort-index-build"); + AlternativeClientRegion acr(newClient); + const auto abortCtx = cc().makeOperationContext(); + + std::string abortReason(str::stream() << "Index build aborted: " << buildUUID + << ": " << interruptionEx.toString()); + indexBuildsCoord->abortIndexBuildByBuildUUID( + abortCtx.get(), buildUUID, IndexBuildAction::kPrimaryAbort, abortReason); + LOGV2( + 20443, "Index build aborted due to interruption", "buildUUID"_attr = buildUUID); + } throw; } catch (const ExceptionForCat<ErrorCategory::NotMasterError>& ex) { LOGV2(20444, - "Index build interrupted due to change in replication state: {buildUUID}: {ex}", + "Index build received interrupt signal due to change in replication state", "buildUUID"_attr = buildUUID, "ex"_attr = ex); - // The index build will continue to run in the background and will complete when this - // node receives a commitIndexBuild oplog entry from the new primary. - + // If this node is no longer a primary, the index build will continue to run in the + // background and will complete when this node receives a commitIndexBuild oplog + // entry from the new primary. 
if (IndexBuildProtocol::kTwoPhase == protocol) { LOGV2(20445, - "Index build continuing in background: {buildUUID}", + "Index build ignoring interrupt and continuing in background", "buildUUID"_attr = buildUUID); throw; } - indexBuildsCoord->abortIndexBuildOnError(opCtx, buildUUID, ex.toStatus()); - LOGV2(20446, - "Index build aborted due to NotMaster error: {buildUUID}", - "buildUUID"_attr = buildUUID); - + std::string abortReason(str::stream() << "Index build aborted: " << buildUUID << ": " + << ex.toString()); + indexBuildsCoord->abortIndexBuildByBuildUUID( + opCtx, buildUUID, IndexBuildAction::kPrimaryAbort, abortReason); + LOGV2( + 20446, "Index build aborted due to NotMaster error", "buildUUID"_attr = buildUUID); throw; } - LOGV2(20447, "Index build completed: {buildUUID}", "buildUUID"_attr = buildUUID); + LOGV2(20447, "Index build completed", "buildUUID"_attr = buildUUID); } catch (DBException& ex) { // If the collection is dropped after the initial checks in this function (before the // AutoStatsTracker is created), the IndexBuildsCoordinator (either startIndexBuild() or @@ -703,10 +691,7 @@ bool runCreateIndexesWithCoordinator(OperationContext* opCtx, } // All other errors should be forwarded to the caller with index build information included. - LOGV2(20449, - "Index build failed: {buildUUID}: {ex_toStatus}", - "buildUUID"_attr = buildUUID, - "ex_toStatus"_attr = ex.toStatus()); + LOGV2(20449, "Index build failed", "buildUUID"_attr = buildUUID, "ex"_attr = ex.toStatus()); ex.addContext(str::stream() << "Index build failed: " << buildUUID << ": Collection " << ns << " ( " << *collectionUUID << " )"); @@ -770,6 +755,7 @@ public: try { return runCreateIndexesWithCoordinator(opCtx, dbname, cmdObj, errmsg, result); } catch (const DBException& ex) { + hangAfterIndexBuildAbort.pauseWhileSet(); // We can only wait for an existing index build to finish if we are able to release // our locks, in order to allow the existing index build to proceed. 
We cannot // release locks in transactions, so we bypass the below logic in transactions. diff --git a/src/mongo/db/commands/drop_indexes.cpp b/src/mongo/db/commands/drop_indexes.cpp index e924becf9f4..38266a3cf48 100644 --- a/src/mongo/db/commands/drop_indexes.cpp +++ b/src/mongo/db/commands/drop_indexes.cpp @@ -213,22 +213,20 @@ public: StatusWith<std::vector<BSONObj>> swIndexesToRebuild(ErrorCodes::UnknownError, "Uninitialized"); - // The 'indexer' can throw, so ensure build cleanup occurs. - ON_BLOCK_EXIT([&] { - indexer->cleanUpAfterBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); + writeConflictRetry(opCtx, "dropAllIndexes", toReIndexNss.ns(), [&] { + WriteUnitOfWork wunit(opCtx); + collection->getIndexCatalog()->dropAllIndexes(opCtx, true); + + swIndexesToRebuild = + indexer->init(opCtx, collection, all, MultiIndexBlock::kNoopOnInitFn); + uassertStatusOK(swIndexesToRebuild.getStatus()); + wunit.commit(); }); - { - writeConflictRetry(opCtx, "dropAllIndexes", toReIndexNss.ns(), [&] { - WriteUnitOfWork wunit(opCtx); - collection->getIndexCatalog()->dropAllIndexes(opCtx, true); - - swIndexesToRebuild = - indexer->init(opCtx, collection, all, MultiIndexBlock::kNoopOnInitFn); - uassertStatusOK(swIndexesToRebuild.getStatus()); - wunit.commit(); - }); - } + // The 'indexer' can throw, so ensure build cleanup occurs. 
+ auto abortOnExit = makeGuard([&] { + indexer->abortIndexBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); + }); if (MONGO_unlikely(reIndexCrashAfterDrop.shouldFail())) { LOGV2(20458, "exiting because 'reIndexCrashAfterDrop' fail point was set"); @@ -241,16 +239,15 @@ public: uassertStatusOK(indexer->checkConstraints(opCtx)); - { - writeConflictRetry(opCtx, "commitReIndex", toReIndexNss.ns(), [&] { - WriteUnitOfWork wunit(opCtx); - uassertStatusOK(indexer->commit(opCtx, - collection, - MultiIndexBlock::kNoopOnCreateEachFn, - MultiIndexBlock::kNoopOnCommitFn)); - wunit.commit(); - }); - } + writeConflictRetry(opCtx, "commitReIndex", toReIndexNss.ns(), [&] { + WriteUnitOfWork wunit(opCtx); + uassertStatusOK(indexer->commit(opCtx, + collection, + MultiIndexBlock::kNoopOnCreateEachFn, + MultiIndexBlock::kNoopOnCommitFn)); + wunit.commit(); + }); + abortOnExit.dismiss(); // Do not allow majority reads from this collection until all original indexes are visible. // This was also done when dropAllIndexes() committed, but we need to ensure that no one diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 31394c48e8c..ed2819c70de 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -68,6 +68,7 @@ using namespace indexbuildentryhelpers; MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildFirstDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildSecondDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildDumpsInsertsFromBulk); +MONGO_FAIL_POINT_DEFINE(hangAfterInitializingIndexBuild); namespace { @@ -147,9 +148,8 @@ bool shouldBuildIndexesOnEmptyCollectionSinglePhased(OperationContext* opCtx, /* * Determines whether to skip the index build state transition check. 
* Index builder not using ReplIndexBuildState::waitForNextAction to signal primary and secondaries - * to commit or abort signal will violate index build state transition i.e, they can move from - * prepareAbort to Committed and from Committed to prepareAbort. So, we should skip state transition - * verification. Otherwise, we would invariant. + * to commit or abort signal will violate index build state transition. So, we should skip state + * transition verification. Otherwise, we would invariant. */ bool shouldSkipIndexBuildStateTransitionCheck(OperationContext* opCtx, IndexBuildProtocol protocol) { @@ -161,14 +161,23 @@ bool shouldSkipIndexBuildStateTransitionCheck(OperationContext* opCtx, } /** - * Signal downstream secondary nodes to commit index build. + * Replicates a commitIndexBuild oplog entry for two-phase builds, which signals downstream + * secondary nodes to commit the index build. */ void onCommitIndexBuild(OperationContext* opCtx, const NamespaceString& nss, - ReplIndexBuildState& replState, - bool replSetAndNotPrimaryAtStart) { + ReplIndexBuildState& replState) { const auto& buildUUID = replState.buildUUID; + auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState.protocol); + { + stdx::unique_lock<Latch> lk(replState.mutex); + replState.indexBuildState.setState(IndexBuildState::kCommitted, skipCheck); + } + if (IndexBuildProtocol::kSinglePhase == replState.protocol) { + return; + } + invariant(IndexBuildProtocol::kTwoPhase == replState.protocol, str::stream() << "onCommitIndexBuild: " << buildUUID); invariant(opCtx->lockState()->isWriteLocked(), @@ -178,19 +187,10 @@ void onCommitIndexBuild(OperationContext* opCtx, const auto& collUUID = replState.collectionUUID; const auto& indexSpecs = replState.indexSpecs; auto fromMigrate = false; - auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState.protocol); - { - stdx::unique_lock<Latch> lk(replState.mutex); - 
replState.indexBuildState.setState(IndexBuildState::kCommitted, skipCheck); - } // Since two phase index builds are allowed to survive replication state transitions, we should // check if the node is currently a primary before attempting to write to the oplog. auto replCoord = repl::ReplicationCoordinator::get(opCtx); - if (!replCoord->getSettings().usingReplSets()) { - return; - } - if (!replCoord->canAcceptWritesFor(opCtx, nss)) { invariant(!opCtx->recoveryUnit()->getCommitTimestamp().isNull(), str::stream() << "commitIndexBuild: " << buildUUID); @@ -201,7 +201,8 @@ void onCommitIndexBuild(OperationContext* opCtx, } /** - * Signal downstream secondary nodes to abort index build. + * Replicates an abortIndexBuild oplog entry for two-phase builds, which signals downstream + * secondary nodes to abort the index build. */ void onAbortIndexBuild(OperationContext* opCtx, const NamespaceString& nss, @@ -216,58 +217,18 @@ void onAbortIndexBuild(OperationContext* opCtx, auto opObserver = opCtx->getServiceContext()->getOpObserver(); auto collUUID = replState.collectionUUID; auto fromMigrate = false; - auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState.protocol); - { - stdx::unique_lock<Latch> lk(replState.mutex); - replState.indexBuildState.setState(IndexBuildState::kAborted, skipCheck); - } opObserver->onAbortIndexBuild( opCtx, nss, collUUID, replState.buildUUID, replState.indexSpecs, cause, fromMigrate); } /** - * Aborts the index build identified by the provided 'replIndexBuildState'. - * It gets called by drop database/collection/index command. 
- */ -void abortIndexBuild(WithLock lk, - OperationContext* opCtx, - IndexBuildsManager* indexBuildsManager, - std::shared_ptr<ReplIndexBuildState> replIndexBuildState, - const std::string& reason) { - stdx::unique_lock<Latch> replStateLock(replIndexBuildState->mutex); - if (replIndexBuildState->waitForNextAction->getFuture().isReady()) { - const auto nextAction = replIndexBuildState->waitForNextAction->getFuture().get(); - invariant(nextAction == IndexBuildAction::kSinglePhaseCommit || - nextAction == IndexBuildAction::kCommitQuorumSatisfied || - nextAction == IndexBuildAction::kPrimaryAbort); - // Index build coordinator already received a signal to commit or abort. So, it's ok - // to return and wait for the index build to complete. The index build coordinator - // will not perform the signaled action (i.e, will not commit or abort the index build) - // only when the node steps down. When the node steps down, the caller of this function, - // drop commands (user operation) will also get interrupted. So, we no longer need to - // abort the index build on step down. - return; - } - - auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replIndexBuildState->protocol); - // Set the state on replIndexBuildState and indexBuildsManager. And, then signal the value. It's - // important we do all these 3 things in a critical section by holding mutex lock. - replIndexBuildState->indexBuildState.setState( - IndexBuildState::kPrepareAbort, skipCheck, boost::none, reason); - indexBuildsManager->abortIndexBuild(replIndexBuildState->buildUUID, reason); - - IndexBuildsCoordinator::get(opCtx)->setSignalAndCancelVoteRequestCbkIfActive( - replStateLock, opCtx, replIndexBuildState, IndexBuildAction::kPrimaryAbort); -} - -/** * We do not need synchronization with step up and step down. Dropping the RSTL is important because * otherwise if we held the RSTL it would create deadlocks with prepared transactions on step up and * step down. 
A deadlock could result if the index build was attempting to acquire a Collection S * or X lock while a prepared transaction held a Collection IX lock, and a step down was waiting to * acquire the RSTL in mode X. */ -void unlockRSTLForIndexCleanup(OperationContext* opCtx) { +void unlockRSTL(OperationContext* opCtx) { opCtx->lockState()->unlockRSTLforPrepare(); invariant(!opCtx->lockState()->isRSTLLocked()); } @@ -562,92 +523,35 @@ void IndexBuildsCoordinator::waitForAllIndexBuildsToStopForShutdown(OperationCon _indexBuildsCondVar.wait(lk, pred); } -std::vector<UUID> IndexBuildsCoordinator::_abortCollectionIndexBuilds(stdx::unique_lock<Latch>& lk, - OperationContext* opCtx, - const UUID& collectionUUID, - const std::string& reason, - bool shouldWait) { - auto indexBuildFilter = [=](const auto& replState) { - return collectionUUID == replState.collectionUUID; - }; - auto collIndexBuilds = _filterIndexBuilds_inlock(lk, indexBuildFilter); - if (collIndexBuilds.empty()) { - return {}; - } - +std::vector<UUID> IndexBuildsCoordinator::abortCollectionIndexBuilds( + OperationContext* opCtx, + const NamespaceString collectionNss, + const UUID collectionUUID, + const std::string& reason) { LOGV2(23879, - "About to abort all index builders on collection with UUID: {collectionUUID}", - "collectionUUID"_attr = collectionUUID); + "About to abort all index builders on collection", + "collection"_attr = collectionNss, + "collectionUUID"_attr = collectionUUID, + "reason"_attr = reason); + + auto collIndexBuilds = [&]() -> std::vector<std::shared_ptr<ReplIndexBuildState>> { + stdx::unique_lock<Latch> lk(_mutex); + auto indexBuildFilter = [=](const auto& replState) { + return collectionUUID == replState.collectionUUID; + }; + return _filterIndexBuilds_inlock(lk, indexBuildFilter); + }(); std::vector<UUID> buildUUIDs; for (auto replState : collIndexBuilds) { - abortIndexBuild(lk, opCtx, &_indexBuildsManager, replState, reason); - buildUUIDs.push_back(replState->buildUUID); - } - - if 
(!shouldWait) { - return buildUUIDs; + if (abortIndexBuildByBuildUUID( + opCtx, replState->buildUUID, IndexBuildAction::kPrimaryAbort, reason)) { + buildUUIDs.push_back(replState->buildUUID); + } } - - _awaitNoIndexBuildInProgressForCollection(lk, opCtx, collectionUUID); - return buildUUIDs; } -void IndexBuildsCoordinator::_awaitNoIndexBuildInProgressForCollection(stdx::unique_lock<Latch>& lk, - OperationContext* opCtx, - const UUID& collectionUUID) { - auto indexBuildFilter = [=](const auto& replState) { - return collectionUUID == replState.collectionUUID; - }; - auto pred = [&, this]() { - auto collIndexBuilds = _filterIndexBuilds_inlock(lk, indexBuildFilter); - return collIndexBuilds.empty(); - }; - _indexBuildsCondVar.wait(lk, pred); -} - -void IndexBuildsCoordinator::abortCollectionIndexBuilds(OperationContext* opCtx, - const UUID& collectionUUID, - const std::string& reason) { - stdx::unique_lock<Latch> lk(_mutex); - const bool shouldWait = true; - _abortCollectionIndexBuilds(lk, opCtx, collectionUUID, reason, shouldWait); -} - -std::vector<UUID> IndexBuildsCoordinator::abortCollectionIndexBuildsNoWait( - OperationContext* opCtx, const UUID& collectionUUID, const std::string& reason) { - stdx::unique_lock<Latch> lk(_mutex); - const bool shouldWait = false; - return _abortCollectionIndexBuilds(lk, opCtx, collectionUUID, reason, shouldWait); -} - -void IndexBuildsCoordinator::_abortDatabaseIndexBuilds(stdx::unique_lock<Latch>& lk, - OperationContext* opCtx, - const StringData& db, - const std::string& reason, - bool shouldWait) { - auto indexBuildFilter = [db](const auto& replState) { return db == replState.dbName; }; - auto dbIndexBuilds = _filterIndexBuilds_inlock(lk, indexBuildFilter); - if (dbIndexBuilds.empty()) { - return; - } - - LOGV2(4612302, - "About to abort all index builders running for collections in the given database", - "database"_attr = db); - - for (auto replState : dbIndexBuilds) { - abortIndexBuild(lk, opCtx, &_indexBuildsManager, 
replState, reason); - } - - if (!shouldWait) { - return; - } - - _awaitNoBgOpInProgForDb(lk, opCtx, db); -} - void IndexBuildsCoordinator::_awaitNoBgOpInProgForDb(stdx::unique_lock<Latch>& lk, OperationContext* opCtx, StringData db) { @@ -662,17 +566,20 @@ void IndexBuildsCoordinator::_awaitNoBgOpInProgForDb(stdx::unique_lock<Latch>& l void IndexBuildsCoordinator::abortDatabaseIndexBuilds(OperationContext* opCtx, StringData db, const std::string& reason) { - stdx::unique_lock<Latch> lk(_mutex); - const bool shouldWait = true; - _abortDatabaseIndexBuilds(lk, opCtx, db, reason, shouldWait); -} + LOGV2(4612302, + "About to abort all index builders running for collections in the given database", + "database"_attr = db, + "reason"_attr = reason); -void IndexBuildsCoordinator::abortDatabaseIndexBuildsNoWait(OperationContext* opCtx, - StringData db, - const std::string& reason) { - stdx::unique_lock<Latch> lk(_mutex); - const bool shouldWait = false; - _abortDatabaseIndexBuilds(lk, opCtx, db, reason, shouldWait); + auto builds = [&]() -> std::vector<std::shared_ptr<ReplIndexBuildState>> { + stdx::unique_lock<Latch> lk(_mutex); + auto indexBuildFilter = [=](const auto& replState) { return db == replState.dbName; }; + return _filterIndexBuilds_inlock(lk, indexBuildFilter); + }(); + for (auto replState : builds) { + abortIndexBuildByBuildUUID( + opCtx, replState->buildUUID, IndexBuildAction::kPrimaryAbort, reason); + } } namespace { @@ -765,43 +672,51 @@ void IndexBuildsCoordinator::applyCommitIndexBuild(OperationContext* opCtx, auto replState = uassertStatusOK(indexBuildsCoord->_getIndexBuild(buildUUID)); - while (true) { - stdx::unique_lock<Latch> lk(replState->mutex); - if (replState->waitForNextAction->getFuture().isReady()) { - // If future wait is made uninterruptible, then the shutdown can stuck behind - // oplog applier if the indexBuildCoordinator thread died after interruption on - // shutdown. 
And, commitIndexBuild oplog entry will stuck waiting for reset of the - // promise. - const auto nextAction = replState->waitForNextAction->getFuture().get(opCtx); - invariant(nextAction == IndexBuildAction::kCommitQuorumSatisfied); - // Retry until the current promise result is consumed by the index builder thread and - // a new empty promise got created by the indexBuildscoordinator thread. - // Don't hammer it. - sleepmillis(1); - continue; - } - auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState->protocol); - replState->indexBuildState.setState(IndexBuildState::kPrepareCommit, - skipCheck, - opCtx->recoveryUnit()->getCommitTimestamp()); - // Promise can be set only once. - // We can't skip signaling here if a signal is already set because the previous commit or - // abort signal might have been sent to handle for primary case. - setSignalAndCancelVoteRequestCbkIfActive( - lk, opCtx, replState, IndexBuildAction::kOplogCommit); - break; + // Retry until we are able to put the index build in the kPrepareCommit state. None of the + // conditions for retrying are common or expected to be long-lived, so we believe this to be + // safe to poll at this frequency. + while (!_tryCommit(opCtx, replState)) { + opCtx->sleepFor(Milliseconds(100)); } auto fut = replState->sharedPromise.getFuture(); LOGV2(20654, - "Index build joined after commit: {buildUUID}: {fut_waitNoThrow_opCtx}", + "Index build joined after commit", "buildUUID"_attr = buildUUID, - "fut_waitNoThrow_opCtx"_attr = fut.waitNoThrow(opCtx)); + "result"_attr = fut.waitNoThrow(opCtx)); // Throws if there was an error building the index. fut.get(); } +bool IndexBuildsCoordinator::_tryCommit(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState) { + stdx::unique_lock<Latch> lk(replState->mutex); + if (replState->indexBuildState.isSettingUp()) { + // It's possible that the index build thread has not reached the point where it can be + // committed yet. 
+ return false; + } + if (replState->waitForNextAction->getFuture().isReady()) { + // If the future wait were uninterruptible, then shutdown could hang. If the + // IndexBuildsCoordinator thread gets interrupted on shutdown, the oplog applier will hang + // waiting for the promise applying the commitIndexBuild oplog entry. + const auto nextAction = replState->waitForNextAction->getFuture().get(opCtx); + invariant(nextAction == IndexBuildAction::kCommitQuorumSatisfied); + // Retry until the current promise result is consumed by the index builder thread and + // a new empty promise got created by the indexBuildscoordinator thread. + return false; + } + auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState->protocol); + replState->indexBuildState.setState( + IndexBuildState::kPrepareCommit, skipCheck, opCtx->recoveryUnit()->getCommitTimestamp()); + // Promise can be set only once. + // We can't skip signaling here if a signal is already set because the previous commit or + // abort signal might have been sent to handle for primary case. 
+ setSignalAndCancelVoteRequestCbkIfActive(lk, opCtx, replState, IndexBuildAction::kOplogCommit); + return true; +} + void IndexBuildsCoordinator::applyAbortIndexBuild(OperationContext* opCtx, const IndexBuildOplogEntry& oplogEntry) { const auto collUUID = oplogEntry.collUUID; @@ -820,50 +735,15 @@ void IndexBuildsCoordinator::applyAbortIndexBuild(OperationContext* opCtx, std::string abortReason(str::stream() << "abortIndexBuild oplog entry encountered: " << *oplogEntry.cause); auto indexBuildsCoord = IndexBuildsCoordinator::get(opCtx); - indexBuildsCoord->abortIndexBuildByBuildUUID(opCtx, - buildUUID, - IndexBuildAction::kOplogAbort, - opCtx->recoveryUnit()->getCommitTimestamp(), - abortReason); -} - -void IndexBuildsCoordinator::abortIndexBuildOnError(OperationContext* opCtx, - const UUID& buildUUID, - Status abortStatus) { - // Use a null abort timestamp because the index build will generate a ghost timestamp - // for the single-phase build on cleanup. - std::string abortReason(str::stream() << "Index build interrupted: " << buildUUID << ": " - << abortStatus.toString()); - abortIndexBuildByBuildUUIDNoWait( - opCtx, buildUUID, IndexBuildAction::kPrimaryAbort, boost::none, abortReason); -} - -void IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx, - const UUID& buildUUID, - IndexBuildAction signalAction, - boost::optional<Timestamp> abortTimestamp, - boost::optional<std::string> reason) { - if (!abortIndexBuildByBuildUUIDNoWait(opCtx, buildUUID, signalAction, abortTimestamp, reason)) { - return; - } - - auto replState = - invariant(_getIndexBuild(buildUUID), - str::stream() << "Abort timestamp: " - << abortTimestamp.get_value_or(Timestamp()).toString()); - - auto fut = replState->sharedPromise.getFuture(); - LOGV2(20655, - "Index build joined after abort: {buildUUID}: {fut_waitNoThrow}", - "buildUUID"_attr = buildUUID, - "fut_waitNoThrow"_attr = fut.waitNoThrow()); + indexBuildsCoord->abortIndexBuildByBuildUUID( + opCtx, buildUUID, 
IndexBuildAction::kOplogAbort, abortReason); } -boost::optional<UUID> IndexBuildsCoordinator::abortIndexBuildByIndexNamesNoWait( +boost::optional<UUID> IndexBuildsCoordinator::abortIndexBuildByIndexNames( OperationContext* opCtx, const UUID& collectionUUID, const std::vector<std::string>& indexNames, - boost::optional<std::string> reason) { + std::string reason) { boost::optional<UUID> buildUUID; auto indexBuilds = _getIndexBuilds(); auto onIndexBuild = [&](std::shared_ptr<ReplIndexBuildState> replState) { @@ -886,17 +766,13 @@ boost::optional<UUID> IndexBuildsCoordinator::abortIndexBuildByIndexNamesNoWait( "collectionUUID"_attr = collectionUUID, "replState_indexNames_front"_attr = replState->indexNames.front()); - if (this->abortIndexBuildByBuildUUIDNoWait(opCtx, - replState->buildUUID, - IndexBuildAction::kPrimaryAbort, - boost::none, - reason)) { + if (abortIndexBuildByBuildUUID( + opCtx, replState->buildUUID, IndexBuildAction::kPrimaryAbort, reason)) { buildUUID = replState->buildUUID; } }; - forEachIndexBuild(indexBuilds, - "IndexBuildsCoordinator::abortIndexBuildByIndexNamesNoWait - "_sd, - onIndexBuild); + forEachIndexBuild( + indexBuilds, "IndexBuildsCoordinator::abortIndexBuildByIndexNames - "_sd, onIndexBuild); return buildUUID; } @@ -925,14 +801,80 @@ bool IndexBuildsCoordinator::hasIndexBuilder(OperationContext* opCtx, return foundIndexBuilder; } -bool IndexBuildsCoordinator::abortIndexBuildByBuildUUIDNoWait( +IndexBuildsCoordinator::TryAbortResult IndexBuildsCoordinator::_tryAbort( OperationContext* opCtx, - const UUID& buildUUID, + std::shared_ptr<ReplIndexBuildState> replState, IndexBuildAction signalAction, - boost::optional<Timestamp> abortTimestamp, - boost::optional<std::string> reason) { - // We need to avoid race between commit and abort index build. + std::string reason) { + + { + stdx::unique_lock<Latch> lk(replState->mutex); + // Wait until the build is done setting up. 
This indicates that all required state is + // initialized to attempt an abort. + if (replState->indexBuildState.isSettingUp()) { + LOGV2_DEBUG(465605, + 2, + "waiting until index build is done setting up before attempting to abort", + "buildUUID"_attr = replState->buildUUID); + return TryAbortResult::kRetry; + } + if (replState->waitForNextAction->getFuture().isReady()) { + const auto nextAction = replState->waitForNextAction->getFuture().get(opCtx); + invariant(nextAction == IndexBuildAction::kSinglePhaseCommit || + nextAction == IndexBuildAction::kCommitQuorumSatisfied || + nextAction == IndexBuildAction::kPrimaryAbort); + + // Index build coordinator already received a signal to commit or abort. So, it's ok + // to return and wait for the index build to complete if we are trying to signal + // 'kPrimaryAbort'. The index build coordinator will not perform the signaled action + // (i.e, will not commit or abort the index build) only when the node steps down. + // When the node steps down, the caller of this function, dropIndexes/createIndexes + // command (user operation) will also get interrupted. So, we no longer need to + // abort the index build on step down. + if (signalAction == IndexBuildAction::kPrimaryAbort) { + // Indicate if the index build is already being committed or aborted. + if (nextAction == IndexBuildAction::kPrimaryAbort) { + return TryAbortResult::kAlreadyAborted; + } else { + return TryAbortResult::kNotAborted; + } + } + + // Retry until the current promise result is consumed by the index builder thread + // and a new empty promise got created by the indexBuildscoordinator thread. Or, + // until the index build got torn down after index build commit. + return TryAbortResult::kRetry; + } + + LOGV2(4656003, "aborting index build", "buildUUID"_attr = replState->buildUUID); + + // Set the state on replState. Once set, the calling thread must complete the abort process. 
+ auto abortTimestamp = + boost::make_optional<Timestamp>(!opCtx->recoveryUnit()->getCommitTimestamp().isNull(), + opCtx->recoveryUnit()->getCommitTimestamp()); + auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState->protocol); + replState->indexBuildState.setState( + IndexBuildState::kAborted, skipCheck, abortTimestamp, reason); + setSignalAndCancelVoteRequestCbkIfActive(lk, opCtx, replState, signalAction); + } + return TryAbortResult::kContinueAbort; +} + +bool IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx, + const UUID& buildUUID, + IndexBuildAction signalAction, + std::string reason) { + std::shared_ptr<ReplIndexBuildState> replState; + bool retry = false; while (true) { + // Retry until we are able to put the index build into the kAborted state. None of the + // conditions for retrying are common or expected to be long-lived, so we believe this to be + // safe to poll at this frequency. + if (retry) { + opCtx->sleepFor(Milliseconds(1000)); + retry = false; + } + // It is possible to receive an abort for a non-existent index build. Abort should always // succeed, so suppress the error. 
auto replStateResult = _getIndexBuild(buildUUID); @@ -945,55 +887,181 @@ bool IndexBuildsCoordinator::abortIndexBuildByBuildUUIDNoWait( return false; } - auto replState = replStateResult.getValue(); + replState = replStateResult.getValue(); + LOGV2(4656010, "attempting to abort index build", "buildUUID"_attr = replState->buildUUID); - stdx::unique_lock<Latch> lk(replState->mutex); - if (replState->waitForNextAction->getFuture().isReady()) { - const auto nextAction = replState->waitForNextAction->getFuture().get(opCtx); - invariant(nextAction == IndexBuildAction::kSinglePhaseCommit || - nextAction == IndexBuildAction::kCommitQuorumSatisfied || - nextAction == IndexBuildAction::kPrimaryAbort); + const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); + Lock::DBLock dbLock(opCtx, replState->dbName, MODE_IX); - // Index build coordinator already received a signal to commit or abort. So, it's ok - // to return and wait for the index build to complete if we are trying to signal - // 'kPrimaryAbort'. The index build coordinator will not perform the signaled action - // (i.e, will not commit or abort the index build) only when the node steps down. When - // the node steps down, the caller of this function, dropIndexes/createIndexes command - // (user operation) will also get interrupted. So, we no longer need to abort the index - // build on step down. - // - // Currently dropIndexes command calls this function with the - // collection lock held in IX mode, So, there are possibilities, we might block the - // index build from completing, leading to 3 way deadlocks involving step down, - // dropIndexes command, IndexBuildCoordinator thread. - if (signalAction == IndexBuildAction::kPrimaryAbort) { - // Only return true if the index build is being aborted already, not if it is going - // to commit. 
- return nextAction == IndexBuildAction::kPrimaryAbort; + if (IndexBuildProtocol::kSinglePhase == replState->protocol) { + // Unlock RSTL to avoid deadlocks with prepare conflicts and state transitions caused by + // taking a strong collection lock. See SERVER-42621. + unlockRSTL(opCtx); + } + Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_X); + + // If we are using two-phase index builds and are no longer primary after receiving an + // abort, we cannot replicate an abortIndexBuild oplog entry. Continue holding the RSTL to + // check the replication state and to prevent any state transitions from happening while + // aborting the index build. Once an index build is put into kAborted, the index builder + // thread will be torn down, and an oplog entry must be replicated. Single-phase builds do + // not have this restriction and may be aborted after a stepDown. + if (IndexBuildProtocol::kTwoPhase == replState->protocol) { + // The DBLock helper takes the RSTL implictly. + invariant(opCtx->lockState()->isRSTLLocked()); + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + if (IndexBuildAction::kPrimaryAbort == signalAction && + !replCoord->canAcceptWritesFor(opCtx, dbAndUUID)) { + uassertStatusOK({ErrorCodes::NotMaster, + str::stream() + << "Unable to abort index build because we are not primary: " + << buildUUID}); } + } - // Retry until the current promise result is consumed by the index builder thread and - // a new empty promise got created by the indexBuildscoordinator thread. Or, until the - // index build got torn down after index build commit. - // Don't hammer it. 
- sleepmillis(1); + auto tryAbortResult = _tryAbort(opCtx, replState, signalAction, reason); + switch (tryAbortResult) { + case TryAbortResult::kNotAborted: + return false; + case TryAbortResult::kAlreadyAborted: + return true; + case TryAbortResult::kRetry: + case TryAbortResult::kContinueAbort: + break; + } + + if (TryAbortResult::kRetry == tryAbortResult) { + retry = true; continue; } - auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState->protocol); - // Set the state on replState and _indexBuildsManager. And, then signal the value. It's - // important we do all these 3 things in a critical section by holding mutex lock. - replState->indexBuildState.setState( - IndexBuildState::kPrepareAbort, skipCheck, abortTimestamp, reason); - _indexBuildsManager.abortIndexBuild(buildUUID, reason.get_value_or("")); + invariant(TryAbortResult::kContinueAbort == tryAbortResult); - setSignalAndCancelVoteRequestCbkIfActive(lk, opCtx, replState, signalAction); + // At this point we must continue aborting the index build. + try { + _completeAbort(opCtx, replState, signalAction, {ErrorCodes::IndexBuildAborted, reason}); + } catch (const DBException& e) { + LOGV2_FATAL( + 4656011, + "Failed to abort index build after partially tearing-down index build state", + "buildUUID"_attr = replState->buildUUID, + "reason"_attr = e.toString()); + } + + { + stdx::unique_lock<Latch> lk(replState->mutex); + + // Interrupt the builder thread so that it can no longer acquire locks or make progress. 
+ auto serviceContext = opCtx->getServiceContext(); + auto target = serviceContext->getLockedClient(replState->opId); + if (!target) { + LOGV2_FATAL(4656001, + "Index builder thread did not appear to be running while aborting", + "buildUUID"_attr = replState->buildUUID, + "opId"_attr = replState->opId); + } + serviceContext->killOperation( + target, target->getOperationContext(), ErrorCodes::IndexBuildAborted); + } + + // Wait for the builder thread to receive the signal before unregistering. Don't release the + // Collection lock until this happens, guaranteeing the thread has stopped making progress + // and has exited. + auto fut = replState->sharedPromise.getFuture(); + LOGV2(20655, + "Index build thread exited", + "buildUUID"_attr = buildUUID, + "status"_attr = fut.waitNoThrow()); + + { + // Unregister last once we guarantee all other state has been cleaned up. + stdx::unique_lock<Latch> lk(_mutex); + _unregisterIndexBuild(lk, replState); + } break; } return true; } +void IndexBuildsCoordinator::_completeAbort(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + IndexBuildAction signalAction, + Status reason) { + auto coll = + CollectionCatalog::get(opCtx).lookupCollectionByUUID(opCtx, replState->collectionUUID); + auto nss = coll->ns(); + + switch (signalAction) { + // Replicates an abortIndexBuild oplog entry and deletes the index from the durable catalog. + case IndexBuildAction::kPrimaryAbort: { + auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, coll->ns(), *replState, reason); }; + _indexBuildsManager.abortIndexBuild(opCtx, coll, replState->buildUUID, onCleanUpFn); + break; + } + // Deletes the index from the durable catalog. + case IndexBuildAction::kOplogAbort: { + invariant(IndexBuildProtocol::kTwoPhase == replState->protocol); + _indexBuildsManager.abortIndexBuild( + opCtx, coll, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + break; + } + // No locks are required when aborting due to rollback. 
This performs no storage engine + // writes, only cleans up the remaining in-memory state. + case IndexBuildAction::kRollbackAbort: { + _indexBuildsManager.abortIndexBuildWithoutCleanup( + opCtx, coll, replState->buildUUID, reason.reason()); + break; + } + case IndexBuildAction::kNoAction: + case IndexBuildAction::kCommitQuorumSatisfied: + case IndexBuildAction::kOplogCommit: + case IndexBuildAction::kSinglePhaseCommit: + MONGO_UNREACHABLE; + } + + LOGV2(465611, "Cleaned up index build after abort. ", "buildUUID"_attr = replState->buildUUID); +} + +void IndexBuildsCoordinator::_completeSelfAbort(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + Status reason) { + _completeAbort(opCtx, replState, IndexBuildAction::kPrimaryAbort, reason); + { + auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState->protocol); + stdx::unique_lock<Latch> lk(replState->mutex); + replState->indexBuildState.setState(IndexBuildState::kAborted, skipCheck); + } + { + stdx::unique_lock<Latch> lk(_mutex); + _unregisterIndexBuild(lk, replState); + } +} + +void IndexBuildsCoordinator::_completeAbortForShutdown( + OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + Collection* collection) { + // Leave it as-if kill -9 happened. Startup recovery will restart the index build. + _indexBuildsManager.abortIndexBuildWithoutCleanup( + opCtx, collection, replState->buildUUID, "shutting down"); + + { + // Promise should be set at least once before it's getting destroyed. + stdx::unique_lock<Latch> lk(replState->mutex); + if (!replState->waitForNextAction->getFuture().isReady()) { + replState->waitForNextAction->emplaceValue(IndexBuildAction::kNoAction); + } + auto skipCheck = shouldSkipIndexBuildStateTransitionCheck(opCtx, replState->protocol); + replState->indexBuildState.setState(IndexBuildState::kAborted, skipCheck); + } + { + // This allows the builder thread to exit. 
+ stdx::unique_lock<Latch> lk(_mutex); + _unregisterIndexBuild(lk, replState); + } +} + std::size_t IndexBuildsCoordinator::getActiveIndexBuildCount(OperationContext* opCtx) { auto indexBuilds = _getIndexBuilds(); // We use forEachIndexBuild() to log basic details on the current index builds and don't intend @@ -1017,17 +1085,6 @@ void IndexBuildsCoordinator::onStepUp(OperationContext* opCtx) { return; } - { - stdx::unique_lock<Latch> lk(replState->mutex); - // After Sending the abort this might have stepped down and stepped back up. - if (replState->indexBuildState.isAbortPrepared() && - !replState->waitForNextAction->getFuture().isReady()) { - setSignalAndCancelVoteRequestCbkIfActive( - lk, opCtx, replState, IndexBuildAction::kPrimaryAbort); - return; - } - } - if (!_signalIfCommitQuorumNotEnabled(opCtx, replState)) { // This reads from system.indexBuilds collection to see if commit quorum got satisfied. _signalIfCommitQuorumIsSatisfied(opCtx, replState); @@ -1060,12 +1117,11 @@ IndexBuilds IndexBuildsCoordinator::stopIndexBuildsForRollback(OperationContext* } buildsStopped.insert({replState->buildUUID, aborted}); - // Leave abort timestamp as null. This will unblock the index build and allow it to - // complete without cleaning up. Subsequently, the rollback algorithm can decide how to - // undo the index build depending on the state of the oplog. - // Signals the kRollbackAbort and then waits for the thread to join. + // This will unblock the index build and allow it to complete without cleaning up. + // Subsequently, the rollback algorithm can decide how to undo the index build depending on + // the state of the oplog. Signals the kRollbackAbort and then waits for the thread to join. 
abortIndexBuildByBuildUUID( - opCtx, replState->buildUUID, IndexBuildAction::kRollbackAbort, boost::none, reason); + opCtx, replState->buildUUID, IndexBuildAction::kRollbackAbort, reason); }; forEachIndexBuild( indexBuilds, "IndexBuildsCoordinator::stopIndexBuildsForRollback - "_sd, onIndexBuild); @@ -1213,11 +1269,7 @@ void IndexBuildsCoordinator::createIndexes(OperationContext* opCtx, auto buildUUID = UUID::gen(); // Rest of this function can throw, so ensure the build cleanup occurs. - ON_BLOCK_EXIT([&] { - opCtx->recoveryUnit()->abandonSnapshot(); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - }); + ON_BLOCK_EXIT([&] { _indexBuildsManager.unregisterIndexBuild(buildUUID); }); auto onInitFn = MultiIndexBlock::makeTimestampedIndexOnInitFn(opCtx, collection); IndexBuildsManager::SetupOptions options; @@ -1225,6 +1277,10 @@ void IndexBuildsCoordinator::createIndexes(OperationContext* opCtx, uassertStatusOK(_indexBuildsManager.setUpIndexBuild( opCtx, collection, specs, buildUUID, onInitFn, options)); + auto abortOnExit = makeGuard([&] { + _indexBuildsManager.abortIndexBuild( + opCtx, collection, buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + }); uassertStatusOK(_indexBuildsManager.startBuildingIndex(opCtx, collection, buildUUID)); uassertStatusOK(_indexBuildsManager.checkIndexConstraintViolations(opCtx, buildUUID)); @@ -1278,6 +1334,7 @@ void IndexBuildsCoordinator::createIndexes(OperationContext* opCtx, }; uassertStatusOK(_indexBuildsManager.commitIndexBuild( opCtx, collection, nss, buildUUID, onCreateEachFn, onCommitFn)); + abortOnExit.dismiss(); } void IndexBuildsCoordinator::createIndexesOnEmptyCollection(OperationContext* opCtx, @@ -1376,8 +1433,7 @@ Status IndexBuildsCoordinator::_registerIndexBuild( if (auto ts = existingIndexBuild->indexBuildState.getTimestamp()) { ss << ", timestamp: " << ts->toString(); } - if (existingIndexBuild->indexBuildState.isSet(IndexBuildState::kPrepareAbort | - 
IndexBuildState::kAborted)) { + if (existingIndexBuild->indexBuildState.isAborted()) { if (auto abortReason = existingIndexBuild->indexBuildState.getAbortReason()) { ss << ", abort reason: " << abortReason.get(); @@ -1395,8 +1451,6 @@ Status IndexBuildsCoordinator::_registerIndexBuild( } } - // Register the index build. - invariant(_allIndexBuilds.emplace(replIndexBuildState->buildUUID, replIndexBuildState).second); _indexBuildsCondVar.notify_all(); @@ -1409,6 +1463,8 @@ void IndexBuildsCoordinator::_unregisterIndexBuild( invariant(_allIndexBuilds.erase(replIndexBuildState->buildUUID)); + LOGV2(4656004, "unregistering index build", "buildUUID"_attr = replIndexBuildState->buildUUID); + _indexBuildsManager.unregisterIndexBuild(replIndexBuildState->buildUUID); _indexBuildsCondVar.notify_all(); } @@ -1617,7 +1673,7 @@ IndexBuildsCoordinator::PostSetupAction IndexBuildsCoordinator::_setUpIndexBuild opCtx, collection, replState->indexSpecs, replState->buildUUID, onInitFn, options)); } } catch (DBException& ex) { - _indexBuildsManager.tearDownIndexBuild( + _indexBuildsManager.abortIndexBuild( opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); const auto& status = ex.toStatus(); @@ -1632,7 +1688,6 @@ IndexBuildsCoordinator::PostSetupAction IndexBuildsCoordinator::_setUpIndexBuild throw; } - return PostSetupAction::kContinueIndexBuild; } @@ -1697,6 +1752,14 @@ void IndexBuildsCoordinator::_runIndexBuild(OperationContext* opCtx, return; } auto replState = invariant(swReplState); + { + // The index build is now past the setup stage and in progress. This makes it eligible to be + // aborted. Use the current OperationContext's opId as the means for interrupting the index + // build. + stdx::unique_lock<Latch> lk(replState->mutex); + replState->opId = opCtx->getOpID(); + replState->indexBuildState.setState(IndexBuildState::kInProgress, false /* skipCheck */); + } // Add build UUID to lock manager diagnostic output. 
auto locker = opCtx->lockState(); @@ -1710,6 +1773,11 @@ void IndexBuildsCoordinator::_runIndexBuild(OperationContext* opCtx, locker->setDebugInfo(ss); } + while (MONGO_unlikely(hangAfterInitializingIndexBuild.shouldFail())) { + opCtx->runWithoutInterruptionExceptAtGlobalShutdown( + [&] { opCtx->sleepFor(Milliseconds(100)); }); + } + auto status = [&]() { try { _runIndexBuildInner(opCtx, replState, indexBuildOptions); @@ -1723,18 +1791,30 @@ void IndexBuildsCoordinator::_runIndexBuild(OperationContext* opCtx, // Ensure the index build is unregistered from the Coordinator and the Promise is set with // the build's result so that callers are notified of the outcome. - - stdx::unique_lock<Latch> lk(_mutex); - - _unregisterIndexBuild(lk, replState); - if (status.isOK()) { + stdx::unique_lock<Latch> lk(_mutex); + // Unregister first so that when we fulfill the future, the build is not observed as active. + _unregisterIndexBuild(lk, replState); replState->sharedPromise.emplaceValue(replState->stats); - } else { - replState->sharedPromise.setError(status); + return; } + + // During a failure, unregistering is handled by either the caller or the current thread, + // depending on where the error originated. Signal to any waiters that an error occurred. + replState->sharedPromise.setError(status); } +namespace { + +template <typename Func> +void runOnAlternateContext(OperationContext* opCtx, std::string name, Func func) { + auto newClient = opCtx->getServiceContext()->makeClient(name); + AlternativeClientRegion acr(newClient); + const auto newCtx = cc().makeOperationContext(); + func(newCtx.get()); +} +} // namespace + void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterFailure( OperationContext* opCtx, Collection* collection, @@ -1742,20 +1822,14 @@ void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterFailure( const IndexBuildOptions& indexBuildOptions, const Status& status) { if (status.isA<ErrorCategory::ShutdownError>()) { - // Leave it as-if kill -9 happened. 
Startup recovery will rebuild the index. - _indexBuildsManager.abortIndexBuildWithoutCleanup( - opCtx, collection, replState->buildUUID, "shutting down"); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + _completeAbortForShutdown(opCtx, replState, collection); return; } - // If the index build was not completed successfully, we'll need to acquire some locks to - // clean it up. - UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - - NamespaceString nss = collection->ns(); - Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); + // An external caller already cleaned up our state. + if (status == ErrorCodes::IndexBuildAborted) { + return; + } if (indexBuildOptions.replSetAndNotPrimaryAtStart) { // This build started and failed as a secondary. Single-phase index builds started on @@ -1766,14 +1840,23 @@ void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterFailure( << "; Database: " << replState->dbName)); } - // Unlock the RSTL to avoid deadlocks with state transitions. - unlockRSTLForIndexCleanup(opCtx); - Lock::CollectionLock collLock(opCtx, nss, MODE_X); + // The index builder thread can abort on its own when it fails due to an indexing error or its + // deadline expires. + // The current operation's deadline may have expired, which would prevent us from taking + // locks. Use a new OperationContext to abort the index build. + runOnAlternateContext( + opCtx, "self-abort", [this, replState, status](OperationContext* abortCtx) { + ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(abortCtx->lockState()); + Lock::DBLock dbLock(abortCtx, replState->dbName, MODE_IX); + + // Unlock RSTL to avoid deadlocks with prepare conflicts and state transitions caused by + // taking a strong collection lock. See SERVER-42621. + unlockRSTL(abortCtx); - // If we started the build as a primary and are now unable to accept writes, this build was - // aborted due to a stepdown. 
- _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); + Lock::CollectionLock collLock(abortCtx, dbAndUUID, MODE_X); + _completeSelfAbort(abortCtx, replState, status); + }); } void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterFailure( @@ -1784,140 +1867,63 @@ void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterFailure( const Status& status) { if (status.isA<ErrorCategory::ShutdownError>()) { - // Promise should be set at least once before it's getting destroyed. Else it would - // invariant. - { - stdx::unique_lock<Latch> lk(replState->mutex); - if (!replState->waitForNextAction->getFuture().isReady()) { - setSignalAndCancelVoteRequestCbkIfActive( - lk, opCtx, replState, IndexBuildAction::kNoAction); - } - } - // Leave it as-if kill -9 happened. Startup recovery will restart the index build. - _indexBuildsManager.abortIndexBuildWithoutCleanup( - opCtx, collection, replState->buildUUID, "shutting down"); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + _completeAbortForShutdown(opCtx, replState, collection); return; } - // If the index build was not completed successfully, we'll need to acquire some locks to - // clean it up. - UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - - NamespaceString nss = collection->ns(); - Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); + // An external caller interrupted us and will be responsible for cleaning up our state. + if (status == ErrorCodes::IndexBuildAborted) { + return; + } - auto replCoord = repl::ReplicationCoordinator::get(opCtx); - if (replCoord->getSettings().usingReplSets() && !replCoord->canAcceptWritesFor(opCtx, nss)) { - // We failed this index build as a secondary node. 
- - // Failed index builds should fatally assert on the secondary, except when the index build - // was stopped due to an explicit abort oplog entry or rollback. - if (status == ErrorCodes::IndexBuildAborted) { - // On a secondary, we should be able to obtain the timestamp for cleaning up the index - // build from the oplog entry unless the index build did not fail due to processing an - // abortIndexBuild oplog entry. This is the case if we were aborted due to rollback. - stdx::unique_lock<Latch> lk(replState->mutex); - invariant(replState->indexBuildState.isAbortPrepared(), - replState->buildUUID.toString()); - auto abortIndexBuildTimestamp = replState->indexBuildState.getTimestamp(); - - // If we were aborted and no abort timestamp is set, then we should leave the index - // build unfinished. This can happen during rollback because we are not primary and - // cannot generate an optime to timestamp the index build abort. We rely on the - // rollback process to correct this state. - if (!abortIndexBuildTimestamp) { - _indexBuildsManager.abortIndexBuildWithoutCleanup( - opCtx, collection, replState->buildUUID, "no longer primary"); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - return; + // The index builder thread can abort on its own when it fails due to an indexing error or its + // deadline expires. + // The current operation's deadline may have expired, which would prevent us from taking locks. + // Use a new OperationContext to abort the index build. + runOnAlternateContext( + opCtx, "self-abort", [this, replState, status](OperationContext* abortCtx) { + ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(abortCtx->lockState()); + + // Take RSTL (implicitly by DBLock) to observe and prevent replication state from + // changing. + Lock::DBLock dbLock(abortCtx, replState->dbName, MODE_IX); + + // Index builds may not fail on secondaries. 
If a primary replicated an abortIndexBuild + // oplog entry, then this index build would have received an IndexBuildAborted error + // code. + const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); + auto replCoord = repl::ReplicationCoordinator::get(abortCtx); + if (replCoord->getSettings().usingReplSets() && + !replCoord->canAcceptWritesFor(abortCtx, dbAndUUID)) { + fassert(51101, + status.withContext(str::stream() << "Index build: " << replState->buildUUID + << "; Database: " << replState->dbName)); } - // Unlock the RSTL to avoid deadlocks with state transitions. See SERVER-42824. - unlockRSTLForIndexCleanup(opCtx); - Lock::CollectionLock collLock(opCtx, nss, MODE_X); - - TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp.get()); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - return; - } - - fassert(51101, - status.withContext(str::stream() << "Index build: " << replState->buildUUID - << "; Database: " << replState->dbName)); - } - - // We are currently a primary node. Notify downstream nodes to abort their index builds with the - // same build UUID. - Lock::CollectionLock collLock(opCtx, nss, MODE_X); - auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; - _indexBuildsManager.tearDownIndexBuild(opCtx, collection, replState->buildUUID, onCleanUpFn); - return; + Lock::CollectionLock collLock(abortCtx, dbAndUUID, MODE_X); + _completeSelfAbort(abortCtx, replState, status); + }); } void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState, const IndexBuildOptions& indexBuildOptions) { - const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); - // This Status stays unchanged unless we catch an exception in the following try-catch block. 
auto status = Status::OK(); try { - // Lock acquisition might throw, and we would still need to clean up the index build state, - // so do it in the try-catch block - AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); - - // Do not use AutoGetCollection since the lock will be reacquired in various modes - // throughout the index build. Lock by UUID to protect against concurrent collection rename. - boost::optional<Lock::CollectionLock> collLock; - collLock.emplace(opCtx, dbAndUUID, MODE_X); - - // Two phase index builds and single-phase builds on secondaries can only be interrupted at - // shutdown. For the duration of the runWithoutInterruptionExceptAtGlobalShutdown() - // invocation, any kill status set by the killOp command will be ignored. After - // runWithoutInterruptionExceptAtGlobalShutdown() returns, any call to checkForInterrupt() - // will see the kill status and respond accordingly (checkForInterrupt() will throw an - // exception while checkForInterruptNoAssert() returns an error Status). - auto replCoord = repl::ReplicationCoordinator::get(opCtx); - if (!replCoord->getSettings().usingReplSets()) { - _buildIndex(opCtx, replState, indexBuildOptions, &collLock); - } else if (IndexBuildProtocol::kTwoPhase == replState->protocol) { - opCtx->runWithoutInterruptionExceptAtGlobalShutdown( - [&, this] { _buildIndex(opCtx, replState, indexBuildOptions, &collLock); }); - } else { - if (indexBuildOptions.replSetAndNotPrimaryAtStart) { - // We need to drop the RSTL here, as we do not need synchronization with step up and - // step down. Dropping the RSTL is important because otherwise if we held the RSTL - // it would create deadlocks with prepared transactions on step up and step down. A - // deadlock could result if the index build was attempting to acquire a Collection S - // or X lock while a prepared transaction held a Collection IX lock, and a step down - // was waiting to acquire the RSTL in mode X. 
- // TODO(SERVER-44045): Revisit this logic for the non-two phase index build case. - const bool unlocked = opCtx->lockState()->unlockRSTLforPrepare(); - invariant(unlocked); - opCtx->runWithoutInterruptionExceptAtGlobalShutdown( - [&, this] { _buildIndex(opCtx, replState, indexBuildOptions, &collLock); }); - } else { - _buildIndex(opCtx, replState, indexBuildOptions, &collLock); - } - } - // If _buildIndex returned normally, then we should have the collection X lock. It is not - // required to safely access the collection, though, because an index build is registerd. - auto collection = - CollectionCatalog::get(opCtx).lookupCollectionByUUID(opCtx, replState->collectionUUID); - invariant(collection); - replState->stats.numIndexesAfter = getNumIndexesTotal(opCtx, collection); + _buildIndex(opCtx, replState, indexBuildOptions); } catch (const DBException& ex) { status = ex.toStatus(); } + if (status.isOK()) { + return; + } + // We do not hold a collection lock here, but we are protected against the collection being - // dropped while the index build is still registered for the collection -- until - // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to - // be stale just for logging purposes. + // dropped while the index build is still registered for the collection -- until abortIndexBuild + // is called. The collection can be renamed, but it is OK for the name to be stale just for + // logging purposes. 
auto collection = CollectionCatalog::get(opCtx).lookupCollectionByUUID(opCtx, replState->collectionUUID); invariant(collection, @@ -1925,25 +1931,6 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, << " should exist because an index build is in progress: " << replState->buildUUID); NamespaceString nss = collection->ns(); - - if (status.isOK()) { - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - - LOGV2(20663, - "Index build completed successfully: {replState_buildUUID}: {nss} ( " - "{replState_collectionUUID} ). Index specs built: {replState_indexSpecs_size}. " - "Indexes in catalog before build: {replState_stats_numIndexesBefore}. Indexes in " - "catalog after build: {replState_stats_numIndexesAfter}", - "replState_buildUUID"_attr = replState->buildUUID, - "nss"_attr = nss, - "replState_collectionUUID"_attr = replState->collectionUUID, - "replState_indexSpecs_size"_attr = replState->indexSpecs.size(), - "replState_stats_numIndexesBefore"_attr = replState->stats.numIndexesBefore, - "replState_stats_numIndexesAfter"_attr = replState->stats.numIndexesAfter); - return; - } - logFailure(status, nss, replState); if (IndexBuildProtocol::kSinglePhase == replState->protocol) { @@ -1958,84 +1945,44 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, uassertStatusOK(status); } -void IndexBuildsCoordinator::_buildIndex( - OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock) { - - if (IndexBuildProtocol::kSinglePhase == replState->protocol) { - _buildIndexSinglePhase(opCtx, replState, indexBuildOptions, exclusiveCollectionLock); - return; - } - - invariant(IndexBuildProtocol::kTwoPhase == replState->protocol, - str::stream() << replState->buildUUID); - _buildIndexTwoPhase(opCtx, replState, indexBuildOptions, 
exclusiveCollectionLock); -} - -void IndexBuildsCoordinator::_buildIndexSinglePhase( - OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock) { - _scanCollectionAndInsertKeysIntoSorter(opCtx, replState, exclusiveCollectionLock); - _insertKeysFromSideTablesWithoutBlockingWrites(opCtx, replState); - _insertKeysFromSideTablesBlockingWrites(opCtx, replState); - _signalPrimaryForCommitReadiness(opCtx, replState); - _waitForNextIndexBuildAction(opCtx, replState); - _insertKeysFromSideTablesAndCommit( - opCtx, replState, indexBuildOptions, exclusiveCollectionLock, {}); -} - -void IndexBuildsCoordinator::_buildIndexTwoPhase( - OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock) { - - _scanCollectionAndInsertKeysIntoSorter(opCtx, replState, exclusiveCollectionLock); +void IndexBuildsCoordinator::_buildIndex(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + const IndexBuildOptions& indexBuildOptions) { + _scanCollectionAndInsertKeysIntoSorter(opCtx, replState); _insertKeysFromSideTablesWithoutBlockingWrites(opCtx, replState); - _insertKeysFromSideTablesBlockingWrites(opCtx, replState); - + _insertKeysFromSideTablesBlockingWrites(opCtx, replState, indexBuildOptions); _signalPrimaryForCommitReadiness(opCtx, replState); auto commitIndexBuildTimestamp = _waitForNextIndexBuildAction(opCtx, replState); - + invariant(commitIndexBuildTimestamp.isNull() || + replState->protocol != IndexBuildProtocol::kSinglePhase, + str::stream() << "buildUUID: " << replState->buildUUID + << "commitTs: " << commitIndexBuildTimestamp.toString()); _insertKeysFromSideTablesAndCommit( - opCtx, replState, indexBuildOptions, exclusiveCollectionLock, commitIndexBuildTimestamp); + opCtx, replState, indexBuildOptions, 
commitIndexBuildTimestamp); } void IndexBuildsCoordinator::_scanCollectionAndInsertKeysIntoSorter( - OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock) { - + OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) { + // Collection scan and insert into index. { + AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); + const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); + Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_IX); + + // Rebuilding system indexes during startup using the IndexBuildsCoordinator is done by all + // storage engines if they're missing. + invariant(_indexBuildsManager.isBackgroundBuilding(replState->buildUUID)); + auto nss = CollectionCatalog::get(opCtx).lookupNSSByUUID(opCtx, replState->collectionUUID); invariant(nss); - invariant(opCtx->lockState()->isDbLockedForMode(replState->dbName, MODE_IX)); - invariant(opCtx->lockState()->isCollectionLockedForMode(*nss, MODE_X)); // Set up the thread's currentOp information to display createIndexes cmd information. updateCurOpOpDescription(opCtx, *nss, replState->indexSpecs); - } - - // Rebuilding system indexes during startup using the IndexBuildsCoordinator is done by all - // storage engines if they're missing. - invariant(_indexBuildsManager.isBackgroundBuilding(replState->buildUUID)); - // Index builds can safely ignore prepare conflicts and perform writes. On secondaries, prepare - // operations wait for index builds to complete. - opCtx->recoveryUnit()->abandonSnapshot(); - opCtx->recoveryUnit()->setPrepareConflictBehavior( - PrepareConflictBehavior::kIgnoreConflictsAllowWrites); - - // Collection scan and insert into index, followed by a drain of writes received in the - // background. 
- exclusiveCollectionLock->reset(); - { - const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); - Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_IS); + // Index builds can safely ignore prepare conflicts and perform writes. On secondaries, + // prepare operations wait for index builds to complete. + opCtx->recoveryUnit()->setPrepareConflictBehavior( + PrepareConflictBehavior::kIgnoreConflictsAllowWrites); // The collection object should always exist while an index build is registered. auto collection = @@ -2060,8 +2007,8 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesWithoutBlockingWrites( // Perform the first drain while holding an intent lock. const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); { - opCtx->recoveryUnit()->abandonSnapshot(); - Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_IS); + AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); + Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_IX); uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( opCtx, @@ -2076,11 +2023,20 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesWithoutBlockingWrites( } } void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites( - OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) { + OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + const IndexBuildOptions& indexBuildOptions) { const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); // Perform the second drain while stopping writes on the collection. { - opCtx->recoveryUnit()->abandonSnapshot(); + AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); + + // Unlock RSTL to avoid deadlocks with prepare conflicts and state transitions. See + // SERVER-42621. 
+ if (IndexBuildProtocol::kSinglePhase == replState->protocol && + indexBuildOptions.replSetAndNotPrimaryAtStart) { + unlockRSTL(opCtx); + } Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_S); uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( @@ -2104,12 +2060,19 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesAndCommit( OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState, const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock, const Timestamp& commitIndexBuildTimestamp) { - // Need to return the collection lock back to exclusive mode, to complete the index build. - opCtx->recoveryUnit()->abandonSnapshot(); + AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); + + // Unlock RSTL to avoid deadlocks with prepare conflicts and state transitions caused by taking + // a strong collection lock. See SERVER-42621. + if (IndexBuildProtocol::kSinglePhase == replState->protocol && + indexBuildOptions.replSetAndNotPrimaryAtStart) { + unlockRSTL(opCtx); + } + + // Need to return the collection lock back to exclusive mode to complete the index build. const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); - exclusiveCollectionLock->emplace(opCtx, dbAndUUID, MODE_X); + Lock::CollectionLock collLock(opCtx, dbAndUUID, MODE_X); // The collection object should always exist while an index build is registered. auto collection = @@ -2159,14 +2122,7 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesAndCommit( // If two phase index builds is enabled, index build will be coordinated using // startIndexBuild and commitIndexBuild oplog entries. 
- auto onCommitFn = [&] { - if (IndexBuildProtocol::kTwoPhase != replState->protocol) { - return; - } - - onCommitIndexBuild( - opCtx, collection->ns(), *replState, indexBuildOptions.replSetAndNotPrimaryAtStart); - }; + auto onCommitFn = [&] { onCommitIndexBuild(opCtx, collection->ns(), *replState); }; auto onCreateEachFn = [&](const BSONObj& spec) { if (IndexBuildProtocol::kTwoPhase == replState->protocol) { @@ -2198,7 +2154,15 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesAndCommit( TimestampBlock tsBlock(opCtx, commitIndexBuildTimestamp); uassertStatusOK(_indexBuildsManager.commitIndexBuild( opCtx, collection, collection->ns(), replState->buildUUID, onCreateEachFn, onCommitFn)); - + replState->stats.numIndexesAfter = getNumIndexesTotal(opCtx, collection); + LOGV2(20663, + "Index build completed successfully", + "buildUUID"_attr = replState->buildUUID, + "collection"_attr = collection->ns(), + "collectionUUID"_attr = replState->collectionUUID, + "indexesBuilt"_attr = replState->indexSpecs.size(), + "numIndexesBefore"_attr = replState->stats.numIndexesBefore, + "numIndexesAfter"_attr = replState->stats.numIndexesAfter); return; } @@ -2269,12 +2233,7 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::_runIndexReb // Index build is registered in manager regardless of IndexBuildsManager::setUpIndexBuild() // result. - if (status.isOK()) { - // A successful index build means that all the requested indexes are now part of the - // catalog. - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } else { + if (!status.isOK()) { // An index build failure during recovery is fatal. 
logFailure(status, nss, replState); fassertNoTrace(51076, status); diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h index c7524cec2c5..b12a1f0041d 100644 --- a/src/mongo/db/index_builds_coordinator.h +++ b/src/mongo/db/index_builds_coordinator.h @@ -203,74 +203,54 @@ public: * must continue to operate on the collection by UUID to protect against rename collection. The * provided 'reason' will be used in the error message that the index builders return to their * callers. + * + * Does not stop new index builds from starting. Caller must make that guarantee. + * + * Does not require holding locks. + * + * Returns the UUIDs of the index builds that were aborted or are already in the process of + * being aborted by another caller. */ - void abortCollectionIndexBuilds(OperationContext* opCtx, - const UUID& collectionUUID, - const std::string& reason); - - /** - * Signals all of the index builds on the specified collection to abort and returns the build - * UUIDs of the index builds that will be aborted. Must identify the collection with a UUID. The - * provided 'reason' will be used in the error message that the index builders return to their - * callers. - */ - std::vector<UUID> abortCollectionIndexBuildsNoWait(OperationContext* opCtx, - const UUID& collectionUUID, - const std::string& reason); + std::vector<UUID> abortCollectionIndexBuilds(OperationContext* opCx, + const NamespaceString collectionNss, + const UUID collectionUUID, + const std::string& reason); /** * Signals all of the index builds on the specified 'db' to abort and then waits until the index * builds are no longer running. The provided 'reason' will be used in the error message that * the index builders return to their callers. + * + * Does not require holding locks. + * + * Does not stop new index builds from starting. Caller must make that guarantee. 
*/ void abortDatabaseIndexBuilds(OperationContext* opCtx, StringData db, const std::string& reason); /** - * Signals all of the index builds on the specified database to abort. The provided 'reason' - * will be used in the error message that the index builders return to their callers. - */ - void abortDatabaseIndexBuildsNoWait(OperationContext* opCtx, - StringData db, - const std::string& reason); - - /** - * Aborts an index build by index build UUID. This gets called when the index build on primary - * failed due to interruption or replica set state change. - * It's a wrapper function to abortIndexBuildByBuildUUIDNoWait(). - */ - void abortIndexBuildOnError(OperationContext* opCtx, const UUID& buildUUID, Status abortStatus); - - /** * Aborts an index build by index build UUID. Returns when the index build thread exits. + * + * Returns true if the index build was aborted or the index build is already in the process of + * being aborted. + * Returns false if the index build does not exist or the index build is already in the process + * of committing and cannot be aborted. */ - void abortIndexBuildByBuildUUID(OperationContext* opCtx, + bool abortIndexBuildByBuildUUID(OperationContext* opCtx, const UUID& buildUUID, IndexBuildAction signalAction, - boost::optional<Timestamp> abortTimestamp = boost::none, - boost::optional<std::string> reason = boost::none); - - /** - * Aborts an index build by index build UUID. Does not wait for the index build thread to - * exit. Returns true if an index build was aborted. - */ - bool abortIndexBuildByBuildUUIDNoWait(OperationContext* opCtx, - const UUID& buildUUID, - IndexBuildAction signalAction, - boost::optional<Timestamp> abortTimestamp = boost::none, - boost::optional<std::string> reason = boost::none); + std::string reason); /** * Aborts an index build by its index name(s). This will only abort in-progress index builds if * all of the indexes are specified that a single builder is building together. 
When an * appropriate builder exists, this returns the build UUID of the index builder that will be * aborted. */ - boost::optional<UUID> abortIndexBuildByIndexNamesNoWait( - OperationContext* opCtx, - const UUID& collectionUUID, - const std::vector<std::string>& indexNames, - boost::optional<std::string> reason = boost::none); + boost::optional<UUID> abortIndexBuildByIndexNames(OperationContext* opCtx, + const UUID& collectionUUID, + const std::vector<std::string>& indexNames, + std::string reason); /** * Returns true if there is an index builder building the given index names on a collection. @@ -568,40 +548,39 @@ protected: const Status& status); /** - * Modularizes the _indexBuildsManager calls part of _runIndexBuildInner. Throws on error. + * Attempt to abort an index build. Returns a flag indicating how the caller should proceed. */ - void _buildIndex(OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* collLock); - + enum class TryAbortResult { kRetry, kAlreadyAborted, kNotAborted, kContinueAbort }; + TryAbortResult _tryAbort(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + IndexBuildAction signalAction, + std::string reason); /** - * Builds the indexes single-phased. - * This method matches pre-4.4 behavior for a background index build driven by a single - * createIndexes oplog entry. + * Performs last steps of aborting an index build. 
*/ - void _buildIndexSinglePhase(OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* collLock); + void _completeAbort(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + IndexBuildAction signalAction, + Status reason); + void _completeSelfAbort(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + Status reason); + void _completeAbortForShutdown(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + Collection* collection); /** - * Builds the indexes two-phased. - * The beginning and completion of a index build is driven by the startIndexBuild and - * commitIndexBuild oplog entries, respectively. + * Modularizes the _indexBuildsManager calls part of _runIndexBuildInner. Throws on error. */ - void _buildIndexTwoPhase(OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* collLock); + void _buildIndex(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + const IndexBuildOptions& indexBuildOptions); /** * First phase is the collection scan and insertion of the keys into the sorter. */ - void _scanCollectionAndInsertKeysIntoSorter( - OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock); + void _scanCollectionAndInsertKeysIntoSorter(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState); /** * Second phase is extracting the sorted keys and writing them into the new index table. 
@@ -609,7 +588,8 @@ void _insertKeysFromSideTablesWithoutBlockingWrites( OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState); void _insertKeysFromSideTablesBlockingWrites(OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState); + std::shared_ptr<ReplIndexBuildState> replState, + const IndexBuildOptions& indexBuildOptions); /** * Reads the commit ready members list for index build UUID in 'replState' from @@ -622,6 +602,13 @@ OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) = 0; /** + * Attempt to signal the index build to commit and advance the index build to the kPrepareCommit + * state. + * Returns true if successful and false if the attempt was unsuccessful and the caller should + * retry. + */ + bool _tryCommit(OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState); + /** * Skips the voting process and directly signal primary to commit index build if * commit quorum is not enabled. */ @@ -673,12 +660,10 @@ * index, which sets the ready flag to true, to the catalog; it is not used for the catch-up * writes during the final drain phase. */ - void _insertKeysFromSideTablesAndCommit( - OperationContext* opCtx, - std::shared_ptr<ReplIndexBuildState> replState, - const IndexBuildOptions& indexBuildOptions, - boost::optional<Lock::CollectionLock>* exclusiveCollectionLock, - const Timestamp& commitIndexBuildTimestamp); + void _insertKeysFromSideTablesAndCommit(OperationContext* opCtx, + std::shared_ptr<ReplIndexBuildState> replState, + const IndexBuildOptions& indexBuildOptions, + const Timestamp& commitIndexBuildTimestamp); /** * Runs the index build. @@ -712,33 +697,9 @@ std::vector<std::shared_ptr<ReplIndexBuildState>> _filterIndexBuilds_inlock( WithLock lk, IndexBuildFilterFn indexBuildFilter) const; - /** - * Helper for 'abortCollectionIndexBuilds' and 'abortCollectionIndexBuildsNoWait'. 
Returns the - * UUIDs of the aborted index builders - */ - std::vector<UUID> _abortCollectionIndexBuilds(stdx::unique_lock<Latch>& lk, - OperationContext* opCtx, - const UUID& collectionUUID, - const std::string& reason, - bool shouldWait); - - void _awaitNoIndexBuildInProgressForCollection(stdx::unique_lock<Latch>& lk, - OperationContext* opCtx, - const UUID& collectionUUID); - - /** - * Helper for 'abortDatabaseIndexBuilds' and 'abortDatabaseIndexBuildsNoWait'. - */ - void _abortDatabaseIndexBuilds(stdx::unique_lock<Latch>& lk, - OperationContext* opCtx, - const StringData& db, - const std::string& reason, - bool shouldWait); - void _awaitNoBgOpInProgForDb(stdx::unique_lock<Latch>& lk, OperationContext* opCtx, StringData db); - // Protects the below state. mutable Mutex _mutex = MONGO_MAKE_LATCH("IndexBuildsCoordinator::_mutex"); diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp index 97f8064c4b6..189e0397f29 100644 --- a/src/mongo/db/index_builds_coordinator_mongod.cpp +++ b/src/mongo/db/index_builds_coordinator_mongod.cpp @@ -61,7 +61,6 @@ using namespace indexbuildentryhelpers; namespace { MONGO_FAIL_POINT_DEFINE(hangBeforeInitializingIndexBuild); -MONGO_FAIL_POINT_DEFINE(hangAfterInitializingIndexBuild); const StringData kMaxNumActiveUserIndexBuildsServerParameterName = "maxNumActiveUserIndexBuilds"_sd; @@ -305,10 +304,6 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx, // Signal that the index build started successfully. startPromise.setWith([] {}); - while (MONGO_unlikely(hangAfterInitializingIndexBuild.shouldFail())) { - sleepmillis(100); - } - // Runs the remainder of the index build. Sets the promise result and cleans up the index // build. _runIndexBuild(opCtx.get(), buildUUID, indexBuildOptions); @@ -316,10 +311,14 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx, // Do not exit with an incomplete future. 
invariant(replState->sharedPromise.getFuture().isReady()); - // Logs the index build statistics if it took longer than the server parameter `slowMs` to - // complete. - CurOp::get(opCtx.get()) - ->completeAndLogOperation(opCtx.get(), MONGO_LOGV2_DEFAULT_COMPONENT); + try { + // Logs the index build statistics if it took longer than the server parameter `slowMs` + // to complete. + CurOp::get(opCtx.get()) + ->completeAndLogOperation(opCtx.get(), MONGO_LOGV2_DEFAULT_COMPONENT); + } catch (const DBException& e) { + LOGV2(4656002, "unable to log operation", "reason"_attr = e); + } }); // Waits until the index build has either been started or failed to start. @@ -499,14 +498,13 @@ void IndexBuildsCoordinatorMongod::_signalPrimaryForCommitReadiness( return; } - // Yield locks and storage engine resources before blocking. - opCtx->recoveryUnit()->abandonSnapshot(); - Lock::TempRelease release(opCtx->lockState()); - invariant(!opCtx->lockState()->isRSTLLocked()); + // No locks should be held. + invariant(!opCtx->lockState()->isLocked()); Backoff exponentialBackoff(Seconds(1), Seconds(2)); - auto onRemoteCmdScheduled = [&](executor::TaskExecutor::CallbackHandle handle) { + auto onRemoteCmdScheduled = [replState, + replCoord](executor::TaskExecutor::CallbackHandle handle) { stdx::unique_lock<Latch> lk(replState->mutex); // We have already received commit or abort signal, So skip voting. 
if (replState->waitForNextAction->getFuture().isReady()) { @@ -517,12 +515,12 @@ void IndexBuildsCoordinatorMongod::_signalPrimaryForCommitReadiness( } }; - auto onRemoteCmdComplete = [&](executor::TaskExecutor::CallbackHandle) { + auto onRemoteCmdComplete = [replState](executor::TaskExecutor::CallbackHandle) { stdx::unique_lock<Latch> lk(replState->mutex); replState->voteCmdCbkHandle = executor::TaskExecutor::CallbackHandle(); }; - auto needToVote = [&]() -> bool { + auto needToVote = [replState]() -> bool { stdx::unique_lock<Latch> lk(replState->mutex); return !replState->waitForNextAction->getFuture().isReady() ? true : false; }; @@ -597,10 +595,6 @@ IndexBuildAction IndexBuildsCoordinatorMongod::_drainSideWritesUntilNextActionIs // Waits until the promise is fulfilled or the deadline expires. IndexBuildAction nextAction; auto waitUntilNextActionIsReady = [&]() { - // Don't perform a blocking wait while holding locks or storage engine resources. - opCtx->recoveryUnit()->abandonSnapshot(); - Lock::TempRelease release(opCtx->lockState()); - auto deadline = Date_t::now() + Milliseconds(1000); auto timeoutError = opCtx->getTimeoutError(); @@ -634,9 +628,13 @@ Timestamp IndexBuildsCoordinatorMongod::_waitForNextIndexBuildAction( "buildUUID"_attr = replState->buildUUID); while (true) { - // Future wait can be interrupted. This function will yield locks while waiting for the - // future to be fulfilled. + // Future wait should hold no locks. + invariant(!opCtx->lockState()->isLocked(), + str::stream() << "holding locks while waiting for commit or abort: " + << replState->buildUUID); + // Future wait can be interrupted. 
const auto nextAction = _drainSideWritesUntilNextActionIsAvailable(opCtx, replState); + LOGV2(3856204, "Index build received signal for build uuid: {buildUUID} , action: {action}", "buildUUID"_attr = replState->buildUUID, @@ -644,9 +642,7 @@ Timestamp IndexBuildsCoordinatorMongod::_waitForNextIndexBuildAction( bool needsToRetryWait = false; - // Ensure RSTL is acquired before checking replication state. This is only necessary for - // single-phase builds on secondaries. Everywhere else, the RSTL is already held and this is - // should never block. + // Reacquire RSTL lock to check replication state. repl::ReplicationStateTransitionLockGuard rstl(opCtx, MODE_IX); const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID); @@ -682,13 +678,15 @@ Timestamp IndexBuildsCoordinatorMongod::_waitForNextIndexBuildAction( // Sanity check // This signal can be received during primary (drain phase), secondary, // startup( startup recovery) and startup2 (initial sync). - invariant(!isMaster && replState->indexBuildState.isAbortPrepared(), + invariant(!isMaster, str::stream() << "Index build: " << replState->buildUUID); + invariant(replState->indexBuildState.isAborted(), str::stream() << "Index build: " << replState->buildUUID << ", index build state: " << replState->indexBuildState.toString()); invariant(replState->indexBuildState.getTimestamp() && replState->indexBuildState.getAbortReason(), replState->buildUUID.toString()); + // The calling thread will interrupt our OperationContext and we will exit. 
LOGV2(3856206, "Aborting index build", "buildUUID"_attr = replState->buildUUID, @@ -699,33 +697,20 @@ Timestamp IndexBuildsCoordinatorMongod::_waitForNextIndexBuildAction( case IndexBuildAction::kRollbackAbort: invariant(replState->protocol == IndexBuildProtocol::kTwoPhase); invariant(replCoord->getMemberState().rollback()); - - uassertStatusOK(Status( - ErrorCodes::IndexBuildAborted, - str::stream() << "Aborting index build, index build uuid:" - << replState->buildUUID << " , abort reason:" - << replState->indexBuildState.getAbortReason().get_value_or(""))); + // The calling thread will interrupt our OperationContext and we will exit. break; case IndexBuildAction::kPrimaryAbort: - // There are chances when the index build got aborted, it only existed in the - // coordinator, So, we missed marking the index build aborted on manager. So, it's - // important, we exit from here if we are still primary. Otherwise, the index build - // gets committed, though our index build was marked aborted. - - // Single-phase builds do not replicate abort oplog entries. We do not need to be - // primary to abort the index build, and we must continue aborting even in the event - // of a state transition because this build will not receive another signal. - if (isMaster || IndexBuildProtocol::kSinglePhase == replState->protocol) { - uassertStatusOK(Status( - ErrorCodes::IndexBuildAborted, - str::stream() - << "Index build aborted for index build: " << replState->buildUUID - << " , abort reason:" - << replState->indexBuildState.getAbortReason().get_value_or(""))); - } - // Intentionally continue to next case. If we are no longer primary while processing - // kPrimaryAbort, fall back to the kCommitQuorumSatisfied case and reset our - // 'waitForNextAction'. + // The thread aborting a two-phase index build must hold the RSTL so that the + // replication state does not change. They will interrupt our OperationContext and + // we will exit. 
Single-phase builds do not replicate abort oplog entries. We do + // not need to be primary to abort the index build, and we must continue aborting + // even in the event of a state transition because this build will not receive + // another signal. + invariant(isMaster || IndexBuildProtocol::kSinglePhase == replState->protocol, + str::stream() + << "isMaster: " << isMaster << ", singlePhase: " + << (IndexBuildProtocol::kSinglePhase == replState->protocol)); + break; case IndexBuildAction::kCommitQuorumSatisfied: if (!isMaster) { // Reset the promise as the node has stepped down, diff --git a/src/mongo/db/index_builds_coordinator_mongod_test.cpp b/src/mongo/db/index_builds_coordinator_mongod_test.cpp index 9535b0887ea..c2da0c3f263 100644 --- a/src/mongo/db/index_builds_coordinator_mongod_test.cpp +++ b/src/mongo/db/index_builds_coordinator_mongod_test.cpp @@ -89,7 +89,6 @@ void IndexBuildsCoordinatorMongodTest::setUp() { } void IndexBuildsCoordinatorMongodTest::tearDown() { - _indexBuildsCoord->verifyNoIndexBuilds_forTestOnly(); _indexBuildsCoord->shutdown(operationContext()); _indexBuildsCoord.reset(); // All databases are dropped during tear down. 
diff --git a/src/mongo/db/repair_database_and_check_version.cpp b/src/mongo/db/repair_database_and_check_version.cpp index 20e9ec50172..d66a8699481 100644 --- a/src/mongo/db/repair_database_and_check_version.cpp +++ b/src/mongo/db/repair_database_and_check_version.cpp @@ -162,8 +162,8 @@ bool checkIdIndexExists(OperationContext* opCtx, RecordId catalogId) { Status buildMissingIdIndex(OperationContext* opCtx, Collection* collection) { MultiIndexBlock indexer; - ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); }); + auto abortOnExit = makeGuard( + [&] { indexer.abortIndexBuild(opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); }); const auto indexCatalog = collection->getIndexCatalog(); const auto idIndexSpec = indexCatalog->getDefaultIdIndexSpec(); @@ -187,6 +187,7 @@ Status buildMissingIdIndex(OperationContext* opCtx, Collection* collection) { status = indexer.commit( opCtx, collection, MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn); wuow.commit(); + abortOnExit.dismiss(); return status; } diff --git a/src/mongo/db/repl/collection_bulk_loader_impl.cpp b/src/mongo/db/repl/collection_bulk_loader_impl.cpp index 3a537c96b0e..fdc32b1b32d 100644 --- a/src/mongo/db/repl/collection_bulk_loader_impl.cpp +++ b/src/mongo/db/repl/collection_bulk_loader_impl.cpp @@ -320,7 +320,11 @@ Status CollectionBulkLoaderImpl::commit() { "namespace"_attr = _nss.ns(), "stats"_attr = _stats.toString()); - _releaseResources(); + // Clean up here so we do not try to abort the index builds when cleaning up in + // _releaseResources. 
+ _idIndexBlock.reset(); + _secondaryIndexesBlock.reset(); + _autoColl.reset(); return Status::OK(); }); } @@ -328,13 +332,13 @@ void CollectionBulkLoaderImpl::_releaseResources() { invariant(&cc() == _opCtx->getClient()); if (_secondaryIndexesBlock) { - _secondaryIndexesBlock->cleanUpAfterBuild( + _secondaryIndexesBlock->abortIndexBuild( _opCtx.get(), _collection, MultiIndexBlock::kNoopOnCleanUpFn); _secondaryIndexesBlock.reset(); } if (_idIndexBlock) { - _idIndexBlock->cleanUpAfterBuild( + _idIndexBlock->abortIndexBuild( _opCtx.get(), _collection, MultiIndexBlock::kNoopOnCleanUpFn); _idIndexBlock.reset(); } diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index b2d74114922..5b4cb82da23 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -2342,10 +2342,21 @@ BSONObj ReplicationCoordinatorImpl::runCmdOnPrimaryAndAwaitResponse( uassertStatusOK(scheduleResult.getStatus()); CallbackHandle cbkHandle = scheduleResult.getValue(); - onRemoteCmdScheduled(cbkHandle); - - // Wait for the response in an interruptible mode. - _replExecutor->wait(cbkHandle, opCtx); + try { + onRemoteCmdScheduled(cbkHandle); + + // Wait for the response in an interruptible mode. + _replExecutor->wait(cbkHandle, opCtx); + } catch (const DBException&) { + // If waiting for the response is interrupted, then we still have a callback out and + // registered with the TaskExecutor to run when the response finally does come back. Since + // the callback references local state, cbkResponse, it would be invalid for the callback to + // run after leaving this function. Therefore, we cancel the callback and wait + // uninterruptibly for the callback to be run. 
+ _replExecutor->cancel(cbkHandle); + _replExecutor->wait(cbkHandle); + throw; + } onRemoteCmdComplete(cbkHandle); uassertStatusOK(cbkResponse.status); diff --git a/src/mongo/db/repl/rs_rollback_test.cpp b/src/mongo/db/repl/rs_rollback_test.cpp index 2b3fdc3e828..8c9cc3d89e4 100644 --- a/src/mongo/db/repl/rs_rollback_test.cpp +++ b/src/mongo/db/repl/rs_rollback_test.cpp @@ -1052,8 +1052,10 @@ TEST_F(RSRollbackTest, RollbackCommitIndexBuild) { ASSERT_EQUALS(1, numIndexesInProgress(_opCtx.get(), nss, coll)); // Kill the index build we just restarted so the fixture can shut down. - IndexBuildsCoordinator::get(_opCtx.get()) - ->abortIndexBuildByBuildUUID(_opCtx.get(), buildUUID, IndexBuildAction::kPrimaryAbort); + ASSERT_OK(_coordinator->setFollowerMode(MemberState::RS_ROLLBACK)); + ASSERT(IndexBuildsCoordinator::get(_opCtx.get()) + ->abortIndexBuildByBuildUUID( + _opCtx.get(), buildUUID, IndexBuildAction::kRollbackAbort, "")); } TEST_F(RSRollbackTest, RollbackAbortIndexBuild) { @@ -1102,8 +1104,10 @@ TEST_F(RSRollbackTest, RollbackAbortIndexBuild) { ASSERT_EQUALS(1, numIndexesInProgress(_opCtx.get(), nss, coll)); // Kill the index build we just restarted so the fixture can shut down. - IndexBuildsCoordinator::get(_opCtx.get()) - ->abortIndexBuildByBuildUUID(_opCtx.get(), buildUUID, IndexBuildAction::kPrimaryAbort); + ASSERT_OK(_coordinator->setFollowerMode(MemberState::RS_ROLLBACK)); + ASSERT(IndexBuildsCoordinator::get(_opCtx.get()) + ->abortIndexBuildByBuildUUID( + _opCtx.get(), buildUUID, IndexBuildAction::kRollbackAbort, "")); } TEST_F(RSRollbackTest, AbortedIndexBuildsAreRestarted) { @@ -1157,8 +1161,10 @@ TEST_F(RSRollbackTest, AbortedIndexBuildsAreRestarted) { ASSERT_EQUALS(1, numIndexesInProgress(_opCtx.get(), nss, coll)); // Kill the index build we just restarted so the fixture can shut down. 
- IndexBuildsCoordinator::get(_opCtx.get()) - ->abortIndexBuildByBuildUUID(_opCtx.get(), buildUUID, IndexBuildAction::kPrimaryAbort); + ASSERT_OK(_coordinator->setFollowerMode(MemberState::RS_ROLLBACK)); + ASSERT(IndexBuildsCoordinator::get(_opCtx.get()) + ->abortIndexBuildByBuildUUID( + _opCtx.get(), buildUUID, IndexBuildAction::kRollbackAbort, "")); } TEST_F(RSRollbackTest, AbortedIndexBuildsAreNotRestartedWhenStartIsRolledBack) { diff --git a/src/mongo/db/repl_index_build_state.h b/src/mongo/db/repl_index_build_state.h index ef83160f64d..7b1fd07bf7f 100644 --- a/src/mongo/db/repl_index_build_state.h +++ b/src/mongo/db/repl_index_build_state.h @@ -98,32 +98,41 @@ enum class IndexBuildAction { * Represents the index build state. * Valid State transition for primary: * =================================== - * kNone ---> kAborted. - * kNone ---> kPrepareAbort ---> kAborted. + * kSetup -> kInProgress + * kInProgress -> kAborted // An index build failed due to an indexing error or was aborted + * externally. + * kInProgress -> kCommitted // An index build was committed + * * Valid State transition for secondaries: * ======================================= - * kNone ---> kPrepareCommit ---> kCommitted. - * kNone ---> kPrepareAbort ---> kAborted. + * kSetup -> kInProgress + * kInProgress -> kAborted // An index build received an abort oplog entry + * kInProgress -> kPrepareCommit -> kCommitted // An index build received a commit oplog entry */ class IndexBuildState { public: enum StateFlag { - kNone = 1 << 0, + kSetup = 1 << 0, + /** + * Once an index build is in-progress it is eligible for being aborted by an external + * thread. The kSetup state prevents other threads from observing an inconsistent state of + * a build until it transitions to kInProgress. + */ + kInProgress = 1 << 1, /** - * Below state indicates that indexBuildscoordinator thread was externally asked either to - * commit or abort. 
Oplog applier, rollback, createIndexes command and drop - * databases/collections/indexes cmds can change this state to kPrepareCommit or - * kPrepareAbort. + * Below state indicates that IndexBuildsCoordinator thread was externally asked to commit. + * For kPrepareCommit, this can come from an oplog entry. */ - kPrepareCommit = 1 << 1, - kPrepareAbort = 1 << 2, + kPrepareCommit = 1 << 2, /** - * Below state indicates that index build was successfully able to commit or abort. And, - * it's yet to generate the commitIndexBuild or abortIndexBuild oplog entry respectively. + * Below state indicates that index build was successfully able to commit or abort. For + * kCommitted, the state is set immediately before it commits the index build. For + * kAborted, this state is set after the build is cleaned up and the abort oplog entry is + * replicated. */ kCommitted = 1 << 3, - kAborted = 1 << 4 + kAborted = 1 << 4, }; using StateSet = int; @@ -132,8 +141,9 @@ public: } bool checkIfValidTransition(StateFlag newState) { - if (_state == kNone || (_state == kPrepareCommit && newState == kCommitted) || - (_state == kPrepareAbort && newState == kAborted)) { + if ((_state == kSetup && newState == kInProgress) || + (_state == kInProgress && newState != kSetup) || + (_state == kPrepareCommit && newState == kCommitted)) { return true; } return false; @@ -143,8 +153,6 @@ public: bool skipCheck, boost::optional<Timestamp> timestamp = boost::none, boost::optional<std::string> abortReason = boost::none) { - // TODO SERVER-46560: Should remove the hard-coded value skipCheck 'true'. 
- skipCheck = true; if (!skipCheck) { invariant(checkIfValidTransition(state), str::stream() << "current state :" << toString(_state) @@ -154,7 +162,7 @@ public: if (timestamp) _timestamp = timestamp; if (abortReason) { - invariant(_state == kPrepareAbort); + invariant(_state == kAborted); _abortReason = abortReason; } } @@ -167,14 +175,14 @@ public: return _state == kCommitted; } - bool isAbortPrepared() const { - return _state == kPrepareAbort; - } - bool isAborted() const { return _state == kAborted; } + bool isSettingUp() const { + return _state == kSetup; + } + boost::optional<Timestamp> getTimestamp() const { return _timestamp; } @@ -189,14 +197,14 @@ public: static std::string toString(StateFlag state) { switch (state) { - case kNone: - return "in-progress"; + case kSetup: + return "Setting up"; + case kInProgress: + return "In progress"; case kPrepareCommit: return "Prepare commit"; case kCommitted: return "Committed"; - case kPrepareAbort: - return "Prepare abort"; case kAborted: return "Aborted"; } @@ -205,7 +213,7 @@ public: private: // Represents the index build state. - StateFlag _state = kNone; + StateFlag _state = kSetup; // Timestamp will be populated only if the node is secondary. // It represents the commit or abort timestamp communicated via // commitIndexBuild and abortIndexBuild oplog entry. @@ -263,6 +271,10 @@ struct ReplIndexBuildState { // Protects the state below. mutable Mutex mutex = MONGO_MAKE_LATCH("ReplIndexBuildState::mutex"); + // The OperationId of the index build. This allows external callers to interrupt the index build + // thread. 
+ OperationId opId; + /* * Readers who read the commit quorum value from "config.system.indexBuilds" collection * to decide if the commit quorum got satisfied for an index build, should take this lock in diff --git a/src/mongo/dbtests/dbtests.cpp b/src/mongo/dbtests/dbtests.cpp index 56612d5c9d5..09d93729855 100644 --- a/src/mongo/dbtests/dbtests.cpp +++ b/src/mongo/dbtests/dbtests.cpp @@ -108,8 +108,8 @@ Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj wunit.commit(); } MultiIndexBlock indexer; - ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(opCtx, coll, MultiIndexBlock::kNoopOnCleanUpFn); }); + auto abortOnExit = + makeGuard([&] { indexer.abortIndexBuild(opCtx, coll, MultiIndexBlock::kNoopOnCleanUpFn); }); Status status = indexer .init(opCtx, coll, @@ -140,6 +140,7 @@ Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj opCtx, coll, MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn)); ASSERT_OK(opCtx->recoveryUnit()->setTimestamp(Timestamp(1, 1))); wunit.commit(); + abortOnExit.dismiss(); return Status::OK(); } diff --git a/src/mongo/dbtests/indexupdatetests.cpp b/src/mongo/dbtests/indexupdatetests.cpp index f474310f530..dc6f23cc99f 100644 --- a/src/mongo/dbtests/indexupdatetests.cpp +++ b/src/mongo/dbtests/indexupdatetests.cpp @@ -91,8 +91,8 @@ protected: try { MultiIndexBlock indexer; - ON_BLOCK_EXIT([&] { - indexer.cleanUpAfterBuild(_opCtx, collection(), MultiIndexBlock::kNoopOnCleanUpFn); + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild(_opCtx, collection(), MultiIndexBlock::kNoopOnCleanUpFn); }); uassertStatusOK( @@ -104,6 +104,7 @@ protected: MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn)); wunit.commit(); + abortOnExit.dismiss(); } catch (const DBException& e) { if (ErrorCodes::isInterruption(e.code())) return true; @@ -156,8 +157,9 @@ public: << static_cast<int>(kIndexVersion) << "unique" << true << "background" << background); - 
ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(_opCtx, coll, MultiIndexBlock::kNoopOnCleanUpFn); }); + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild(_opCtx, collection(), MultiIndexBlock::kNoopOnCleanUpFn); + }); ASSERT_OK(indexer.init(_opCtx, coll, spec, MultiIndexBlock::kNoopOnInitFn).getStatus()); ASSERT_OK(indexer.insertAllDocumentsInCollection(_opCtx, coll)); @@ -167,6 +169,7 @@ public: ASSERT_OK(indexer.commit( _opCtx, coll, MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn)); wunit.commit(); + abortOnExit.dismiss(); } }; @@ -205,8 +208,9 @@ public: << "background" << background); collLk.emplace(_opCtx, _nss, LockMode::MODE_X); - ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(_opCtx, coll, MultiIndexBlock::kNoopOnCleanUpFn); }); + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild(_opCtx, collection(), MultiIndexBlock::kNoopOnCleanUpFn); + }); ASSERT_OK(indexer.init(_opCtx, coll, spec, MultiIndexBlock::kNoopOnInitFn).getStatus()); @@ -320,9 +324,8 @@ Status IndexBuildBase::createIndex(const BSONObj& indexSpec) { Lock::CollectionLock collLk(_opCtx, _nss, MODE_X); MultiIndexBlock indexer; - ON_BLOCK_EXIT([&] { - indexer.cleanUpAfterBuild(_opCtx, collection(), MultiIndexBlock::kNoopOnCleanUpFn); - }); + auto abortOnExit = makeGuard( + [&] { indexer.abortIndexBuild(_opCtx, collection(), MultiIndexBlock::kNoopOnCleanUpFn); }); Status status = indexer.init(_opCtx, collection(), indexSpec, MultiIndexBlock::kNoopOnInitFn).getStatus(); if (status == ErrorCodes::IndexAlreadyExists) { @@ -345,6 +348,7 @@ Status IndexBuildBase::createIndex(const BSONObj& indexSpec) { MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn)); wunit.commit(); + abortOnExit.dismiss(); return Status::OK(); } diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp index 6f7d96ef33f..d017dcd2311 100644 --- a/src/mongo/dbtests/querytests.cpp +++ b/src/mongo/dbtests/querytests.cpp @@ -104,8 +104,8 @@ 
protected: auto specObj = builder.obj(); MultiIndexBlock indexer; - ON_BLOCK_EXIT([&] { - indexer.cleanUpAfterBuild(&_opCtx, _collection, MultiIndexBlock::kNoopOnCleanUpFn); + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild(&_opCtx, _collection, MultiIndexBlock::kNoopOnCleanUpFn); }); { WriteUnitOfWork wunit(&_opCtx); @@ -127,6 +127,7 @@ protected: MultiIndexBlock::kNoopOnCommitFn)); wunit.commit(); } + abortOnExit.dismiss(); } void insert(const char* s) { diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index 67be7eb3eb8..58925890921 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -265,8 +265,8 @@ public: // Build an index. MultiIndexBlock indexer; - ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(_opCtx, coll, MultiIndexBlock::kNoopOnCleanUpFn); }); + auto abortOnExit = makeGuard( + [&] { indexer.abortIndexBuild(_opCtx, coll, MultiIndexBlock::kNoopOnCleanUpFn); }); BSONObj indexInfoObj; { @@ -297,6 +297,7 @@ public: // MultiIndexBlock. wuow.commit(); } + abortOnExit.dismiss(); } std::int32_t itCount(Collection* coll) { @@ -1872,8 +1873,8 @@ public: // Build an index on `{a: 1}`. This index will be multikey. MultiIndexBlock indexer; - ON_BLOCK_EXIT([&] { - indexer.cleanUpAfterBuild( + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild( _opCtx, autoColl.getCollection(), MultiIndexBlock::kNoopOnCleanUpFn); }); const LogicalTime beforeIndexBuild = _clock->reserveTicks(2); @@ -1930,6 +1931,7 @@ public: MultiIndexBlock::kNoopOnCommitFn)); wuow.commit(); } + abortOnExit.dismiss(); const Timestamp afterIndexBuild = _clock->reserveTicks(1).asTimestamp(); @@ -1984,8 +1986,8 @@ public: // Build an index on `{a: 1}`. 
MultiIndexBlock indexer; - ON_BLOCK_EXIT([&] { - indexer.cleanUpAfterBuild( + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild( _opCtx, autoColl.getCollection(), MultiIndexBlock::kNoopOnCleanUpFn); }); const LogicalTime beforeIndexBuild = _clock->reserveTicks(2); @@ -2116,6 +2118,7 @@ public: MultiIndexBlock::kNoopOnCommitFn)); wuow.commit(); } + abortOnExit.dismiss(); } }; @@ -2653,8 +2656,8 @@ public: const IndexCatalogEntry* buildingIndex = nullptr; MultiIndexBlock indexer; - ON_BLOCK_EXIT([&] { - indexer.cleanUpAfterBuild(_opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); + auto abortOnExit = makeGuard([&] { + indexer.abortIndexBuild(_opCtx, collection, MultiIndexBlock::kNoopOnCleanUpFn); }); // Provide a build UUID, indicating that this is a two-phase index build. @@ -2767,6 +2770,7 @@ public: MultiIndexBlock::kNoopOnCommitFn)); wuow.commit(); } + abortOnExit.dismiss(); } }; diff --git a/src/mongo/dbtests/wildcard_multikey_persistence_test.cpp b/src/mongo/dbtests/wildcard_multikey_persistence_test.cpp index a0b079255d8..679a39ed254 100644 --- a/src/mongo/dbtests/wildcard_multikey_persistence_test.cpp +++ b/src/mongo/dbtests/wildcard_multikey_persistence_test.cpp @@ -200,8 +200,8 @@ protected: auto coll = autoColl.getCollection(); MultiIndexBlock indexer; - ON_BLOCK_EXIT( - [&] { indexer.cleanUpAfterBuild(opCtx(), coll, MultiIndexBlock::kNoopOnCleanUpFn); }); + auto abortOnExit = makeGuard( + [&] { indexer.abortIndexBuild(opCtx(), coll, MultiIndexBlock::kNoopOnCleanUpFn); }); // Initialize the index builder and add all documents currently in the collection. ASSERT_OK( @@ -212,6 +212,7 @@ protected: WriteUnitOfWork wunit(opCtx()); ASSERT_OK(indexer.commit( opCtx(), coll, MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn)); + abortOnExit.dismiss(); wunit.commit(); } |