diff options
23 files changed, 396 insertions, 206 deletions
diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 8ecf15dd47f..b61822bf1d0 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -11226,6 +11226,7 @@ buildvariants: - name: replica_sets distros: - rhel62-large + - name: rollback_fuzzer_gen - name: sharding_gen - name: enterprise-rhel-62-64-bit-coverage diff --git a/jstests/noPassthrough/backup_restore_fsync_lock.js b/jstests/noPassthrough/backup_restore_fsync_lock.js index f6f720377aa..57b12a6ab24 100644 --- a/jstests/noPassthrough/backup_restore_fsync_lock.js +++ b/jstests/noPassthrough/backup_restore_fsync_lock.js @@ -11,9 +11,12 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. + * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works + * for two-phase index builds. * @tags: [ * requires_persistence, * requires_replication, + * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/backup_restore_rolling.js b/jstests/noPassthrough/backup_restore_rolling.js index 63f6b1a7cd2..358f33b0194 100644 --- a/jstests/noPassthrough/backup_restore_rolling.js +++ b/jstests/noPassthrough/backup_restore_rolling.js @@ -11,9 +11,12 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. + * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works + * for two-phase index builds. 
* @tags: [ * requires_persistence, * requires_wiredtiger, + * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/backup_restore_stop_start.js b/jstests/noPassthrough/backup_restore_stop_start.js index 2598b8670ab..71ad88c7dd5 100644 --- a/jstests/noPassthrough/backup_restore_stop_start.js +++ b/jstests/noPassthrough/backup_restore_stop_start.js @@ -11,9 +11,12 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. + * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works + * for two-phase index builds. * @tags: [ * requires_persistence, * requires_replication, + * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/characterize_index_builds_on_restart.js b/jstests/noPassthrough/characterize_index_builds_on_restart.js index b5520883a4f..64c2ba5fe91 100644 --- a/jstests/noPassthrough/characterize_index_builds_on_restart.js +++ b/jstests/noPassthrough/characterize_index_builds_on_restart.js @@ -2,10 +2,13 @@ * Characterizes the actions (rebuilds or drops the index) taken upon unfinished indexes when * restarting mongod from (standalone -> standalone) and (replica set member -> standalone). * + * TODO(SERVER-44468): Remove two_phase_index_builds_unsupported tag when standalone startup works + * with two-phase builds * @tags: [ * requires_majority_read_concern, * requires_persistence, * requires_replication, + * two_phase_index_builds_unsupported, * ] */ (function() { diff --git a/jstests/noPassthrough/indexbg_shutdown.js b/jstests/noPassthrough/indexbg_shutdown.js index ae7d1f3c3e6..4e9bc9cf016 100644 --- a/jstests/noPassthrough/indexbg_shutdown.js +++ b/jstests/noPassthrough/indexbg_shutdown.js @@ -3,7 +3,13 @@ * shuts down a secondary while it's building that index, and confirms that the secondary * shuts down cleanly, without an fassert. 
* Also confirms that killOp has no effect on the background index build on the secondary. - * @tags: [requires_replication] + * + * TODO(SERVER-44467): Remove two_phase_index_builds_unsupported tag when startup recovery works + * for two-phase index builds. + * @tags: [ + * requires_replication, + * two_phase_index_builds_unsupported, + * ] */ (function() { diff --git a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js index 2e5d2e85bf6..e12973b7ff8 100644 --- a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js +++ b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js @@ -2,9 +2,18 @@ * Starts a replica set with arbiter, builds an index in background. * Kills the secondary once the index build starts with a failpoint. * The index build should resume when the secondary is restarted. + * + * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works + * for two-phase index builds. + * + * @tags: [ + * requires_persistence, + * requires_journaling, + * requires_replication, + * two_phase_index_builds_unsupported, + * ] */ -// @tags: [requires_persistence, requires_journaling, requires_replication] (function() { 'use strict'; diff --git a/jstests/replsets/rollback_waits_for_bgindex_completion.js b/jstests/replsets/rollback_waits_for_bgindex_completion.js index 892ddcd34e2..7bfb36ba9ad 100644 --- a/jstests/replsets/rollback_waits_for_bgindex_completion.js +++ b/jstests/replsets/rollback_waits_for_bgindex_completion.js @@ -2,12 +2,10 @@ * Test to ensure that rollback waits for in-progress background index builds to finish before * starting the rollback process. Only applies to Recoverable Rollback via WiredTiger checkpoints. * - * TODO(SERVER-39451): Remove two_phase_index_builds_unsupported tag. 
* @tags: [ * requires_wiredtiger, * requires_journaling, * requires_majority_read_concern, - * two_phase_index_builds_unsupported, * ] */ (function() { diff --git a/src/mongo/db/catalog/catalog_control.cpp b/src/mongo/db/catalog/catalog_control.cpp index 916deb9f2c7..bbcea9f741c 100644 --- a/src/mongo/db/catalog/catalog_control.cpp +++ b/src/mongo/db/catalog/catalog_control.cpp @@ -110,13 +110,12 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib storageEngine->loadCatalog(opCtx); log() << "openCatalog: reconciling catalog and idents"; - auto indexesToRebuild = storageEngine->reconcileCatalogAndIdents(opCtx); - fassert(40688, indexesToRebuild.getStatus()); + auto reconcileResult = fassert(40688, storageEngine->reconcileCatalogAndIdents(opCtx)); // Determine which indexes need to be rebuilt. rebuildIndexesOnCollection() requires that all // indexes on that collection are done at once, so we use a map to group them together. StringMap<IndexNameObjs> nsToIndexNameObjMap; - for (StorageEngine::IndexIdentifier indexIdentifier : indexesToRebuild.getValue()) { + for (StorageEngine::IndexIdentifier indexIdentifier : reconcileResult.indexesToRebuild) { auto indexName = indexIdentifier.indexName; auto indexSpecs = getIndexNameObjs(opCtx, @@ -158,6 +157,13 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib fassert(40690, rebuildIndexesOnCollection(opCtx, collection, indexSpecs)); } + // Once all unfinished index builds have been dropped and the catalog has been reloaded, restart + // any unfinished index builds. This will not restart any index builds to completion, but rather + // start the background thread, build the index, and wait for a replicated commit or abort oplog + // entry. + IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery( + opCtx, reconcileResult.indexBuildsToRestart); + // Open all databases and repopulate the CollectionCatalog. 
log() << "openCatalog: reopening all databases"; auto databaseHolder = DatabaseHolder::get(opCtx); @@ -187,6 +193,7 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib } } } + // Opening CollectionCatalog: The collection catalog is now in sync with the storage engine // catalog. Clear the pre-closing state. CollectionCatalog::get(opCtx).onOpenCatalog(opCtx); diff --git a/src/mongo/db/catalog/catalog_control_test.cpp b/src/mongo/db/catalog/catalog_control_test.cpp index 21740aa1b12..d2527e7efb9 100644 --- a/src/mongo/db/catalog/catalog_control_test.cpp +++ b/src/mongo/db/catalog/catalog_control_test.cpp @@ -148,9 +148,9 @@ public: } void setCachePressureForTest(int pressure) final {} void triggerJournalFlush() const final {} - StatusWith<std::vector<IndexIdentifier>> reconcileCatalogAndIdents( + StatusWith<StorageEngine::ReconcileResult> reconcileCatalogAndIdents( OperationContext* opCtx) final { - return std::vector<IndexIdentifier>(); + return ReconcileResult{}; } Timestamp getAllDurableTimestamp() const final { return {}; diff --git a/src/mongo/db/catalog/index_builds_manager.cpp b/src/mongo/db/catalog/index_builds_manager.cpp index 952fb1c05b9..0c85b725664 100644 --- a/src/mongo/db/catalog/index_builds_manager.cpp +++ b/src/mongo/db/catalog/index_builds_manager.cpp @@ -109,13 +109,8 @@ Status IndexBuildsManager::setUpIndexBuild(OperationContext* opCtx, return ex.toStatus(); } - if (options.forRecovery) { - log() << "Index build initialized: " << buildUUID << ": " << nss - << ": indexes: " << indexes.size(); - } else { - log() << "Index build initialized: " << buildUUID << ": " << nss << " (" - << collection->uuid() << " ): indexes: " << indexes.size(); - } + log() << "Index build initialized: " << buildUUID << ": " << nss << " (" << collection->uuid() + << " ): indexes: " << indexes.size(); return Status::OK(); } diff --git a/src/mongo/db/catalog/index_builds_manager.h b/src/mongo/db/catalog/index_builds_manager.h index 
1843177d5ae..c16b5b57af4 100644 --- a/src/mongo/db/catalog/index_builds_manager.h +++ b/src/mongo/db/catalog/index_builds_manager.h @@ -69,7 +69,6 @@ public: SetupOptions(); IndexConstraints indexConstraints = IndexConstraints::kEnforce; IndexBuildProtocol protocol = IndexBuildProtocol::kSinglePhase; - bool forRecovery = false; }; IndexBuildsManager() = default; diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index ed3e8f7e176..00e4628cb46 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -314,12 +314,30 @@ bool IndexBuildsCoordinator::supportsTwoPhaseIndexBuild() const { return true; } -StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::startIndexRebuildForRecovery( +StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::rebuildIndexesForRecovery( OperationContext* opCtx, const NamespaceString& nss, const std::vector<BSONObj>& specs, const UUID& buildUUID) { - // Index builds in recovery mode have the global write lock. + + const auto protocol = IndexBuildProtocol::kSinglePhase; + auto status = _startIndexBuildForRecovery(opCtx, nss, specs, buildUUID, protocol); + if (!status.isOK()) { + return status; + } + + auto& collectionCatalog = CollectionCatalog::get(getGlobalServiceContext()); + Collection* collection = collectionCatalog.lookupCollectionByNamespace(nss); + + // Complete the index build. 
+ return _runIndexRebuildForRecovery(opCtx, collection, buildUUID); +} + +Status IndexBuildsCoordinator::_startIndexBuildForRecovery(OperationContext* opCtx, + const NamespaceString& nss, + const std::vector<BSONObj>& specs, + const UUID& buildUUID, + IndexBuildProtocol protocol) { invariant(opCtx->lockState()->isW()); std::vector<std::string> indexNames; @@ -334,53 +352,57 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::startIndexRe indexNames.push_back(name); } - ReplIndexBuildState::IndexCatalogStats indexCatalogStats; - auto& collectionCatalog = CollectionCatalog::get(getGlobalServiceContext()); Collection* collection = collectionCatalog.lookupCollectionByNamespace(nss); auto indexCatalog = collection->getIndexCatalog(); { // These steps are combined into a single WUOW to ensure there are no commits without // the indexes. - // 1) Drop all indexes. - // 2) Re-create the Collection. - // 3) Start the index build process. + // 1) Drop all unfinished indexes. + // 2) Start, but do not complete the index build process. WriteUnitOfWork wuow(opCtx); - // 1 - { - for (size_t i = 0; i < indexNames.size(); i++) { - auto descriptor = indexCatalog->findIndexByName(opCtx, indexNames[i], false); - if (!descriptor) { - // If it's unfinished index, drop it directly via removeIndex. - Status status = DurableCatalog::get(opCtx)->removeIndex( - opCtx, collection->getCatalogId(), indexNames[i]); - continue; - } - Status s = indexCatalog->dropIndex(opCtx, descriptor); - if (!s.isOK()) { - return s; + for (size_t i = 0; i < indexNames.size(); i++) { + const bool includeUnfinished = false; + auto descriptor = + indexCatalog->findIndexByName(opCtx, indexNames[i], includeUnfinished); + if (!descriptor) { + const auto durableBuildUUID = DurableCatalog::get(opCtx)->getIndexBuildUUID( + opCtx, collection->getCatalogId(), indexNames[i]); + + // A build UUID is present if and only if we are rebuilding a two-phase build. 
+ invariant((protocol == IndexBuildProtocol::kTwoPhase) == + durableBuildUUID.is_initialized()); + // When a buildUUID is present, it must match the build UUID parameter to this + // function. + invariant(!durableBuildUUID || *durableBuildUUID == buildUUID, + str::stream() << "durable build UUID: " << durableBuildUUID + << " buildUUID: " << buildUUID); + + // If it's an unfinished index, drop it directly via removeIndex. + Status status = DurableCatalog::get(opCtx)->removeIndex( + opCtx, collection->getCatalogId(), indexNames[i]); + if (!status.isOK()) { + return status; + } + continue; + } + Status s = indexCatalog->dropIndex(opCtx, descriptor); + if (!s.isOK()) { + return s; + } } } - // We need to initialize the collection to drop and rebuild the indexes. + // We need to initialize the collection to rebuild the indexes. collection->init(opCtx); - // Register the index build. During recovery, collections may not have UUIDs present yet to - // due upgrading. We don't require collection UUIDs during recovery except to create a - // ReplIndexBuildState object. - auto collectionUUID = UUID::gen(); auto dbName = nss.db().toString(); - - // We run the index build using the single phase protocol as we already hold the global - // write lock. auto replIndexBuildState = std::make_shared<ReplIndexBuildState>(buildUUID, - collectionUUID, + collection->uuid(), dbName, specs, - IndexBuildProtocol::kSinglePhase, + protocol, /*commitQuorum=*/boost::none); Status status = [&]() { @@ -391,12 +413,7 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::startIndexRe return status; }
- indexCatalogStats.numIndexesBefore = - _getNumIndexesTotal(opCtx, collection) + indexNames.size(); - IndexBuildsManager::SetupOptions options; - options.forRecovery = true; status = _indexBuildsManager.setUpIndexBuild( opCtx, collection, specs, buildUUID, MultiIndexBlock::kNoopOnInitFn, options); if (!status.isOK()) { @@ -408,7 +425,7 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::startIndexRe wuow.commit(); } - return _runIndexRebuildForRecovery(opCtx, collection, indexCatalogStats, buildUUID); + return Status::OK(); } void IndexBuildsCoordinator::joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID) { @@ -526,7 +543,10 @@ void IndexBuildsCoordinator::onStepUp(OperationContext* opCtx) { void IndexBuildsCoordinator::onRollback(OperationContext* opCtx) { log() << "IndexBuildsCoordinator::onRollback - this node is entering the rollback state"; auto indexBuilds = _getIndexBuilds(); - auto onIndexBuild = [](std::shared_ptr<ReplIndexBuildState> replState) { + auto onIndexBuild = [this](std::shared_ptr<ReplIndexBuildState> replState) { + const std::string reason = "rollback"; + _indexBuildsManager.abortIndexBuild(replState->buildUUID, reason); + stdx::unique_lock<Latch> lk(replState->mutex); if (!replState->aborted) { // Leave abort timestamp as null. This will unblock the index build and allow it to @@ -535,15 +555,42 @@ void IndexBuildsCoordinator::onRollback(OperationContext* opCtx) { invariant(replState->abortTimestamp.isNull(), replState->buildUUID.toString()); invariant(!replState->aborted, replState->buildUUID.toString()); replState->aborted = true; - replState->abortReason = "rollback"; + replState->abortReason = reason; replState->condVar.notify_all(); } }; forEachIndexBuild(indexBuilds, "IndexBuildsCoordinator::onRollback - "_sd, onIndexBuild); } -void IndexBuildsCoordinator::recoverIndexBuilds() { - // TODO: not yet implemented. 
+void IndexBuildsCoordinator::restartIndexBuildsForRecovery( + OperationContext* opCtx, + const std::map<UUID, StorageEngine::IndexBuildToRestart>& buildsToRestart) { + for (auto& [buildUUID, build] : buildsToRestart) { + boost::optional<NamespaceString> nss = + CollectionCatalog::get(opCtx).lookupNSSByUUID(build.collUUID); + invariant(nss); + + log() << "Restarting index build for collection: " << *nss + << ", collection UUID: " << build.collUUID << ", index build UUID: " << buildUUID; + + IndexBuildsCoordinator::IndexBuildOptions indexBuildOptions; + // Start the index build as if in secondary oplog application. + indexBuildOptions.replSetAndNotPrimaryAtStart = true; + // Indicate that the initialization should not generate oplog entries or timestamps for the + // first catalog write, and that the original durable catalog entries should be dropped and + // replaced. + indexBuildOptions.twoPhaseRecovery = true; + // This spawns a new thread and returns immediately. These index builds will start and wait + // for a commit or abort to be replicated. + MONGO_COMPILER_VARIABLE_UNUSED auto fut = + uassertStatusOK(startIndexBuild(opCtx, + nss->db(), + build.collUUID, + build.indexSpecs, + buildUUID, + IndexBuildProtocol::kTwoPhase, + indexBuildOptions)); + } } int IndexBuildsCoordinator::numInProgForDb(StringData db) const { @@ -873,6 +920,25 @@ void IndexBuildsCoordinator::_unregisterIndexBuild( invariant(_allIndexBuilds.erase(replIndexBuildState->buildUUID)); } +Status IndexBuildsCoordinator::_registerAndSetUpIndexBuildForTwoPhaseRecovery( + OperationContext* opCtx, + StringData dbName, + CollectionUUID collectionUUID, + const std::vector<BSONObj>& specs, + const UUID& buildUUID) { + NamespaceStringOrUUID nssOrUuid{dbName.toString(), collectionUUID}; + + // Don't use the AutoGet helpers because they require an open database, which may not be the + // case when an index build is restarted during recovery. 
+ Lock::DBLock dbLock(opCtx, dbName, MODE_IX); + Lock::CollectionLock collLock(opCtx, nssOrUuid, MODE_X); + auto collection = CollectionCatalog::get(opCtx).lookupCollectionByUUID(collectionUUID); + invariant(collection); + const auto& nss = collection->ns(); + const auto protocol = IndexBuildProtocol::kTwoPhase; + return _startIndexBuildForRecovery(opCtx, nss, specs, buildUUID, protocol); +} + StatusWith<boost::optional<SharedSemiFuture<ReplIndexBuildState::IndexCatalogStats>>> IndexBuildsCoordinator::_registerAndSetUpIndexBuild( OperationContext* opCtx, @@ -931,35 +997,13 @@ IndexBuildsCoordinator::_registerAndSetUpIndexBuild( } MultiIndexBlock::OnInitFn onInitFn; - // Two-phase index builds write a different oplog entry than the default behavior which - // writes a no-op just to generate an optime. if (serverGlobalParams.featureCompatibility.isVersionInitialized() && serverGlobalParams.featureCompatibility.getVersion() == ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo44) { - onInitFn = [&](std::vector<BSONObj>& specs) { - // TODO (SERVER-40807): disabling the following code for the v4.2 release so it does not - // have downstream impact. - /* - // Only the primary node writes an index build entry to the collection as the - // secondaries will replicate it. - if (replCoord->canAcceptWritesFor(opCtx, nss)) { - invariant(replIndexBuildState->commitQuorum); - std::vector<std::string> indexNames; - for (const auto& spec : specs) { - indexNames.push_back(spec.getStringField(IndexDescriptor::kIndexNameFieldName)); - } - - IndexBuildEntry entry(replIndexBuildState->buildUUID, - *collection->uuid(), - *replIndexBuildState->commitQuorum, - indexNames); - Status status = addIndexBuildEntry(opCtx, entry); - if (!status.isOK()) { - return status; - } - } - */ + // Two-phase index builds write a different oplog entry than the default behavior which + // writes a no-op just to generate an optime. 
+ onInitFn = [&](std::vector<BSONObj>& specs) { opCtx->getServiceContext()->getOpObserver()->onStartIndexBuild( opCtx, nss, @@ -980,6 +1024,7 @@ IndexBuildsCoordinator::_registerAndSetUpIndexBuild( ? IndexBuildsManager::IndexConstraints::kRelax : IndexBuildsManager::IndexConstraints::kEnforce; options.protocol = protocol; + status = _indexBuildsManager.setUpIndexBuild( opCtx, collection, filteredSpecs, replIndexBuildState->buildUUID, onInitFn, options); @@ -1536,10 +1581,7 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesAndCommit( } StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::_runIndexRebuildForRecovery( - OperationContext* opCtx, - Collection* collection, - ReplIndexBuildState::IndexCatalogStats& indexCatalogStats, - const UUID& buildUUID) noexcept { + OperationContext* opCtx, Collection* collection, const UUID& buildUUID) noexcept { // Index builds in recovery mode have the global write lock. invariant(opCtx->lockState()->isW()); @@ -1555,6 +1597,9 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::_runIndexReb long long numRecords = 0; long long dataSize = 0; + ReplIndexBuildState::IndexCatalogStats indexCatalogStats; + indexCatalogStats.numIndexesBefore = _getNumIndexesTotal(opCtx, collection); + try { log() << "Index builds manager starting: " << buildUUID << ": " << nss; diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h index 76ed8f74945..9183a27795f 100644 --- a/src/mongo/db/index_builds_coordinator.h +++ b/src/mongo/db/index_builds_coordinator.h @@ -75,6 +75,7 @@ public: struct IndexBuildOptions { boost::optional<CommitQuorumOptions> commitQuorum; bool replSetAndNotPrimaryAtStart = false; + bool twoPhaseRecovery = false; }; /** @@ -125,14 +126,22 @@ public: IndexBuildOptions indexBuildOptions) = 0; /** - * Sets up the in-memory and persisted state of the index build. 
- * - * This function should only be called when in recovery mode, because we create new Collection - * objects and replace old ones after dropping existing indexes. + * Given a vector of two-phase index builds, start, but do not complete each one in a background + * thread. Each index build will wait for a replicated commit or abort, as in steady-state + * replication. + */ + void restartIndexBuildsForRecovery( + OperationContext* opCtx, + const std::map<UUID, StorageEngine::IndexBuildToRestart>& buildsToRestart); + + /** + * Runs the full index rebuild for recovery. This will only rebuild single-phase index builds. + * Rebuilding an index in recovery mode verifies each document to ensure that it is a valid + * BSON object. It will remove any documents with invalid BSON. * * Returns the number of records and the size of the data iterated over, if successful. */ - StatusWith<std::pair<long long, long long>> startIndexRebuildForRecovery( + StatusWith<std::pair<long long, long long>> rebuildIndexesForRecovery( OperationContext* opCtx, const NamespaceString& nss, const std::vector<BSONObj>& specs, @@ -370,6 +379,18 @@ private: */ Status _registerIndexBuild(WithLock, std::shared_ptr<ReplIndexBuildState> replIndexBuildState); + /** + * Sets up the in-memory and persisted state of the index build. + * + * This function should only be called when in recovery mode, because we drop and replace + * existing indexes in a single WriteUnitOfWork. + */ + Status _startIndexBuildForRecovery(OperationContext* opCtx, + const NamespaceString& nss, + const std::vector<BSONObj>& specs, + const UUID& buildUUID, + IndexBuildProtocol protocol); + protected: /** * Unregisters the index build. @@ -395,6 +416,16 @@ protected: boost::optional<CommitQuorumOptions> commitQuorum); /** + * Sets up the in-memory and persisted state of the index build for two-phase recovery. + * + * Helper function for startIndexBuild during the two-phase index build recovery process. 
+ */ + Status _registerAndSetUpIndexBuildForTwoPhaseRecovery(OperationContext* opCtx, + StringData dbName, + CollectionUUID collectionUUID, + const std::vector<BSONObj>& specs, + const UUID& buildUUID); + /** * Runs the index build on the caller thread. Handles unregistering the index build and setting * the index build's Promise with the outcome of the index build. * 'IndexBuildOptios::replSetAndNotPrimary' is determined at the start of the index build. @@ -526,10 +557,7 @@ protected: * Returns the number of records and the size of the data iterated over, if successful. */ StatusWith<std::pair<long long, long long>> _runIndexRebuildForRecovery( - OperationContext* opCtx, - Collection* collection, - ReplIndexBuildState::IndexCatalogStats& indexCatalogStats, - const UUID& buildUUID) noexcept; + OperationContext* opCtx, Collection* collection, const UUID& buildUUID) noexcept; /** * Looks up active index build by UUID. diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp index 2d205a452c4..bdcb58dcb41 100644 --- a/src/mongo/db/index_builds_coordinator_mongod.cpp +++ b/src/mongo/db/index_builds_coordinator_mongod.cpp @@ -106,19 +106,35 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx, // thread. stdx::unique_lock<Latch> lk(_startBuildMutex); - auto statusWithOptionalResult = _registerAndSetUpIndexBuild( - opCtx, dbName, collectionUUID, specs, buildUUID, protocol, indexBuildOptions.commitQuorum); - if (!statusWithOptionalResult.isOK()) { - return statusWithOptionalResult.getStatus(); - } + if (indexBuildOptions.twoPhaseRecovery) { + // Two phase index build recovery goes through a different set-up procedure because the + // original index will be dropped first. 
+ invariant(protocol == IndexBuildProtocol::kTwoPhase); + auto status = _registerAndSetUpIndexBuildForTwoPhaseRecovery( + opCtx, dbName, collectionUUID, specs, buildUUID); + if (!status.isOK()) { + return status; + } + } else { + auto statusWithOptionalResult = _registerAndSetUpIndexBuild(opCtx, + dbName, + collectionUUID, + specs, + buildUUID, + protocol, + indexBuildOptions.commitQuorum); + if (!statusWithOptionalResult.isOK()) { + return statusWithOptionalResult.getStatus(); + } - if (statusWithOptionalResult.getValue()) { - // TODO (SERVER-37644): when joining is implemented, the returned Future will no longer - // always be set. - invariant(statusWithOptionalResult.getValue()->isReady()); - // The requested index (specs) are already built or are being built. Return success early - // (this is v4.0 behavior compatible). - return statusWithOptionalResult.getValue().get(); + if (statusWithOptionalResult.getValue()) { + // TODO (SERVER-37644): when joining is implemented, the returned Future will no longer + // always be set. + invariant(statusWithOptionalResult.getValue()->isReady()); + // The requested index (specs) are already built or are being built. Return success + // early (this is v4.0 behavior compatible). + return statusWithOptionalResult.getValue().get(); + } } auto replState = invariant(_getIndexBuild(buildUUID)); diff --git a/src/mongo/db/repair_database.cpp b/src/mongo/db/repair_database.cpp index 19e5a8fe114..ced21031a10 100644 --- a/src/mongo/db/repair_database.cpp +++ b/src/mongo/db/repair_database.cpp @@ -117,8 +117,8 @@ Status rebuildIndexesOnCollection(OperationContext* opCtx, // Rebuild the indexes provided by 'indexSpecs'. 
IndexBuildsCoordinator* indexBuildsCoord = IndexBuildsCoordinator::get(opCtx); UUID buildUUID = UUID::gen(); - auto swRebuild = indexBuildsCoord->startIndexRebuildForRecovery( - opCtx, collection->ns(), indexSpecs, buildUUID); + auto swRebuild = + indexBuildsCoord->rebuildIndexesForRecovery(opCtx, collection->ns(), indexSpecs, buildUUID); if (!swRebuild.isOK()) { return swRebuild.getStatus(); } diff --git a/src/mongo/db/repair_database_and_check_version.cpp b/src/mongo/db/repair_database_and_check_version.cpp index 1995e67e60a..e5a0f51b588 100644 --- a/src/mongo/db/repair_database_and_check_version.cpp +++ b/src/mongo/db/repair_database_and_check_version.cpp @@ -264,10 +264,11 @@ void checkForCappedOplog(OperationContext* opCtx, Database* db) { } void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) { - std::vector<StorageEngine::IndexIdentifier> indexesToRebuild = - fassert(40593, storageEngine->reconcileCatalogAndIdents(opCtx)); + auto reconcileResult = fassert(40593, storageEngine->reconcileCatalogAndIdents(opCtx)); + // TODO (SERVER-44467): Restart two-phase index builds during startup + invariant(reconcileResult.indexBuildsToRestart.empty()); - if (!indexesToRebuild.empty() && serverGlobalParams.indexBuildRetry) { + if (!reconcileResult.indexesToRebuild.empty() && serverGlobalParams.indexBuildRetry) { log() << "note: restart the server with --noIndexBuildRetry " << "to skip index rebuilds"; } @@ -280,7 +281,7 @@ void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) { // Determine which indexes need to be rebuilt. rebuildIndexesOnCollection() requires that all // indexes on that collection are done at once, so we use a map to group them together. 
StringMap<IndexNameObjs> nsToIndexNameObjMap; - for (auto&& idxIdentifier : indexesToRebuild) { + for (auto&& idxIdentifier : reconcileResult.indexesToRebuild) { NamespaceString collNss = idxIdentifier.nss; const std::string& indexName = idxIdentifier.indexName; auto swIndexSpecs = diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp index ca25a04134c..24c6e1e8b63 100644 --- a/src/mongo/db/repl/rollback_impl.cpp +++ b/src/mongo/db/repl/rollback_impl.cpp @@ -424,6 +424,9 @@ StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const Oplog case OplogEntry::CommandType::kDrop: case OplogEntry::CommandType::kCreateIndexes: case OplogEntry::CommandType::kDropIndexes: + case OplogEntry::CommandType::kStartIndexBuild: + case OplogEntry::CommandType::kAbortIndexBuild: + case OplogEntry::CommandType::kCommitIndexBuild: case OplogEntry::CommandType::kCollMod: { // For all other command types, we should be able to parse the collection name from // the first command argument. @@ -435,12 +438,6 @@ StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const Oplog } break; } - // TODO(SERVER-39451): Ignore no-op startIndexBuild, abortIndexBuild, and - // commitIndexBuild commands. - // Revisit when we are ready to implement rollback logic. - case OplogEntry::CommandType::kStartIndexBuild: - case OplogEntry::CommandType::kAbortIndexBuild: - case OplogEntry::CommandType::kCommitIndexBuild: case OplogEntry::CommandType::kCommitTransaction: case OplogEntry::CommandType::kAbortTransaction: { break; diff --git a/src/mongo/db/storage/kv/storage_engine_test.cpp b/src/mongo/db/storage/kv/storage_engine_test.cpp index 93423116d78..5fb349a9a6b 100644 --- a/src/mongo/db/storage/kv/storage_engine_test.cpp +++ b/src/mongo/db/storage/kv/storage_engine_test.cpp @@ -67,7 +67,11 @@ TEST_F(StorageEngineTest, ReconcileIdentsTest) { // Create a table in the KVEngine not reflected in the DurableCatalog. 
This should be dropped // when reconciling. ASSERT_OK(createCollTable(opCtx.get(), NamespaceString("db.coll2"))); - ASSERT_OK(reconcile(opCtx.get()).getStatus()); + + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); + ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size()); + ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); + auto identsVec = getAllKVEngineIdents(opCtx.get()); auto idents = std::set<std::string>(identsVec.begin(), identsVec.end()); // There are two idents. `_mdb_catalog` and the ident for `db.coll1`. @@ -80,10 +84,11 @@ TEST_F(StorageEngineTest, ReconcileIdentsTest) { opCtx.get(), NamespaceString("db.coll1"), "_id", false /* isBackgroundSecondaryBuild */)); ASSERT_OK(dropIndexTable(opCtx.get(), NamespaceString("db.coll1"), "_id")); // The reconcile response should include this index as needing to be rebuilt. - auto reconcileStatus = reconcile(opCtx.get()); - ASSERT_OK(reconcileStatus.getStatus()); - ASSERT_EQUALS(static_cast<const unsigned long>(1), reconcileStatus.getValue().size()); - StorageEngine::IndexIdentifier& toRebuild = reconcileStatus.getValue()[0]; + reconcileResult = unittest::assertGet(reconcile(opCtx.get())); + ASSERT_EQUALS(1UL, reconcileResult.indexesToRebuild.size()); + ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); + + StorageEngine::IndexIdentifier& toRebuild = reconcileResult.indexesToRebuild[0]; ASSERT_EQUALS("db.coll1", toRebuild.nss.ns()); ASSERT_EQUALS("_id", toRebuild.indexName); @@ -92,7 +97,7 @@ TEST_F(StorageEngineTest, ReconcileIdentsTest) { ASSERT_EQUALS(static_cast<const unsigned long>(1), getAllKVEngineIdents(opCtx.get()).size()); // Reconciling this should result in an error. 
- reconcileStatus = reconcile(opCtx.get()); + auto reconcileStatus = reconcile(opCtx.get()); ASSERT_NOT_OK(reconcileStatus.getStatus()); ASSERT_EQUALS(ErrorCodes::UnrecoverableRollbackError, reconcileStatus.getStatus()); } @@ -131,7 +136,9 @@ TEST_F(StorageEngineTest, ReconcileDropsTemporary) { ASSERT(identExists(opCtx.get(), ident)); - ASSERT_OK(reconcile(opCtx.get()).getStatus()); + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); + ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size()); + ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); // The storage engine is responsible for dropping its temporary idents. ASSERT(!identExists(opCtx.get(), ident)); @@ -159,7 +166,7 @@ TEST_F(StorageEngineTest, TemporaryDropsItself) { ASSERT(!identExists(opCtx.get(), ident)); } -TEST_F(StorageEngineTest, ReconcileDoesNotDropIndexBuildTempTables) { +TEST_F(StorageEngineTest, ReconcileUnfinishedIndex) { auto opCtx = cc().makeOperationContext(); Lock::GlobalLock lk(&*opCtx, MODE_IS); @@ -170,38 +177,29 @@ TEST_F(StorageEngineTest, ReconcileDoesNotDropIndexBuildTempTables) { auto swCollInfo = createCollection(opCtx.get(), ns); ASSERT_OK(swCollInfo.getStatus()); - const bool isBackgroundSecondaryBuild = false; - ASSERT_OK(startIndexBuild(opCtx.get(), ns, indexName, isBackgroundSecondaryBuild)); - auto sideWrites = makeTemporary(opCtx.get()); - auto constraintViolations = makeTemporary(opCtx.get()); + // Start a non-backgroundSecondary single-phase (i.e. no build UUID) index.
+ const bool isBackgroundSecondaryBuild = false; + const boost::optional<UUID> buildUUID = boost::none; + ASSERT_OK(startIndexBuild(opCtx.get(), ns, indexName, isBackgroundSecondaryBuild, buildUUID)); const auto indexIdent = _storageEngine->getCatalog()->getIndexIdent( opCtx.get(), swCollInfo.getValue().catalogId, indexName); - indexBuildScan(opCtx.get(), - ns, - indexName, - sideWrites->rs()->getIdent(), - constraintViolations->rs()->getIdent()); - indexBuildDrain(opCtx.get(), ns, indexName); + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); - auto reconcileStatus = reconcile(opCtx.get()); - ASSERT_OK(reconcileStatus.getStatus()); + // Reconcile should have dropped the ident to allow the index to be rebuilt. ASSERT(!identExists(opCtx.get(), indexIdent)); - // Because this non-backgroundSecondary index is unfinished, reconcile will drop the index. - ASSERT_EQUALS(0UL, reconcileStatus.getValue().size()); - - // The owning index was dropped, and so should its temporary tables. - ASSERT(!identExists(opCtx.get(), sideWrites->rs()->getIdent())); - ASSERT(!identExists(opCtx.get(), constraintViolations->rs()->getIdent())); + // Because this non-backgroundSecondary index is unfinished, reconcile will drop the index and + // not require it to be rebuilt. + ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size()); - sideWrites->deleteTemporaryTable(opCtx.get()); - constraintViolations->deleteTemporaryTable(opCtx.get()); + // There are no two-phase builds to restart.
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); } -TEST_F(StorageEngineTest, ReconcileDoesNotDropIndexBuildTempTablesBackgroundSecondary) { +TEST_F(StorageEngineTest, ReconcileUnfinishedBackgroundSecondaryIndex) { auto opCtx = cc().makeOperationContext(); Lock::GlobalLock lk(&*opCtx, MODE_IS); @@ -212,40 +210,79 @@ TEST_F(StorageEngineTest, ReconcileDoesNotDropIndexBuildTempTablesBackgroundSeco auto swCollInfo = createCollection(opCtx.get(), ns); ASSERT_OK(swCollInfo.getStatus()); + // Start a backgroundSecondary single-phase (i.e. no build UUID) index. const bool isBackgroundSecondaryBuild = true; - ASSERT_OK(startIndexBuild(opCtx.get(), ns, indexName, isBackgroundSecondaryBuild)); - - auto sideWrites = makeTemporary(opCtx.get()); - auto constraintViolations = makeTemporary(opCtx.get()); + const boost::optional<UUID> buildUUID = boost::none; + ASSERT_OK(startIndexBuild(opCtx.get(), ns, indexName, isBackgroundSecondaryBuild, buildUUID)); const auto indexIdent = _storageEngine->getCatalog()->getIndexIdent( opCtx.get(), swCollInfo.getValue().catalogId, indexName); - indexBuildScan(opCtx.get(), - ns, - indexName, - sideWrites->rs()->getIdent(), - constraintViolations->rs()->getIdent()); - indexBuildDrain(opCtx.get(), ns, indexName); + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); - auto reconcileStatus = reconcile(opCtx.get()); - ASSERT_OK(reconcileStatus.getStatus()); + // Reconcile should not have dropped the ident because it expects the caller will drop and + // rebuild the index. ASSERT(identExists(opCtx.get(), indexIdent)); - // Because this backgroundSecondary index is unfinished, reconcile will identify that it should - // be rebuilt. 
- ASSERT_EQUALS(1UL, reconcileStatus.getValue().size()); - StorageEngine::IndexIdentifier& toRebuild = reconcileStatus.getValue()[0]; - ASSERT_EQUALS(ns, toRebuild.nss); + // Because this backgroundSecondary index is unfinished, reconcile will require the caller to + // drop and rebuild the index. + ASSERT_EQUALS(1UL, reconcileResult.indexesToRebuild.size()); + StorageEngine::IndexIdentifier& toRebuild = reconcileResult.indexesToRebuild[0]; + ASSERT_EQUALS(ns.ns(), toRebuild.nss.ns()); ASSERT_EQUALS(indexName, toRebuild.indexName); - // Because these temporary idents were associated with an in-progress index build, they are not - // dropped. - ASSERT(identExists(opCtx.get(), sideWrites->rs()->getIdent())); - ASSERT(identExists(opCtx.get(), constraintViolations->rs()->getIdent())); + // There are no two-phase builds to restart. + ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); +} + +TEST_F(StorageEngineTest, ReconcileTwoPhaseIndexBuilds) { + auto opCtx = cc().makeOperationContext(); + + Lock::GlobalLock lk(&*opCtx, MODE_IS); + + const NamespaceString ns("db.coll1"); + const std::string indexA("a_1"); + const std::string indexB("b_1"); + + auto swCollInfo = createCollection(opCtx.get(), ns); + ASSERT_OK(swCollInfo.getStatus()); + + // Using a build UUID implies that this index build is two-phase, so the isBackgroundSecondary + // field will be ignored. There is no special behavior on primaries or secondaries. + auto buildUUID = UUID::gen(); + const bool isBackgroundSecondaryBuild = false; - sideWrites->deleteTemporaryTable(opCtx.get()); - constraintViolations->deleteTemporaryTable(opCtx.get()); + // Start two indexes with the same buildUUID to simulate building multiple indexes within the + // same build.
+ ASSERT_OK(startIndexBuild(opCtx.get(), ns, indexA, isBackgroundSecondaryBuild, buildUUID)); + ASSERT_OK(startIndexBuild(opCtx.get(), ns, indexB, isBackgroundSecondaryBuild, buildUUID)); + + const auto indexIdentA = _storageEngine->getCatalog()->getIndexIdent( + opCtx.get(), swCollInfo.getValue().catalogId, indexA); + const auto indexIdentB = _storageEngine->getCatalog()->getIndexIdent( + opCtx.get(), swCollInfo.getValue().catalogId, indexB); + + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); + + // Reconcile should not have dropped the ident to allow the restarted index build to do so + // transactionally with the start. + ASSERT(identExists(opCtx.get(), indexIdentA)); + ASSERT(identExists(opCtx.get(), indexIdentB)); + + // Because this is an unfinished two-phase index build, reconcile will not require this index to + // be rebuilt to completion, rather restarted. + ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size()); + + // Only one index build should be indicated as needing to be restarted. + ASSERT_EQUALS(1UL, reconcileResult.indexBuildsToRestart.size()); + auto& [toRestartBuildUUID, toRestart] = *reconcileResult.indexBuildsToRestart.begin(); + ASSERT_EQ(buildUUID, toRestartBuildUUID); + + // Both specs should be listed within the same build. + auto& specs = toRestart.indexSpecs; + ASSERT_EQ(2UL, specs.size()); + ASSERT_EQ(indexA, specs[0]["name"].str()); + ASSERT_EQ(indexB, specs[1]["name"].str()); } TEST_F(StorageEngineRepairTest, LoadCatalogRecoversOrphans) { @@ -284,7 +321,9 @@ TEST_F(StorageEngineRepairTest, ReconcileSucceeds) { // Reconcile would normally return an error if a collection existed with a missing ident in the // storage engine. When in a repair context, that should not be the case. 
- ASSERT_OK(reconcile(opCtx.get()).getStatus()); + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); + ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size()); + ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); ASSERT(!identExists(opCtx.get(), swCollInfo.getValue().ident)); ASSERT(collectionExists(opCtx.get(), collNs)); @@ -339,7 +378,9 @@ TEST_F(StorageEngineTest, LoadCatalogDropsOrphans) { // orphaned idents. _storageEngine->loadCatalog(opCtx.get()); // reconcileCatalogAndIdents() drops orphaned idents. - ASSERT_OK(reconcile(opCtx.get()).getStatus()); + auto reconcileResult = unittest::assertGet(reconcile(opCtx.get())); + ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size()); + ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size()); ASSERT(!identExists(opCtx.get(), swCollInfo.getValue().ident)); auto identNs = swCollInfo.getValue().ident; diff --git a/src/mongo/db/storage/storage_engine.h b/src/mongo/db/storage/storage_engine.h index 6b156ce3de6..cdf21fd3837 100644 --- a/src/mongo/db/storage/storage_engine.h +++ b/src/mongo/db/storage/storage_engine.h @@ -497,11 +497,37 @@ public: }; /** - * Drop abandoned idents. In the successful case, returns a list of collection, index name - * pairs to rebuild. + * Describes an index build on a collection that should be restarted. */ - virtual StatusWith<std::vector<IndexIdentifier>> reconcileCatalogAndIdents( - OperationContext* opCtx) = 0; + struct IndexBuildToRestart { + IndexBuildToRestart(UUID collUUID) : collUUID(collUUID) {} + + // Collection UUID. + const UUID collUUID; + + // Index specs for the build. + std::vector<BSONObj> indexSpecs; + }; + + /* + * ReconcileResult is the result of reconciling abandoned storage engine idents and unfinished + * index builds. + */ + struct ReconcileResult { + // A list of IndexIdentifiers that must be rebuilt to completion. 
+ std::vector<IndexIdentifier> indexesToRebuild; + + // A map of unfinished two-phase indexes that must be restarted in the background, but + // not to completion; they will wait for replicated commit or abort operations. This is a + // mapping from index build UUID to index build. + std::map<UUID, IndexBuildToRestart> indexBuildsToRestart; + }; + + /** + * Drop abandoned idents. If successful, returns a ReconcileResult with indexes that need to be + * rebuilt or builds that need to be restarted. + * */ + virtual StatusWith<ReconcileResult> reconcileCatalogAndIdents(OperationContext* opCtx) = 0; /** * Returns the all_durable timestamp. All transactions with timestamps earlier than the diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index f7b203741a6..213c3c50937 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -40,6 +40,7 @@ #include "mongo/db/catalog/collection_catalog_helper.h" #include "mongo/db/client.h" #include "mongo/db/concurrency/d_concurrency.h" +#include "mongo/db/index_builds_coordinator.h" #include "mongo/db/logical_clock.h" #include "mongo/db/operation_context_noop.h" #include "mongo/db/server_options.h" @@ -317,8 +318,8 @@ Status StorageEngineImpl::_recoverOrphanedCollection(OperationContext* opCtx, * Third, a DurableCatalog may have an index ident that the KVEngine does not. This method will * rebuild the index. */ -StatusWith<std::vector<StorageEngine::IndexIdentifier>> -StorageEngineImpl::reconcileCatalogAndIdents(OperationContext* opCtx) { +StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAndIdents( + OperationContext* opCtx) { // Gather all tables known to the storage engine and drop those that aren't cross-referenced // in the _mdb_catalog. This can happen for two reasons. 
// @@ -405,7 +406,7 @@ StorageEngineImpl::reconcileCatalogAndIdents(OperationContext* opCtx) { // // Also, remove unfinished builds except those that were background index builds started on a // secondary. - std::vector<StorageEngine::IndexIdentifier> ret; + StorageEngine::ReconcileResult ret; for (DurableCatalog::Entry entry : catalogEntries) { BSONCollectionCatalogEntry::MetaData metaData = _catalog->getMetaData(opCtx, entry.catalogId); @@ -440,7 +441,32 @@ StorageEngineImpl::reconcileCatalogAndIdents(OperationContext* opCtx) { if (indexMetaData.ready && !foundIdent) { log() << "Expected index data is missing, rebuilding. Collection: " << coll << " Index: " << indexName; - ret.push_back({entry.catalogId, coll, indexName}); + ret.indexesToRebuild.push_back({entry.catalogId, coll, indexName}); + continue; + } + + // Any index build with a UUID is an unfinished two-phase build and must be restarted. + // There are no special cases to handle on primaries or secondaries. An index build may + // be associated with multiple indexes. + if (indexMetaData.buildUUID) { + invariant(!indexMetaData.ready); + invariant(indexMetaData.runTwoPhaseBuild); + + auto collUUID = metaData.options.uuid; + invariant(collUUID); + auto buildUUID = *indexMetaData.buildUUID; + + log() << "Found index from unfinished build. Collection: " << coll << "(" + << *collUUID << "), index: " << indexName << ", build UUID: " << buildUUID; + + // Insert in the map if a build has not already been registered. + auto existingIt = ret.indexBuildsToRestart.find(buildUUID); + if (existingIt == ret.indexBuildsToRestart.end()) { + ret.indexBuildsToRestart.insert({buildUUID, IndexBuildToRestart(*collUUID)}); + existingIt = ret.indexBuildsToRestart.find(buildUUID); + } + + existingIt->second.indexSpecs.emplace_back(indexMetaData.spec); continue; } @@ -486,7 +512,7 @@ StorageEngineImpl::reconcileCatalogAndIdents(OperationContext* opCtx) { log() << "Expected background index build did not complete, rebuilding. 
" "Collection: " << coll << " Index: " << indexName; - ret.push_back({entry.catalogId, coll, indexName}); + ret.indexesToRebuild.push_back({entry.catalogId, coll, indexName}); continue; } diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h index 3dda7d220f6..96bbde222be 100644 --- a/src/mongo/db/storage/storage_engine_impl.h +++ b/src/mongo/db/storage/storage_engine_impl.h @@ -344,11 +344,7 @@ public: return _engine->isInIndividuallyCheckpointedIndexesList(ident); } - /** - * Drop abandoned idents. Returns a list of indexes to rebuild. - */ - StatusWith<std::vector<StorageEngine::IndexIdentifier>> reconcileCatalogAndIdents( - OperationContext* opCtx) override; + StatusWith<ReconcileResult> reconcileCatalogAndIdents(OperationContext* opCtx) override; std::string getFilesystemPathForDb(const std::string& dbName) const override; diff --git a/src/mongo/db/storage/storage_engine_test_fixture.h b/src/mongo/db/storage/storage_engine_test_fixture.h index 22945fc184e..ffda8356902 100644 --- a/src/mongo/db/storage/storage_engine_test_fixture.h +++ b/src/mongo/db/storage/storage_engine_test_fixture.h @@ -88,7 +88,7 @@ public: return _storageEngine->getEngine()->dropIdent(opCtx, ident); } - StatusWith<std::vector<StorageEngine::IndexIdentifier>> reconcile(OperationContext* opCtx) { + StatusWith<StorageEngine::ReconcileResult> reconcile(OperationContext* opCtx) { return _storageEngine->reconcileCatalogAndIdents(opCtx); } @@ -116,7 +116,8 @@ public: NamespaceString collNs, std::string key, bool isBackgroundSecondaryBuild) { - auto ret = startIndexBuild(opCtx, collNs, key, isBackgroundSecondaryBuild); + auto buildUUID = UUID::gen(); + auto ret = startIndexBuild(opCtx, collNs, key, isBackgroundSecondaryBuild, buildUUID); if (!ret.isOK()) { return ret; } @@ -128,7 +129,8 @@ public: Status startIndexBuild(OperationContext* opCtx, NamespaceString collNs, std::string key, - bool isBackgroundSecondaryBuild) { + bool 
isBackgroundSecondaryBuild, + boost::optional<UUID> buildUUID) { BSONObjBuilder builder; { BSONObjBuilder keyObj; @@ -143,26 +145,11 @@ public: auto ret = DurableCatalog::get(opCtx)->prepareForIndexBuild(opCtx, collection->getCatalogId(), descriptor.get(), - UUID::gen(), + buildUUID, isBackgroundSecondaryBuild); return ret; } - void indexBuildScan(OperationContext* opCtx, - NamespaceString collNs, - std::string key, - std::string sideWritesIdent, - std::string constraintViolationsIdent) { - Collection* collection = CollectionCatalog::get(opCtx).lookupCollectionByNamespace(collNs); - DurableCatalog::get(opCtx)->setIndexBuildScanning( - opCtx, collection->getCatalogId(), key, sideWritesIdent, constraintViolationsIdent); - } - - void indexBuildDrain(OperationContext* opCtx, NamespaceString collNs, std::string key) { - Collection* collection = CollectionCatalog::get(opCtx).lookupCollectionByNamespace(collNs); - DurableCatalog::get(opCtx)->setIndexBuildDraining(opCtx, collection->getCatalogId(), key); - } - void indexBuildSuccess(OperationContext* opCtx, NamespaceString collNs, std::string key) { Collection* collection = CollectionCatalog::get(opCtx).lookupCollectionByNamespace(collNs); DurableCatalog::get(opCtx)->indexBuildSuccess(opCtx, collection->getCatalogId(), key); |