Diffstat (limited to 'src')
 src/mongo/db/SConscript                         |  12
 src/mongo/db/catalog/SConscript                 |   1
 src/mongo/db/catalog/catalog_control.cpp        |  10
 src/mongo/db/catalog/multi_index_block.cpp      |  52
 src/mongo/db/catalog/multi_index_block.h        |   6
 src/mongo/db/index_builds_coordinator.cpp       |   6
 src/mongo/db/index_builds_coordinator.h         |  11
 src/mongo/db/repl/rs_rollback.cpp               |   2
 src/mongo/db/resumable_index_builds.idl         | 123
 src/mongo/db/startup_recovery.cpp               |   2
 src/mongo/db/storage/SConscript                 |   1
 src/mongo/db/storage/kv/storage_engine_test.cpp |  23
 src/mongo/db/storage/storage_engine.h           |   5
 src/mongo/db/storage/storage_engine_impl.cpp    |  83
 src/mongo/db/storage/storage_engine_impl.h      |   9
 src/mongo/util/uuid.h                           |   1
 16 files changed, 284 insertions(+), 63 deletions(-)
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 75538f4e0e5..61e3abba115 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -845,6 +845,17 @@ env.Library(
 )
 
 env.Library(
+    target='resumable_index_builds_idl',
+    source=[
+        env.Idlc('resumable_index_builds.idl')[0],
+    ],
+    LIBDEPS=[
+        "$BUILD_DIR/mongo/base",
+        '$BUILD_DIR/mongo/idl/idl_parser',
+    ]
+)
+
+env.Library(
     target="index_builds_coordinator_mongod",
     source=[
         "index_builds_coordinator_mongod.cpp",
@@ -857,6 +868,7 @@ env.Library(
         'curop',
         'db_raii',
         'index_build_entry_helpers',
+        'resumable_index_builds_idl',
         '$BUILD_DIR/mongo/db/catalog/collection_catalog',
         '$BUILD_DIR/mongo/db/catalog/index_build_entry_idl',
         "$BUILD_DIR/mongo/executor/task_executor_interface",
diff --git a/src/mongo/db/catalog/SConscript b/src/mongo/db/catalog/SConscript
index 05eb9f9c3dc..5ae71b7908d 100644
--- a/src/mongo/db/catalog/SConscript
+++ b/src/mongo/db/catalog/SConscript
@@ -232,6 +232,7 @@ env.Library(
         '$BUILD_DIR/mongo/db/concurrency/write_conflict_exception',
         '$BUILD_DIR/mongo/db/curop',
         '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
+        '$BUILD_DIR/mongo/db/resumable_index_builds_idl',
         '$BUILD_DIR/mongo/db/service_context',
         '$BUILD_DIR/mongo/db/storage/write_unit_of_work',
         '$BUILD_DIR/mongo/util/fail_point',
diff --git a/src/mongo/db/catalog/catalog_control.cpp b/src/mongo/db/catalog/catalog_control.cpp
index 02393d4929b..a3344f5c1b2 100644
--- a/src/mongo/db/catalog/catalog_control.cpp
+++ b/src/mongo/db/catalog/catalog_control.cpp
@@ -163,12 +163,12 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib
         fassert(40690, rebuildIndexesOnCollection(opCtx, collection, indexSpecs, RepairData::kNo));
     }
 
-    // Once all unfinished index builds have been dropped and the catalog has been reloaded, restart
-    // any unfinished index builds. This will not restart any index builds to completion, but rather
-    // start the background thread, build the index, and wait for a replicated commit or abort oplog
-    // entry.
+    // Once all unfinished index builds have been dropped and the catalog has been reloaded, resume
+    // or restart any unfinished index builds. This will not resume/restart any index builds to
+    // completion, but rather start the background thread, build the index, and wait for a
+    // replicated commit or abort oplog entry.
     IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
-        opCtx, reconcileResult.indexBuildsToRestart);
+        opCtx, reconcileResult.indexBuildsToRestart, reconcileResult.indexBuildsToResume);
 
     // Open all databases and repopulate the CollectionCatalog.
     LOGV2(20276, "openCatalog: reopening all databases");
diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp
index 6586dec2740..6842dc97659 100644
--- a/src/mongo/db/catalog/multi_index_block.cpp
+++ b/src/mongo/db/catalog/multi_index_block.cpp
@@ -397,8 +397,9 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx,
     opCtx->recoveryUnit()->setReadOnce(readOnce);
 
     try {
-        invariant(_phase == Phase::kInitialized, _phaseToString(_phase));
-        _phase = Phase::kCollectionScan;
+        invariant(_phase == IndexBuildPhaseEnum::kInitialized,
+                  IndexBuildPhase_serializer(_phase).toString());
+        _phase = IndexBuildPhaseEnum::kCollectionScan;
 
         BSONObj objToIndex;
         RecordId loc;
@@ -433,7 +434,7 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx,
             n++;
         }
     } catch (...) {
-        _phase = Phase::kInitialized;
+        _phase = IndexBuildPhaseEnum::kInitialized;
         return exceptionToStatus();
     }
 
@@ -522,9 +523,10 @@ Status MultiIndexBlock::dumpInsertsFromBulk(OperationContext* opCtx,
     // insertDocumentsInCollection() to scan and insert the contents of the collection.
    // Therefore, it is possible for the phase of this MultiIndexBlock to be kInitialized
     // rather than kCollection when this function is called.
-    invariant(_phase == Phase::kCollectionScan || _phase == Phase::kInitialized,
-              _phaseToString(_phase));
-    _phase = Phase::kBulkLoad;
+    invariant(_phase == IndexBuildPhaseEnum::kCollectionScan ||
+                  _phase == IndexBuildPhaseEnum::kInitialized,
+              IndexBuildPhase_serializer(_phase).toString());
+    _phase = IndexBuildPhaseEnum::kBulkLoad;
 
     for (size_t i = 0; i < _indexes.size(); i++) {
         // When dupRecords is passed, 'dupsAllowed' should be passed to reflect whether or not the
@@ -577,8 +579,10 @@ Status MultiIndexBlock::drainBackgroundWrites(
     // Background writes are drained three times (once without blocking writes and twice blocking
     // writes), so we may either be coming from the bulk load phase or be already in the drain
     // writes phase.
-    invariant(_phase == Phase::kBulkLoad || _phase == Phase::kDrainWrites, _phaseToString(_phase));
-    _phase = Phase::kDrainWrites;
+    invariant(_phase == IndexBuildPhaseEnum::kBulkLoad ||
+                  _phase == IndexBuildPhaseEnum::kDrainWrites,
+              IndexBuildPhase_serializer(_phase).toString());
+    _phase = IndexBuildPhaseEnum::kDrainWrites;
 
     const Collection* coll =
         CollectionCatalog::get(opCtx).lookupCollectionByUUID(opCtx, _collectionUUID.get());
@@ -779,18 +783,23 @@ void MultiIndexBlock::_writeStateToDisk(OperationContext* opCtx) const {
 BSONObj MultiIndexBlock::_constructStateObject() const {
     BSONObjBuilder builder;
     _buildUUID->appendToBuilder(&builder, "_id");
-    builder.append("phase", _phaseToString(_phase));
+    builder.append("phase", IndexBuildPhase_serializer(_phase));
+
+    if (_collectionUUID) {
+        _collectionUUID->appendToBuilder(&builder, "collectionUUID");
+    }
 
     // We can be interrupted by shutdown before inserting the first document from the collection
     // scan, in which case there is no _lastRecordIdInserted.
-    if (_phase == Phase::kCollectionScan && _lastRecordIdInserted)
+    if (_phase == IndexBuildPhaseEnum::kCollectionScan && _lastRecordIdInserted)
         builder.append("collectionScanPosition", _lastRecordIdInserted->repr());
 
     BSONArrayBuilder indexesArray(builder.subarrayStart("indexes"));
     for (const auto& index : _indexes) {
         BSONObjBuilder indexInfo(indexesArray.subobjStart());
 
-        if (_phase == Phase::kCollectionScan || _phase == Phase::kBulkLoad) {
+        if (_phase == IndexBuildPhaseEnum::kCollectionScan ||
+            _phase == IndexBuildPhaseEnum::kBulkLoad) {
             auto state = index.bulk->getSorterState();
 
             indexInfo.append("tempDir", state.tempDir);
@@ -814,12 +823,16 @@ BSONObj MultiIndexBlock::_constructStateObject() const {
 
             if (auto duplicateKeyTrackerTableIdent =
                     indexBuildInterceptor->getDuplicateKeyTrackerTableIdent())
-                indexInfo.append("dupKeyTempTable", *duplicateKeyTrackerTableIdent);
+                indexInfo.append("duplicateKeyTrackerTable", *duplicateKeyTrackerTableIdent);
 
             if (auto skippedRecordTrackerTableIdent =
                     indexBuildInterceptor->getSkippedRecordTracker()->getTableIdent())
                 indexInfo.append("skippedRecordTrackerTable", *skippedRecordTrackerTableIdent);
 
+        // TODO SERVER-49450: Consider only writing out the index name or key and getting the rest
+        // of the spec from the durable catalog on startup.
+        indexInfo.append("spec", index.block->getSpec());
+
         indexInfo.done();
 
         // Ensure the data we are referencing has been persisted to disk.
@@ -829,19 +842,4 @@ BSONObj MultiIndexBlock::_constructStateObject() const {
 
     return builder.obj();
 }
-
-std::string MultiIndexBlock::_phaseToString(Phase phase) const {
-    switch (phase) {
-        case Phase::kInitialized:
-            return "initialized";
-        case Phase::kCollectionScan:
-            return "collection scan";
-        case Phase::kBulkLoad:
-            return "bulk load";
-        case Phase::kDrainWrites:
-            return "drain writes";
-    }
-    MONGO_UNREACHABLE;
-}
-
 }  // namespace mongo
diff --git a/src/mongo/db/catalog/multi_index_block.h b/src/mongo/db/catalog/multi_index_block.h
index 314800a1743..56fb44bffa3 100644
--- a/src/mongo/db/catalog/multi_index_block.h
+++ b/src/mongo/db/catalog/multi_index_block.h
@@ -46,6 +46,7 @@
 #include "mongo/db/index/index_access_method.h"
 #include "mongo/db/index/index_build_interceptor.h"
 #include "mongo/db/record_id.h"
+#include "mongo/db/resumable_index_builds_gen.h"
 #include "mongo/platform/mutex.h"
 #include "mongo/util/fail_point.h"
 
@@ -297,8 +298,6 @@ private:
         InsertDeleteOptions options;
     };
 
-    enum class Phase { kInitialized, kCollectionScan, kBulkLoad, kDrainWrites };
-
     void _abortWithoutCleanup(OperationContext* opCtx, bool shutdown);
 
     bool _shouldWriteStateToDisk(OperationContext* opCtx, bool shutdown) const;
@@ -307,7 +306,6 @@ private:
 
     BSONObj _constructStateObject() const;
 
-    std::string _phaseToString(Phase phase) const;
 
     // Is set during init() and ensures subsequent function calls act on the same Collection.
     boost::optional<UUID> _collectionUUID;
@@ -331,6 +329,6 @@ private:
     boost::optional<RecordId> _lastRecordIdInserted;
 
     // The current phase of the index build.
-    Phase _phase = Phase::kInitialized;
+    IndexBuildPhaseEnum _phase = IndexBuildPhaseEnum::kInitialized;
 };
 }  // namespace mongo
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index 3835961496c..a5d29ee04fb 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -1321,8 +1321,10 @@ IndexBuilds IndexBuildsCoordinator::stopIndexBuildsForRollback(OperationContext*
     return buildsStopped;
 }
 
-void IndexBuildsCoordinator::restartIndexBuildsForRecovery(OperationContext* opCtx,
-                                                           const IndexBuilds& buildsToRestart) {
+void IndexBuildsCoordinator::restartIndexBuildsForRecovery(
+    OperationContext* opCtx,
+    const IndexBuilds& buildsToRestart,
+    const std::vector<ResumeIndexInfo>& buildsToResume) {
     for (auto& [buildUUID, build] : buildsToRestart) {
         boost::optional<NamespaceString> nss =
             CollectionCatalog::get(opCtx).lookupNSSByUUID(opCtx, build.collUUID);
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index 56df844a0e5..c78134998db 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -46,6 +46,7 @@
 #include "mongo/db/rebuild_indexes.h"
 #include "mongo/db/repl/oplog_entry.h"
 #include "mongo/db/repl_index_build_state.h"
+#include "mongo/db/resumable_index_builds_gen.h"
 #include "mongo/db/storage/durable_catalog.h"
 #include "mongo/executor/task_executor.h"
 #include "mongo/executor/thread_pool_task_executor.h"
@@ -146,11 +147,13 @@ public:
                                    IndexBuildOptions indexBuildOptions) = 0;
 
     /**
-     * Given a set of two-phase index builds, start, but do not complete each one in a background
-     * thread. Each index build will wait for a replicated commit or abort, as in steady-state
-     * replication.
+     * Resumes and restarts index builds for recovery. Anything that fails to resume will be
+     * started in a background thread. Each index build will wait for a replicated commit or abort,
+     * as in steady-state.
      */
-    void restartIndexBuildsForRecovery(OperationContext* opCtx, const IndexBuilds& buildsToRestart);
+    void restartIndexBuildsForRecovery(OperationContext* opCtx,
+                                       const IndexBuilds& buildsToRestart,
+                                       const std::vector<ResumeIndexInfo>& buildsToResume);
 
     /**
      * Runs the full index rebuild for recovery. This will only rebuild single-phase index builds.
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
index 12b86993a2a..e74f850acfb 100644
--- a/src/mongo/db/repl/rs_rollback.cpp
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -1669,7 +1669,7 @@ void rollback_internal::syncFixUp(OperationContext* opCtx,
 
     LOGV2(21707, "Restarting rolled-back committed or aborted index builds");
     IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
-        opCtx, fixUpInfo.indexBuildsToRestart);
+        opCtx, fixUpInfo.indexBuildsToRestart, {});
 
     LOGV2(21708,
           "Deleting and updating documents to roll back insert, update and remove "
diff --git a/src/mongo/db/resumable_index_builds.idl b/src/mongo/db/resumable_index_builds.idl
new file mode 100644
index 00000000000..9f8fc4a6818
--- /dev/null
+++ b/src/mongo/db/resumable_index_builds.idl
@@ -0,0 +1,123 @@
+# Copyright (C) 2020-present MongoDB, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the Server Side Public License, version 1,
+# as published by MongoDB, Inc.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Server Side Public License for more details.
+#
+# You should have received a copy of the Server Side Public License
+# along with this program. If not, see
+# <http://www.mongodb.com/licensing/server-side-public-license>.
+#
+# As a special exception, the copyright holders give permission to link the
+# code of portions of this program with the OpenSSL library under certain
+# conditions as described in each individual source file and distribute
+# linked combinations including the program with the OpenSSL library. You
+# must comply with the Server Side Public License in all respects for
+# all of the code used other than as permitted herein. If you modify file(s)
+# with this exception, you may extend this exception to your version of the
+# file(s), but you are not obligated to do so. If you do not wish to do so,
+# delete this exception statement from your version. If you delete this
+# exception statement from all source files in the program, then also delete
+# it in the license file.
+
+# This IDL file describes the BSON format for ResumeIndexInfo,
+# IndexSorterInfo and SorterRange, and handles the serialization
+# to and deserialization from their BSON representations for those
+# classes.
+
+global:
+    cpp_namespace: "mongo"
+    cpp_includes:
+        - "mongo/util/uuid.h"
+
+imports:
+    - "mongo/idl/basic_types.idl"
+
+enums:
+    IndexBuildPhase:
+        description: "Phase of a hybrid index build"
+        type: string
+        values:
+            kInitialized: "initialized"
+            kCollectionScan: "collection scan"
+            kBulkLoad: "bulk load"
+            kDrainWrites: "drain writes"
+
+structs:
+    SorterRange:
+        description: "The range of data that was sorted and spilled to disk"
+        strict: true
+        fields:
+            startOffset:
+                description: "Tracks where in the file we started writing this data range"
+                type: long
+            endOffset:
+                description: "Tracks where in the file we finished writing this data range"
+                type: long
+            checksum:
+                description: "Keeps track of the hash of all data objects spilled to disk"
+                type: long
+                validator: { gte: 0 }
+
+    IndexSorterInfo:
+        description: "The information to resume the sorter for an index build"
+        strict: true
+        fields:
+            sideWritesTable:
+                description: "The name of the ident associated with the side writes table for this
+                              index build"
+                type: string
+            duplicateKeyTrackerTable:
+                description: "The name of the ident associated with the duplicate key tracker table
+                              for this index build"
+                type: string
+                optional: true
+            skippedRecordTrackerTable:
+                description: "The name of the ident associated with the skipped record tracker table
+                              for this index build"
+                type: string
+                optional: true
+            tempDir:
+                description: "The directory into which we place a file when spilling data to disk"
+                type: string
+                optional: true
+            fileName:
+                description: "The name of the file that sorted data is written to"
+                type: string
+                optional: true
+            ranges:
+                description: "All ranges of data that were already sorted and spilled to disk"
+                type: array<SorterRange>
+                optional: true
+            spec:
+                description: "The index specification"
+                type: object
+
+    ResumeIndexInfo:
+        description: "Information needed to resume index builds"
+        strict: true
+        fields:
+            _id:
+                description: "A UUID that uniquely identifies the index build across replica set
+                              members."
+                cpp_name: buildUUID
+                type: uuid
+            phase:
+                description: "The phase the index build was in when the node shut down"
+                type: IndexBuildPhase
+            collectionUUID:
+                description: "A UUID that uniquely identifies which collection the index is being
+                              built on"
+                type: uuid
+            collectionScanPosition:
+                description: "The last record id inserted into the sorter before shutdown"
+                type: long
+                optional: true
+            indexes:
+                description: "The information needed to resume each specific index in this build"
+                type: array<IndexSorterInfo>
diff --git a/src/mongo/db/startup_recovery.cpp b/src/mongo/db/startup_recovery.cpp
index cbc10c7be7a..9e0e5bed487 100644
--- a/src/mongo/db/startup_recovery.cpp
+++ b/src/mongo/db/startup_recovery.cpp
@@ -390,7 +390,7 @@ void reconcileCatalogAndRebuildUnfinishedIndexes(OperationContext* opCtx,
     // not build any indexes to completion, but rather start the background thread to build the
     // index, and wait for a replicated commit or abort oplog entry.
     IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
-        opCtx, reconcileResult.indexBuildsToRestart);
+        opCtx, reconcileResult.indexBuildsToRestart, reconcileResult.indexBuildsToResume);
 }
 
 /**
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index 5384d1852c3..fc8a18d2c20 100644
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -524,6 +524,7 @@ env.Library(
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/storage/storage_repair_observer',
         '$BUILD_DIR/mongo/db/catalog/collection_catalog_helper',
+        '$BUILD_DIR/mongo/db/resumable_index_builds_idl',
         '$BUILD_DIR/mongo/db/vector_clock',
         'two_phase_index_build_knobs_idl',
     ],
diff --git a/src/mongo/db/storage/kv/storage_engine_test.cpp b/src/mongo/db/storage/kv/storage_engine_test.cpp
index b34c3acd2ce..37bca4130f0 100644
--- a/src/mongo/db/storage/kv/storage_engine_test.cpp
+++ b/src/mongo/db/storage/kv/storage_engine_test.cpp
@@ -93,6 +93,7 @@ TEST_F(StorageEngineTest, ReconcileIdentsTest) {
     reconcileResult = unittest::assertGet(reconcile(opCtx.get()));
     ASSERT_EQUALS(1UL, reconcileResult.indexesToRebuild.size());
     ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+    ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
 
     StorageEngine::IndexIdentifier& toRebuild = reconcileResult.indexesToRebuild[0];
     ASSERT_EQUALS("db.coll1", toRebuild.nss.ns());
@@ -148,6 +149,7 @@ TEST_F(StorageEngineTest, ReconcileDropsTemporary) {
     auto reconcileResult = unittest::assertGet(reconcile(opCtx.get()));
     ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
     ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+    ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
 
     // The storage engine is responsible for dropping its temporary idents.
     ASSERT(!identExists(opCtx.get(), ident));
@@ -170,9 +172,14 @@ TEST_F(StorageEngineTest, ReconcileKeepsTemporary) {
     ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
     ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
 
-    // The storage engine does not drop its temporary idents outside of starting up after an
-    // unclean shutdown.
-    ASSERT(identExists(opCtx.get(), ident));
+    // TODO SERVER-49847: Clean up when the feature is turned on by default.
+    if (_storageEngine->supportsResumableIndexBuilds()) {
+        // The storage engine does not drop its temporary idents outside of starting up after an
+        // unclean shutdown.
+        ASSERT(identExists(opCtx.get(), ident));
+    } else {
+        ASSERT_FALSE(identExists(opCtx.get(), ident));
+    }
 
     rs->finalizeTemporaryTable(opCtx.get(), TemporaryRecordStore::FinalizationAction::kDelete);
 }
@@ -231,8 +238,9 @@ TEST_F(StorageEngineTest, ReconcileUnfinishedIndex) {
     // not require it to be rebuilt.
     ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
 
-    // There are no two-phase builds to restart.
+    // There are no two-phase builds to resume or restart.
     ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+    ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
 }
 
 TEST_F(StorageEngineTest, ReconcileUnfinishedBackgroundSecondaryIndex) {
@@ -272,8 +280,9 @@ TEST_F(StorageEngineTest, ReconcileUnfinishedBackgroundSecondaryIndex) {
     ASSERT_EQUALS(ns.ns(), toRebuild.nss.ns());
     ASSERT_EQUALS(indexName, toRebuild.indexName);
 
-    // There are no two-phase builds to restart.
+    // There are no two-phase builds to restart or resume.
     ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+    ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
 }
 
 TEST_F(StorageEngineTest, ReconcileTwoPhaseIndexBuilds) {
@@ -332,6 +341,9 @@ TEST_F(StorageEngineTest, ReconcileTwoPhaseIndexBuilds) {
     ASSERT_EQ(2UL, specs.size());
     ASSERT_EQ(indexA, specs[0]["name"].str());
     ASSERT_EQ(indexB, specs[1]["name"].str());
+
+    // There should be no index builds to resume.
+    ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
 }
 
 TEST_F(StorageEngineRepairTest, LoadCatalogRecoversOrphans) {
@@ -373,6 +385,7 @@ TEST_F(StorageEngineRepairTest, ReconcileSucceeds) {
     auto reconcileResult = unittest::assertGet(reconcile(opCtx.get()));
     ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
     ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+    ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
 
     ASSERT(!identExists(opCtx.get(), swCollInfo.getValue().ident));
     ASSERT(collectionExists(opCtx.get(), collNs));
diff --git a/src/mongo/db/storage/storage_engine.h b/src/mongo/db/storage/storage_engine.h
index 44854b5e7dc..1832b0c4d23 100644
--- a/src/mongo/db/storage/storage_engine.h
+++ b/src/mongo/db/storage/storage_engine.h
@@ -37,6 +37,7 @@
 #include "mongo/bson/bsonobj.h"
 #include "mongo/bson/timestamp.h"
 #include "mongo/db/catalog/index_builds.h"
+#include "mongo/db/resumable_index_builds_gen.h"
 #include "mongo/db/storage/temporary_record_store.h"
 #include "mongo/util/functional.h"
 #include "mongo/util/str.h"
@@ -538,6 +539,10 @@ public:
         // not to completion; they will wait for replicated commit or abort operations. This is a
         // mapping from index build UUID to index build.
         IndexBuilds indexBuildsToRestart;
+
+        // List of index builds to be resumed. Each ResumeIndexInfo may contain multiple indexes to
+        // resume as part of the same build.
+        std::vector<ResumeIndexInfo> indexBuildsToResume;
     };
 
     /**
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index 33d6cc6caac..1803a412bf5 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -320,6 +320,63 @@ Status StorageEngineImpl::_recoverOrphanedCollection(OperationContext* opCtx,
     return Status::OK();
 }
 
+bool StorageEngineImpl::_handleInternalIdents(OperationContext* opCtx,
+                                              const std::string& ident,
+                                              ReconcileResult* reconcileResult,
+                                              std::set<std::string>* internalIdentsToDrop) {
+    if (!_catalog->isInternalIdent(ident)) {
+        return false;
+    }
+
+    // When starting up after an unclean shutdown, we do not attempt to recover any state from the
+    // internal idents. Thus, we drop them in this case.
+    if (startingAfterUncleanShutdown(opCtx->getServiceContext()) ||
+        !supportsResumableIndexBuilds()) {
+        internalIdentsToDrop->insert(ident);
+        return true;
+    }
+
+    // When starting up after a clean shutdown and resumable index builds are supported, find the
+    // internal idents that contain the relevant information to resume each index build and recover
+    // the state.
+    auto rs = _engine->getRecordStore(opCtx, "", ident, CollectionOptions());
+
+    // Look at the contents to determine whether this ident will contain information for
+    // resuming an index build.
+    // TODO SERVER-49125: differentiate the internal idents without looking at the contents.
+    auto cursor = rs->getCursor(opCtx);
+    auto record = cursor->next();
+    if (record) {
+        auto doc = record.get().data.toBson();
+
+        // Parse the documents here so that we can restart the build if the document doesn't
+        // contain all the necessary information to be able to resume building the index.
+        if (doc.hasField("phase")) {
+            ResumeIndexInfo resumeInfo;
+            try {
+                resumeInfo = ResumeIndexInfo::parse(IDLParserErrorContext("ResumeIndexInfo"), doc);
+            } catch (const DBException& e) {
+                LOGV2(4916300,
+                      "Failed to parse resumable index info",
+                      "error"_attr = e.toStatus());
+
+                // Ignore the error so that we can restart the index build instead of resume it. We
+                // should drop the internal ident if we failed to parse.
+                internalIdentsToDrop->insert(ident);
+                return true;
+            }
+
+            reconcileResult->indexBuildsToResume.push_back(resumeInfo);
+
+            LOGV2(4916301,
+                  "Found unfinished index build to resume",
+                  "buildUUID"_attr = resumeInfo.getBuildUUID(),
+                  "collectionUUID"_attr = resumeInfo.getCollectionUUID(),
+                  "phase"_attr = IndexBuildPhase_serializer(resumeInfo.getPhase()));
+            return true;
+        }
+    }
+
+    return false;
+}
+
 /**
  * This method reconciles differences between idents the KVEngine is aware of and the
  * DurableCatalog. There are three differences to consider:
@@ -366,16 +423,13 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
 
     // Drop all idents in the storage engine that are not known to the catalog. This can happen in
     // the case of a collection or index creation being rolled back.
+    StorageEngine::ReconcileResult reconcileResult;
     for (const auto& it : engineIdents) {
         if (catalogIdents.find(it) != catalogIdents.end()) {
             continue;
         }
 
-        // When starting up after an unclean shutdown, we do not attempt to recover any state from
-        // the internal idents. Thus, we drop them in this case.
-        if (startingAfterUncleanShutdown(opCtx->getServiceContext()) &&
-            _catalog->isInternalIdent(it)) {
-            internalIdentsToDrop.insert(it);
+        if (_handleInternalIdents(opCtx, it, &reconcileResult, &internalIdentsToDrop)) {
             continue;
         }
 
@@ -425,7 +479,6 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
     //
     // Also, remove unfinished builds except those that were background index builds started on a
     // secondary.
-    StorageEngine::ReconcileResult ret;
     for (DurableCatalog::Entry entry : catalogEntries) {
         BSONCollectionCatalogEntry::MetaData metaData =
             _catalog->getMetaData(opCtx, entry.catalogId);
@@ -472,13 +525,14 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
                           "Expected index data is missing, rebuilding",
                           "index"_attr = indexName,
                           "namespace"_attr = coll);
-                ret.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
+                reconcileResult.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
                 continue;
             }
 
             // Any index build with a UUID is an unfinished two-phase build and must be restarted.
             // There are no special cases to handle on primaries or secondaries. An index build may
-            // be associated with multiple indexes.
+            // be associated with multiple indexes. We should only restart an index build if we
+            // aren't going to resume it.
            if (indexMetaData.buildUUID) {
                 invariant(!indexMetaData.ready);
 
@@ -496,10 +550,11 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
                     "buildUUID"_attr = buildUUID);
 
                 // Insert in the map if a build has not already been registered.
-                auto existingIt = ret.indexBuildsToRestart.find(buildUUID);
-                if (existingIt == ret.indexBuildsToRestart.end()) {
-                    ret.indexBuildsToRestart.insert({buildUUID, IndexBuildDetails(*collUUID)});
-                    existingIt = ret.indexBuildsToRestart.find(buildUUID);
+                auto existingIt = reconcileResult.indexBuildsToRestart.find(buildUUID);
+                if (existingIt == reconcileResult.indexBuildsToRestart.end()) {
+                    reconcileResult.indexBuildsToRestart.insert(
+                        {buildUUID, IndexBuildDetails(*collUUID)});
+                    existingIt = reconcileResult.indexBuildsToRestart.find(buildUUID);
                 }
 
                 existingIt->second.indexSpecs.emplace_back(indexMetaData.spec);
@@ -516,7 +571,7 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
                           "- see SERVER-43097",
                           "namespace"_attr = coll,
                           "index"_attr = indexName);
-                ret.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
+                reconcileResult.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
                 continue;
             }
 
@@ -557,7 +612,7 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
         wuow.commit();
     }
 
-    return ret;
+    return reconcileResult;
 }
 
 std::string StorageEngineImpl::getFilesystemPathForDb(const std::string& dbName) const {
diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h
index 6ef1a605992..897d04f4b94 100644
--- a/src/mongo/db/storage/storage_engine_impl.h
+++ b/src/mongo/db/storage/storage_engine_impl.h
@@ -383,6 +383,15 @@ private:
      */
     void _onMinOfCheckpointAndOldestTimestampChanged(const Timestamp& timestamp);
 
+    /**
+     * Returns whether the given ident is an internal ident and if it should be dropped or used to
+     * resume an index build.
+     */
+    bool _handleInternalIdents(OperationContext* opCtx,
+                               const std::string& ident,
+                               ReconcileResult* reconcileResult,
+                               std::set<std::string>* internalIdentsToDrop);
+
     class RemoveDBChange;
 
     // This must be the first member so it is destroyed last.
diff --git a/src/mongo/util/uuid.h b/src/mongo/util/uuid.h
index 200d02e50bc..33741ff3476 100644
--- a/src/mongo/util/uuid.h
+++ b/src/mongo/util/uuid.h
@@ -85,6 +85,7 @@ class UUID {
     friend class repl::OplogEntryBase;
     friend class repl::DurableReplOperation;
     friend class repl::InitialSyncIdDocument;
+    friend class ResumeIndexInfo;
     friend class ResumeTokenInternal;
     friend class ShardCollectionTypeBase;
     friend class TenantMigrationDonorDocument;
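
For reference, a state document written by MultiIndexBlock::_constructStateObject() into the internal ident, and later parsed back into a ResumeIndexInfo by StorageEngineImpl::_handleInternalIdents(), is shaped roughly as sketched below (shell syntax). The field names follow resumable_index_builds.idl above; the UUIDs, offsets, and ident names are invented for illustration:

    // Hypothetical resumable index build state document, interrupted during
    // the collection scan phase (all values invented):
    {
        _id: UUID("..."),                           // buildUUID identifying the build
        phase: "collection scan",                   // an IndexBuildPhase enum value
        collectionUUID: UUID("..."),
        collectionScanPosition: NumberLong(12345),  // last RecordId fed to the sorter
        indexes: [{
            sideWritesTable: "example-side-writes-ident",  // invented ident name
            tempDir: "/data/db/_tmp",
            fileName: "extsort.1",
            ranges: [{startOffset: NumberLong(0), endOffset: NumberLong(4096),
                      checksum: NumberLong(0)}],
            spec: {v: 2, key: {a: 1}, name: "a_1"}
        }]
    }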