Diffstat (limited to 'src')
-rw-r--r--  src/mongo/db/SConscript                          |  12
-rw-r--r--  src/mongo/db/catalog/SConscript                  |   1
-rw-r--r--  src/mongo/db/catalog/catalog_control.cpp         |  10
-rw-r--r--  src/mongo/db/catalog/multi_index_block.cpp       |  52
-rw-r--r--  src/mongo/db/catalog/multi_index_block.h         |   6
-rw-r--r--  src/mongo/db/index_builds_coordinator.cpp        |   6
-rw-r--r--  src/mongo/db/index_builds_coordinator.h          |  11
-rw-r--r--  src/mongo/db/repl/rs_rollback.cpp                |   2
-rw-r--r--  src/mongo/db/resumable_index_builds.idl          | 123
-rw-r--r--  src/mongo/db/startup_recovery.cpp                |   2
-rw-r--r--  src/mongo/db/storage/SConscript                  |   1
-rw-r--r--  src/mongo/db/storage/kv/storage_engine_test.cpp  |  23
-rw-r--r--  src/mongo/db/storage/storage_engine.h            |   5
-rw-r--r--  src/mongo/db/storage/storage_engine_impl.cpp     |  83
-rw-r--r--  src/mongo/db/storage/storage_engine_impl.h       |   9
-rw-r--r--  src/mongo/util/uuid.h                            |   1
16 files changed, 284 insertions, 63 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 75538f4e0e5..61e3abba115 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -845,6 +845,17 @@ env.Library(
)
env.Library(
+ target='resumable_index_builds_idl',
+ source=[
+ env.Idlc('resumable_index_builds.idl')[0],
+ ],
+ LIBDEPS=[
+ "$BUILD_DIR/mongo/base",
+ '$BUILD_DIR/mongo/idl/idl_parser',
+ ]
+)
+
+env.Library(
target="index_builds_coordinator_mongod",
source=[
"index_builds_coordinator_mongod.cpp",
@@ -857,6 +868,7 @@ env.Library(
'curop',
'db_raii',
'index_build_entry_helpers',
+ 'resumable_index_builds_idl',
'$BUILD_DIR/mongo/db/catalog/collection_catalog',
'$BUILD_DIR/mongo/db/catalog/index_build_entry_idl',
"$BUILD_DIR/mongo/executor/task_executor_interface",
diff --git a/src/mongo/db/catalog/SConscript b/src/mongo/db/catalog/SConscript
index 05eb9f9c3dc..5ae71b7908d 100644
--- a/src/mongo/db/catalog/SConscript
+++ b/src/mongo/db/catalog/SConscript
@@ -232,6 +232,7 @@ env.Library(
'$BUILD_DIR/mongo/db/concurrency/write_conflict_exception',
'$BUILD_DIR/mongo/db/curop',
'$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
+ '$BUILD_DIR/mongo/db/resumable_index_builds_idl',
'$BUILD_DIR/mongo/db/service_context',
'$BUILD_DIR/mongo/db/storage/write_unit_of_work',
'$BUILD_DIR/mongo/util/fail_point',
diff --git a/src/mongo/db/catalog/catalog_control.cpp b/src/mongo/db/catalog/catalog_control.cpp
index 02393d4929b..a3344f5c1b2 100644
--- a/src/mongo/db/catalog/catalog_control.cpp
+++ b/src/mongo/db/catalog/catalog_control.cpp
@@ -163,12 +163,12 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib
fassert(40690, rebuildIndexesOnCollection(opCtx, collection, indexSpecs, RepairData::kNo));
}
- // Once all unfinished index builds have been dropped and the catalog has been reloaded, restart
- // any unfinished index builds. This will not restart any index builds to completion, but rather
- // start the background thread, build the index, and wait for a replicated commit or abort oplog
- // entry.
+ // Once all unfinished index builds have been dropped and the catalog has been reloaded, resume
+ // or restart any unfinished index builds. This will not resume/restart any index builds to
+ // completion, but rather start the background thread, build the index, and wait for a
+ // replicated commit or abort oplog entry.
IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
- opCtx, reconcileResult.indexBuildsToRestart);
+ opCtx, reconcileResult.indexBuildsToRestart, reconcileResult.indexBuildsToResume);
// Open all databases and repopulate the CollectionCatalog.
LOGV2(20276, "openCatalog: reopening all databases");
diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp
index 6586dec2740..6842dc97659 100644
--- a/src/mongo/db/catalog/multi_index_block.cpp
+++ b/src/mongo/db/catalog/multi_index_block.cpp
@@ -397,8 +397,9 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx,
opCtx->recoveryUnit()->setReadOnce(readOnce);
try {
- invariant(_phase == Phase::kInitialized, _phaseToString(_phase));
- _phase = Phase::kCollectionScan;
+ invariant(_phase == IndexBuildPhaseEnum::kInitialized,
+ IndexBuildPhase_serializer(_phase).toString());
+ _phase = IndexBuildPhaseEnum::kCollectionScan;
BSONObj objToIndex;
RecordId loc;
@@ -433,7 +434,7 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx,
n++;
}
} catch (...) {
- _phase = Phase::kInitialized;
+ _phase = IndexBuildPhaseEnum::kInitialized;
return exceptionToStatus();
}
@@ -522,9 +523,10 @@ Status MultiIndexBlock::dumpInsertsFromBulk(OperationContext* opCtx,
// insertDocumentsInCollection() to scan and insert the contents of the collection.
// Therefore, it is possible for the phase of this MultiIndexBlock to be kInitialized
// rather than kCollectionScan when this function is called.
- invariant(_phase == Phase::kCollectionScan || _phase == Phase::kInitialized,
- _phaseToString(_phase));
- _phase = Phase::kBulkLoad;
+ invariant(_phase == IndexBuildPhaseEnum::kCollectionScan ||
+ _phase == IndexBuildPhaseEnum::kInitialized,
+ IndexBuildPhase_serializer(_phase).toString());
+ _phase = IndexBuildPhaseEnum::kBulkLoad;
for (size_t i = 0; i < _indexes.size(); i++) {
// When dupRecords is passed, 'dupsAllowed' should be passed to reflect whether or not the
@@ -577,8 +579,10 @@ Status MultiIndexBlock::drainBackgroundWrites(
// Background writes are drained three times (once without blocking writes and twice blocking
// writes), so we may either be coming from the bulk load phase or be already in the drain
// writes phase.
- invariant(_phase == Phase::kBulkLoad || _phase == Phase::kDrainWrites, _phaseToString(_phase));
- _phase = Phase::kDrainWrites;
+ invariant(_phase == IndexBuildPhaseEnum::kBulkLoad ||
+ _phase == IndexBuildPhaseEnum::kDrainWrites,
+ IndexBuildPhase_serializer(_phase).toString());
+ _phase = IndexBuildPhaseEnum::kDrainWrites;
const Collection* coll =
CollectionCatalog::get(opCtx).lookupCollectionByUUID(opCtx, _collectionUUID.get());
@@ -779,18 +783,23 @@ void MultiIndexBlock::_writeStateToDisk(OperationContext* opCtx) const {
BSONObj MultiIndexBlock::_constructStateObject() const {
BSONObjBuilder builder;
_buildUUID->appendToBuilder(&builder, "_id");
- builder.append("phase", _phaseToString(_phase));
+ builder.append("phase", IndexBuildPhase_serializer(_phase));
+
+ if (_collectionUUID) {
+ _collectionUUID->appendToBuilder(&builder, "collectionUUID");
+ }
// We can be interrupted by shutdown before inserting the first document from the collection
// scan, in which case there is no _lastRecordIdInserted.
- if (_phase == Phase::kCollectionScan && _lastRecordIdInserted)
+ if (_phase == IndexBuildPhaseEnum::kCollectionScan && _lastRecordIdInserted)
builder.append("collectionScanPosition", _lastRecordIdInserted->repr());
BSONArrayBuilder indexesArray(builder.subarrayStart("indexes"));
for (const auto& index : _indexes) {
BSONObjBuilder indexInfo(indexesArray.subobjStart());
- if (_phase == Phase::kCollectionScan || _phase == Phase::kBulkLoad) {
+ if (_phase == IndexBuildPhaseEnum::kCollectionScan ||
+ _phase == IndexBuildPhaseEnum::kBulkLoad) {
auto state = index.bulk->getSorterState();
indexInfo.append("tempDir", state.tempDir);
@@ -814,12 +823,16 @@ BSONObj MultiIndexBlock::_constructStateObject() const {
if (auto duplicateKeyTrackerTableIdent =
indexBuildInterceptor->getDuplicateKeyTrackerTableIdent())
- indexInfo.append("dupKeyTempTable", *duplicateKeyTrackerTableIdent);
+ indexInfo.append("duplicateKeyTrackerTable", *duplicateKeyTrackerTableIdent);
if (auto skippedRecordTrackerTableIdent =
indexBuildInterceptor->getSkippedRecordTracker()->getTableIdent())
indexInfo.append("skippedRecordTrackerTable", *skippedRecordTrackerTableIdent);
+ // TODO SERVER-49450: Consider only writing out the index name or key and getting the rest
+ // of the spec from the durable catalog on startup.
+ indexInfo.append("spec", index.block->getSpec());
+
indexInfo.done();
// Ensure the data we are referencing has been persisted to disk.
@@ -829,19 +842,4 @@ BSONObj MultiIndexBlock::_constructStateObject() const {
return builder.obj();
}
-
-std::string MultiIndexBlock::_phaseToString(Phase phase) const {
- switch (phase) {
- case Phase::kInitialized:
- return "initialized";
- case Phase::kCollectionScan:
- return "collection scan";
- case Phase::kBulkLoad:
- return "bulk load";
- case Phase::kDrainWrites:
- return "drain writes";
- }
- MONGO_UNREACHABLE;
-}
-
} // namespace mongo
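
For reference, the hand-written _phaseToString() removed above is superseded by the helpers the IDL compiler generates for string enums. A minimal sketch of the round trip; IndexBuildPhase_serializer appears in the hunks above, while the parse helper is an assumption based on the usual <EnumName>_parse convention for generated enums:

#include "mongo/db/resumable_index_builds_gen.h"
#include "mongo/idl/idl_parser.h"
#include "mongo/util/assert_util.h"

namespace mongo {
void examplePhaseRoundTrip() {
    // Enum -> string, exactly what _constructStateObject() persists above.
    StringData name = IndexBuildPhase_serializer(IndexBuildPhaseEnum::kBulkLoad);
    invariant(name == "bulk load");

    // String -> enum, for reading the state document back; an unrecognized
    // phase name makes the parser throw.
    IndexBuildPhaseEnum phase =
        IndexBuildPhase_parse(IDLParserErrorContext("IndexBuildPhase"), name);
    invariant(phase == IndexBuildPhaseEnum::kBulkLoad);
}
}  // namespace mongo

Centralizing the phase names in the IDL enum keeps the in-memory value and the persisted string from drifting apart.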
diff --git a/src/mongo/db/catalog/multi_index_block.h b/src/mongo/db/catalog/multi_index_block.h
index 314800a1743..56fb44bffa3 100644
--- a/src/mongo/db/catalog/multi_index_block.h
+++ b/src/mongo/db/catalog/multi_index_block.h
@@ -46,6 +46,7 @@
#include "mongo/db/index/index_access_method.h"
#include "mongo/db/index/index_build_interceptor.h"
#include "mongo/db/record_id.h"
+#include "mongo/db/resumable_index_builds_gen.h"
#include "mongo/platform/mutex.h"
#include "mongo/util/fail_point.h"
@@ -297,8 +298,6 @@ private:
InsertDeleteOptions options;
};
- enum class Phase { kInitialized, kCollectionScan, kBulkLoad, kDrainWrites };
-
void _abortWithoutCleanup(OperationContext* opCtx, bool shutdown);
bool _shouldWriteStateToDisk(OperationContext* opCtx, bool shutdown) const;
@@ -307,7 +306,6 @@ private:
BSONObj _constructStateObject() const;
- std::string _phaseToString(Phase phase) const;
// Is set during init() and ensures subsequent function calls act on the same Collection.
boost::optional<UUID> _collectionUUID;
@@ -331,6 +329,6 @@ private:
boost::optional<RecordId> _lastRecordIdInserted;
// The current phase of the index build.
- Phase _phase = Phase::kInitialized;
+ IndexBuildPhaseEnum _phase = IndexBuildPhaseEnum::kInitialized;
};
} // namespace mongo
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index 3835961496c..a5d29ee04fb 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -1321,8 +1321,10 @@ IndexBuilds IndexBuildsCoordinator::stopIndexBuildsForRollback(OperationContext*
return buildsStopped;
}
-void IndexBuildsCoordinator::restartIndexBuildsForRecovery(OperationContext* opCtx,
- const IndexBuilds& buildsToRestart) {
+void IndexBuildsCoordinator::restartIndexBuildsForRecovery(
+ OperationContext* opCtx,
+ const IndexBuilds& buildsToRestart,
+ const std::vector<ResumeIndexInfo>& buildsToResume) {
for (auto& [buildUUID, build] : buildsToRestart) {
boost::optional<NamespaceString> nss =
CollectionCatalog::get(opCtx).lookupNSSByUUID(opCtx, build.collUUID);
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index 56df844a0e5..c78134998db 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -46,6 +46,7 @@
#include "mongo/db/rebuild_indexes.h"
#include "mongo/db/repl/oplog_entry.h"
#include "mongo/db/repl_index_build_state.h"
+#include "mongo/db/resumable_index_builds_gen.h"
#include "mongo/db/storage/durable_catalog.h"
#include "mongo/executor/task_executor.h"
#include "mongo/executor/thread_pool_task_executor.h"
@@ -146,11 +147,13 @@ public:
IndexBuildOptions indexBuildOptions) = 0;
/**
- * Given a set of two-phase index builds, start, but do not complete each one in a background
- * thread. Each index build will wait for a replicated commit or abort, as in steady-state
- * replication.
+ * Resumes and restarts index builds for recovery. Anything that fails to resume will be
+ * restarted in a background thread. Each index build will wait for a replicated commit or
+ * abort, as in steady-state replication.
*/
- void restartIndexBuildsForRecovery(OperationContext* opCtx, const IndexBuilds& buildsToRestart);
+ void restartIndexBuildsForRecovery(OperationContext* opCtx,
+ const IndexBuilds& buildsToRestart,
+ const std::vector<ResumeIndexInfo>& buildsToResume);
/**
* Runs the full index rebuild for recovery. This will only rebuild single-phase index builds.
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
index 12b86993a2a..e74f850acfb 100644
--- a/src/mongo/db/repl/rs_rollback.cpp
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -1669,7 +1669,7 @@ void rollback_internal::syncFixUp(OperationContext* opCtx,
LOGV2(21707, "Restarting rolled-back committed or aborted index builds");
IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
- opCtx, fixUpInfo.indexBuildsToRestart);
+ opCtx, fixUpInfo.indexBuildsToRestart, {});
LOGV2(21708,
"Deleting and updating documents to roll back insert, update and remove "
diff --git a/src/mongo/db/resumable_index_builds.idl b/src/mongo/db/resumable_index_builds.idl
new file mode 100644
index 00000000000..9f8fc4a6818
--- /dev/null
+++ b/src/mongo/db/resumable_index_builds.idl
@@ -0,0 +1,123 @@
+# Copyright (C) 2020-present MongoDB, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the Server Side Public License, version 1,
+# as published by MongoDB, Inc.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Server Side Public License for more details.
+#
+# You should have received a copy of the Server Side Public License
+# along with this program. If not, see
+# <http://www.mongodb.com/licensing/server-side-public-license>.
+#
+# As a special exception, the copyright holders give permission to link the
+# code of portions of this program with the OpenSSL library under certain
+# conditions as described in each individual source file and distribute
+# linked combinations including the program with the OpenSSL library. You
+# must comply with the Server Side Public License in all respects for
+# all of the code used other than as permitted herein. If you modify file(s)
+# with this exception, you may extend this exception to your version of the
+# file(s), but you are not obligated to do so. If you do not wish to do so,
+# delete this exception statement from your version. If you delete this
+# exception statement from all source files in the program, then also delete
+# it in the license file.
+
+# This IDL file describes the BSON format for ResumeIndexInfo,
+# IndexSorterInfo and SorterRange, and handles the serialization to
+# and deserialization from their BSON representations for those
+# classes.
+
+global:
+ cpp_namespace: "mongo"
+ cpp_includes:
+ - "mongo/util/uuid.h"
+
+imports:
+ - "mongo/idl/basic_types.idl"
+
+enums:
+ IndexBuildPhase:
+ description: "Phase of a hybrid index build"
+ type: string
+ values:
+ kInitialized: "initialized"
+ kCollectionScan: "collection scan"
+ kBulkLoad: "bulk load"
+ kDrainWrites: "drain writes"
+
+structs:
+ SorterRange:
+ description: "The range of data that was sorted and spilled to disk"
+ strict: true
+ fields:
+ startOffset:
+ description: "Tracks where in the file we started writing this data range"
+ type: long
+ endOffset:
+ description: "Tracks where in the file we finished writing this data range"
+ type: long
+ checksum:
+ description: "Keeps track of the hash of all data objects spilled to disk"
+ type: long
+ validator: { gte: 0 }
+
+ IndexSorterInfo:
+ description: "The information to resume the sorter for an index build"
+ strict: true
+ fields:
+ sideWritesTable:
+ description: "The name of the ident associated with the side writes table for this
+ index build"
+ type: string
+ duplicateKeyTrackerTable:
+ description: "The name of the ident associated with the duplicate key tracker table
+ for this index build"
+ type: string
+ optional: true
+ skippedRecordTrackerTable:
+ description: "The name of the ident associated with the skipped record tracker table
+ for this index build"
+ type: string
+ optional: true
+ tempDir:
+ description: "The directory into which we place a file when spilling data to disk"
+ type: string
+ optional: true
+ fileName:
+ description: "The name of the file that sorted data is written to"
+ type: string
+ optional: true
+ ranges:
+ description: "All ranges of data that were already sorted and spilled to disk"
+ type: array<SorterRange>
+ optional: true
+ spec:
+ description: "The index specification"
+ type: object
+
+ ResumeIndexInfo:
+ description: "Information needed to resume index builds"
+ strict: true
+ fields:
+ _id:
+ description: "A UUID that uniquely identifies the index build across replica set
+ members."
+ cpp_name: buildUUID
+ type: uuid
+ phase:
+ description: "The phase the index build was in when the node shut down"
+ type: IndexBuildPhase
+ collectionUUID:
+ description: "A UUID that uniquely identifies which collection the index is being
+ built on"
+ type: uuid
+ collectionScanPosition:
+ description: "The last record id inserted into the sorter before shutdown"
+ type: long
+ optional: true
+ indexes:
+ description: "The information needed to resume each specific index in this build"
+ type: array<IndexSorterInfo>
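
To make the persisted format concrete, the following sketch builds a document shaped like the state object MultiIndexBlock::_constructStateObject() writes and round-trips it through the generated parser, the same ResumeIndexInfo::parse call used by storage_engine_impl.cpp below. The ident name and index spec are made-up illustrative values:

#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/db/resumable_index_builds_gen.h"
#include "mongo/idl/idl_parser.h"
#include "mongo/util/uuid.h"

namespace mongo {
void exampleParseResumeInfo() {
    // Hypothetical state document; field names follow the IDL above.
    BSONObjBuilder builder;
    UUID::gen().appendToBuilder(&builder, "_id");
    builder.append("phase", "collection scan");
    UUID::gen().appendToBuilder(&builder, "collectionUUID");
    builder.append("collectionScanPosition", 12345LL);
    builder.append("indexes",
                   BSON_ARRAY(BSON("sideWritesTable"
                                   << "index-build-side-writes-ident"
                                   << "spec"
                                   << BSON("v" << 2 << "key" << BSON("a" << 1)
                                               << "name"
                                               << "a_1"))));

    // The generated parser throws if a required field is missing or a type is
    // wrong, which is what lets _handleInternalIdents() fall back to
    // restarting the build instead of resuming it.
    auto resumeInfo =
        ResumeIndexInfo::parse(IDLParserErrorContext("ResumeIndexInfo"), builder.obj());
    invariant(resumeInfo.getPhase() == IndexBuildPhaseEnum::kCollectionScan);
}
}  // namespace mongo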
diff --git a/src/mongo/db/startup_recovery.cpp b/src/mongo/db/startup_recovery.cpp
index cbc10c7be7a..9e0e5bed487 100644
--- a/src/mongo/db/startup_recovery.cpp
+++ b/src/mongo/db/startup_recovery.cpp
@@ -390,7 +390,7 @@ void reconcileCatalogAndRebuildUnfinishedIndexes(OperationContext* opCtx,
// not build any indexes to completion, but rather start the background thread to build the
// index, and wait for a replicated commit or abort oplog entry.
IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
- opCtx, reconcileResult.indexBuildsToRestart);
+ opCtx, reconcileResult.indexBuildsToRestart, reconcileResult.indexBuildsToResume);
}
/**
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index 5384d1852c3..fc8a18d2c20 100644
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -524,6 +524,7 @@ env.Library(
LIBDEPS_PRIVATE=[
'$BUILD_DIR/mongo/db/storage/storage_repair_observer',
'$BUILD_DIR/mongo/db/catalog/collection_catalog_helper',
+ '$BUILD_DIR/mongo/db/resumable_index_builds_idl',
'$BUILD_DIR/mongo/db/vector_clock',
'two_phase_index_build_knobs_idl',
],
diff --git a/src/mongo/db/storage/kv/storage_engine_test.cpp b/src/mongo/db/storage/kv/storage_engine_test.cpp
index b34c3acd2ce..37bca4130f0 100644
--- a/src/mongo/db/storage/kv/storage_engine_test.cpp
+++ b/src/mongo/db/storage/kv/storage_engine_test.cpp
@@ -93,6 +93,7 @@ TEST_F(StorageEngineTest, ReconcileIdentsTest) {
reconcileResult = unittest::assertGet(reconcile(opCtx.get()));
ASSERT_EQUALS(1UL, reconcileResult.indexesToRebuild.size());
ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
StorageEngine::IndexIdentifier& toRebuild = reconcileResult.indexesToRebuild[0];
ASSERT_EQUALS("db.coll1", toRebuild.nss.ns());
@@ -148,6 +149,7 @@ TEST_F(StorageEngineTest, ReconcileDropsTemporary) {
auto reconcileResult = unittest::assertGet(reconcile(opCtx.get()));
ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
// The storage engine is responsible for dropping its temporary idents.
ASSERT(!identExists(opCtx.get(), ident));
@@ -170,9 +172,14 @@ TEST_F(StorageEngineTest, ReconcileKeepsTemporary) {
ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
- // The storage engine does not drop its temporary idents outside of starting up after an
- // unclean shutdown.
- ASSERT(identExists(opCtx.get(), ident));
+ // TODO SERVER-49847: Clean up when the feature is turned on by default.
+ if (_storageEngine->supportsResumableIndexBuilds()) {
+ // The storage engine does not drop its temporary idents outside of starting up after an
+ // unclean shutdown.
+ ASSERT(identExists(opCtx.get(), ident));
+ } else {
+ ASSERT_FALSE(identExists(opCtx.get(), ident));
+ }
rs->finalizeTemporaryTable(opCtx.get(), TemporaryRecordStore::FinalizationAction::kDelete);
}
@@ -231,8 +238,9 @@ TEST_F(StorageEngineTest, ReconcileUnfinishedIndex) {
// not require it to be rebuilt.
ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
- // There are no two-phase builds to restart.
+ // There are no two-phase builds to resume or restart.
ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
}
TEST_F(StorageEngineTest, ReconcileUnfinishedBackgroundSecondaryIndex) {
@@ -272,8 +280,9 @@ TEST_F(StorageEngineTest, ReconcileUnfinishedBackgroundSecondaryIndex) {
ASSERT_EQUALS(ns.ns(), toRebuild.nss.ns());
ASSERT_EQUALS(indexName, toRebuild.indexName);
- // There are no two-phase builds to restart.
+ // There are no two-phase builds to restart or resume.
ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
}
TEST_F(StorageEngineTest, ReconcileTwoPhaseIndexBuilds) {
@@ -332,6 +341,9 @@ TEST_F(StorageEngineTest, ReconcileTwoPhaseIndexBuilds) {
ASSERT_EQ(2UL, specs.size());
ASSERT_EQ(indexA, specs[0]["name"].str());
ASSERT_EQ(indexB, specs[1]["name"].str());
+
+ // There should be no index builds to resume.
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
}
TEST_F(StorageEngineRepairTest, LoadCatalogRecoversOrphans) {
@@ -373,6 +385,7 @@ TEST_F(StorageEngineRepairTest, ReconcileSucceeds) {
auto reconcileResult = unittest::assertGet(reconcile(opCtx.get()));
ASSERT_EQUALS(0UL, reconcileResult.indexesToRebuild.size());
ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToRestart.size());
+ ASSERT_EQUALS(0UL, reconcileResult.indexBuildsToResume.size());
ASSERT(!identExists(opCtx.get(), swCollInfo.getValue().ident));
ASSERT(collectionExists(opCtx.get(), collNs));
diff --git a/src/mongo/db/storage/storage_engine.h b/src/mongo/db/storage/storage_engine.h
index 44854b5e7dc..1832b0c4d23 100644
--- a/src/mongo/db/storage/storage_engine.h
+++ b/src/mongo/db/storage/storage_engine.h
@@ -37,6 +37,7 @@
#include "mongo/bson/bsonobj.h"
#include "mongo/bson/timestamp.h"
#include "mongo/db/catalog/index_builds.h"
+#include "mongo/db/resumable_index_builds_gen.h"
#include "mongo/db/storage/temporary_record_store.h"
#include "mongo/util/functional.h"
#include "mongo/util/str.h"
@@ -538,6 +539,10 @@ public:
// not to completion; they will wait for replicated commit or abort operations. This is a
// mapping from index build UUID to index build.
IndexBuilds indexBuildsToRestart;
+
+ // List of index builds to be resumed. Each ResumeIndexInfo may contain multiple indexes to
+ // resume as part of the same build.
+ std::vector<ResumeIndexInfo> indexBuildsToResume;
};
/**
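
A minimal sketch of how a recovery caller consumes the new field alongside the existing one, mirroring the catalog_control.cpp and startup_recovery.cpp call sites above (the function name is illustrative):

#include "mongo/db/index_builds_coordinator.h"
#include "mongo/db/storage/storage_engine.h"

namespace mongo {
void exampleHandleReconcileResult(OperationContext* opCtx,
                                  const StorageEngine::ReconcileResult& reconcileResult) {
    // Builds with usable resume state pick up where the sorter left off;
    // all other unfinished two-phase builds are restarted from scratch.
    IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
        opCtx, reconcileResult.indexBuildsToRestart, reconcileResult.indexBuildsToResume);
}
}  // namespace mongo

Note that rs_rollback.cpp passes an empty vector for the resume list, since rolled-back builds are always restarted from scratch.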
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index 33d6cc6caac..1803a412bf5 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -320,6 +320,63 @@ Status StorageEngineImpl::_recoverOrphanedCollection(OperationContext* opCtx,
return Status::OK();
}
+bool StorageEngineImpl::_handleInternalIdents(OperationContext* opCtx,
+ const std::string& ident,
+ ReconcileResult* reconcileResult,
+ std::set<std::string>* internalIdentsToDrop) {
+ if (!_catalog->isInternalIdent(ident)) {
+ return false;
+ }
+ // When starting up after an unclean shutdown, or when the storage engine does not support
+ // resumable index builds, we do not attempt to recover any state from the internal idents.
+ // Thus, we drop them in these cases.
+ if (startingAfterUncleanShutdown(opCtx->getServiceContext()) ||
+ !supportsResumableIndexBuilds()) {
+ internalIdentsToDrop->insert(ident);
+ return true;
+ }
+
+ // When starting up after a clean shutdown and resumable index builds are supported, find the
+ // internal idents that contain the relevant information to resume each index build and recover
+ // the state.
+ auto rs = _engine->getRecordStore(opCtx, "", ident, CollectionOptions());
+
+ // Look at the contents to determine whether this ident will contain information for
+ // resuming an index build.
+ // TODO SERVER-49125: differentiate the internal idents without looking at the contents.
+ auto cursor = rs->getCursor(opCtx);
+ auto record = cursor->next();
+ if (record) {
+ auto doc = record.get().data.toBson();
+
+ // Parse the document here so that we can restart the build if it doesn't contain all
+ // the information necessary to resume the index build.
+ if (doc.hasField("phase")) {
+ ResumeIndexInfo resumeInfo;
+ try {
+ resumeInfo = ResumeIndexInfo::parse(IDLParserErrorContext("ResumeIndexInfo"), doc);
+ } catch (const DBException& e) {
+ LOGV2(4916300, "Failed to parse resumable index info", "error"_attr = e.toStatus());
+
+ // Ignore the error so that we can restart the index build instead of resuming it, and
+ // drop the internal ident since we failed to parse it.
+ internalIdentsToDrop->insert(ident);
+ return true;
+ }
+
+ reconcileResult->indexBuildsToResume.push_back(resumeInfo);
+
+ LOGV2(4916301,
+ "Found unfinished index build to resume",
+ "buildUUID"_attr = resumeInfo.getBuildUUID(),
+ "collectionUUID"_attr = resumeInfo.getCollectionUUID(),
+ "phase"_attr = IndexBuildPhase_serializer(resumeInfo.getPhase()));
+ return true;
+ }
+ }
+
+ return false;
+}
+
/**
* This method reconciles differences between idents the KVEngine is aware of and the
* DurableCatalog. There are three differences to consider:
@@ -366,16 +423,13 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
// Drop all idents in the storage engine that are not known to the catalog. This can happen in
// the case of a collection or index creation being rolled back.
+ StorageEngine::ReconcileResult reconcileResult;
for (const auto& it : engineIdents) {
if (catalogIdents.find(it) != catalogIdents.end()) {
continue;
}
- // When starting up after an unclean shutdown, we do not attempt to recover any state from
- // the internal idents. Thus, we drop them in this case.
- if (startingAfterUncleanShutdown(opCtx->getServiceContext()) &&
- _catalog->isInternalIdent(it)) {
- internalIdentsToDrop.insert(it);
+ if (_handleInternalIdents(opCtx, it, &reconcileResult, &internalIdentsToDrop)) {
continue;
}
@@ -425,7 +479,6 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
//
// Also, remove unfinished builds except those that were background index builds started on a
// secondary.
- StorageEngine::ReconcileResult ret;
for (DurableCatalog::Entry entry : catalogEntries) {
BSONCollectionCatalogEntry::MetaData metaData =
_catalog->getMetaData(opCtx, entry.catalogId);
@@ -472,13 +525,14 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
"Expected index data is missing, rebuilding",
"index"_attr = indexName,
"namespace"_attr = coll);
- ret.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
+ reconcileResult.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
continue;
}
// Any index build with a UUID is an unfinished two-phase build and must be restarted.
// There are no special cases to handle on primaries or secondaries. An index build may
- // be associated with multiple indexes.
+ // be associated with multiple indexes. We should only restart an index build if we
+ // aren't going to resume it.
if (indexMetaData.buildUUID) {
invariant(!indexMetaData.ready);
@@ -496,10 +550,11 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
"buildUUID"_attr = buildUUID);
// Insert in the map if a build has not already been registered.
- auto existingIt = ret.indexBuildsToRestart.find(buildUUID);
- if (existingIt == ret.indexBuildsToRestart.end()) {
- ret.indexBuildsToRestart.insert({buildUUID, IndexBuildDetails(*collUUID)});
- existingIt = ret.indexBuildsToRestart.find(buildUUID);
+ auto existingIt = reconcileResult.indexBuildsToRestart.find(buildUUID);
+ if (existingIt == reconcileResult.indexBuildsToRestart.end()) {
+ reconcileResult.indexBuildsToRestart.insert(
+ {buildUUID, IndexBuildDetails(*collUUID)});
+ existingIt = reconcileResult.indexBuildsToRestart.find(buildUUID);
}
existingIt->second.indexSpecs.emplace_back(indexMetaData.spec);
@@ -516,7 +571,7 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
"- see SERVER-43097",
"namespace"_attr = coll,
"index"_attr = indexName);
- ret.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
+ reconcileResult.indexesToRebuild.push_back({entry.catalogId, coll, indexName});
continue;
}
@@ -557,7 +612,7 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
wuow.commit();
}
- return ret;
+ return reconcileResult;
}
std::string StorageEngineImpl::getFilesystemPathForDb(const std::string& dbName) const {
diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h
index 6ef1a605992..897d04f4b94 100644
--- a/src/mongo/db/storage/storage_engine_impl.h
+++ b/src/mongo/db/storage/storage_engine_impl.h
@@ -383,6 +383,15 @@ private:
*/
void _onMinOfCheckpointAndOldestTimestampChanged(const Timestamp& timestamp);
+ /**
+ * Returns true if the given ident was handled as an internal ident: either marked for drop
+ * in 'internalIdentsToDrop' or used to populate the index builds to resume in
+ * 'reconcileResult'. Returns false if the ident is not an internal ident or requires no
+ * special handling.
+ */
+ bool _handleInternalIdents(OperationContext* opCtx,
+ const std::string& ident,
+ ReconcileResult* reconcileResult,
+ std::set<std::string>* internalIdentsToDrop);
+
class RemoveDBChange;
// This must be the first member so it is destroyed last.
diff --git a/src/mongo/util/uuid.h b/src/mongo/util/uuid.h
index 200d02e50bc..33741ff3476 100644
--- a/src/mongo/util/uuid.h
+++ b/src/mongo/util/uuid.h
@@ -85,6 +85,7 @@ class UUID {
friend class repl::OplogEntryBase;
friend class repl::DurableReplOperation;
friend class repl::InitialSyncIdDocument;
+ friend class ResumeIndexInfo;
friend class ResumeTokenInternal;
friend class ShardCollectionTypeBase;
friend class TenantMigrationDonorDocument;