From 300458bb1ed89940649560e92ff84c81a4a77678 Mon Sep 17 00:00:00 2001 From: Louis Williams Date: Thu, 21 Nov 2019 22:36:46 +0000 Subject: SERVER-44467 Implement startup recovery for two-phase index builds --- etc/evergreen.yml | 6 +- jstests/noPassthrough/backup_restore_fsync_lock.js | 3 - jstests/noPassthrough/backup_restore_rolling.js | 3 - jstests/noPassthrough/backup_restore_stop_start.js | 3 - .../characterize_index_builds_on_restart.js | 2 +- jstests/noPassthrough/indexbg_shutdown.js | 3 - .../indexbg_restart_secondary.js | 42 +-- .../replsets/libs/rollback_index_builds_test.js | 6 + .../replsets/rollback_index_build_start_abort.js | 20 ++ src/mongo/db/catalog/multi_index_block.cpp | 2 +- src/mongo/db/index_builds_coordinator.cpp | 298 ++++++++++++--------- src/mongo/db/index_builds_coordinator.h | 32 ++- src/mongo/db/repair_database_and_check_version.cpp | 9 +- src/mongo/db/repl/oplog.cpp | 6 +- src/mongo/db/repl/oplog_applier_impl.cpp | 14 +- ...replication_coordinator_external_state_impl.cpp | 5 + src/mongo/db/storage/storage_engine_impl.cpp | 2 +- 17 files changed, 278 insertions(+), 178 deletions(-) create mode 100644 jstests/replsets/rollback_index_build_start_abort.js diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 5303a7080a2..57c58aae543 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -10174,14 +10174,14 @@ buildvariants: - name: compile_TG distros: - rhel62-large - - name: .concurrency !.large !.ubsan !.no_txns !.stepdowns !.kill_terminate !.debug_only + - name: .concurrency !.large !.ubsan !.no_txns !.debug_only - name: .jscore .common - name: noPassthrough_gen - name: noPassthroughWithMongod_gen - - name: replica_sets + - name: .replica_sets !.encrypt !.auth distros: - rhel62-large - - name: rollback_fuzzer_gen + - name: .rollbackfuzzer - name: sharding_gen - name: enterprise-rhel-62-64-bit-coverage diff --git a/jstests/noPassthrough/backup_restore_fsync_lock.js b/jstests/noPassthrough/backup_restore_fsync_lock.js index 57b12a6ab24..f6f720377aa 100644 --- a/jstests/noPassthrough/backup_restore_fsync_lock.js +++ b/jstests/noPassthrough/backup_restore_fsync_lock.js @@ -11,12 +11,9 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. * @tags: [ * requires_persistence, * requires_replication, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/backup_restore_rolling.js b/jstests/noPassthrough/backup_restore_rolling.js index 358f33b0194..63f6b1a7cd2 100644 --- a/jstests/noPassthrough/backup_restore_rolling.js +++ b/jstests/noPassthrough/backup_restore_rolling.js @@ -11,12 +11,9 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. 
* @tags: [ * requires_persistence, * requires_wiredtiger, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/backup_restore_stop_start.js b/jstests/noPassthrough/backup_restore_stop_start.js index 71ad88c7dd5..2598b8670ab 100644 --- a/jstests/noPassthrough/backup_restore_stop_start.js +++ b/jstests/noPassthrough/backup_restore_stop_start.js @@ -11,12 +11,9 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. * @tags: [ * requires_persistence, * requires_replication, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/characterize_index_builds_on_restart.js b/jstests/noPassthrough/characterize_index_builds_on_restart.js index 64c2ba5fe91..2f500684201 100644 --- a/jstests/noPassthrough/characterize_index_builds_on_restart.js +++ b/jstests/noPassthrough/characterize_index_builds_on_restart.js @@ -99,7 +99,7 @@ function startIndexBuildOnSecondaryAndLeaveUnfinished(primaryDB, writeConcern, s checkLog.containsWithCount( secondaryDB, "Index build interrupted due to \'leaveIndexBuildUnfinishedForShutdown\' " + - "failpoint. Mimicing shutdown error code.", + "failpoint. Mimicking shutdown error code.", expectedFailPointMessageCount); // Wait until the secondary has a recovery timestamp beyond the index oplog entry. On diff --git a/jstests/noPassthrough/indexbg_shutdown.js b/jstests/noPassthrough/indexbg_shutdown.js index 4e9bc9cf016..a8a5781b3ed 100644 --- a/jstests/noPassthrough/indexbg_shutdown.js +++ b/jstests/noPassthrough/indexbg_shutdown.js @@ -4,11 +4,8 @@ * shuts down cleanly, without an fassert. * Also confirms that killOp has no effect on the background index build on the secondary. * - * TODO(SERVER-44467): Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. * @tags: [ * requires_replication, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js index e12973b7ff8..462236d8c63 100644 --- a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js +++ b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js @@ -3,20 +3,18 @@ * Kills the secondary once the index build starts with a failpoint. * The index build should resume when the secondary is restarted. * - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. - * * @tags: [ * requires_persistence, * requires_journaling, * requires_replication, - * two_phase_index_builds_unsupported, * ] */ (function() { 'use strict'; +load('jstests/noPassthrough/libs/index_build.js'); + // Set up replica set var replTest = new ReplSetTest({name: 'bgIndex', nodes: 3}); var nodes = replTest.nodeList(); @@ -57,19 +55,31 @@ replTest.awaitReplication(); assert.commandWorked(secondDB.adminCommand( {configureFailPoint: 'leaveIndexBuildUnfinishedForShutdown', mode: 'alwaysOn'})); + try { - coll.createIndex({i: 1}, {background: true}); - masterDB.getLastError(2); - assert.eq(2, coll.getIndexes().length); - - // Make sure all writes are durable on the secondary so that we can restart it knowing that - // the index build will be found on startup. 
- // Waiting for durable is important for both (A) the record that we started the index build - // so it is rebuild on restart, and (B) the update to minvalid to show that we've already - // applied the oplog entry so it isn't replayed. If (A) is present without (B), then there - // are two ways that the index can be rebuilt on startup and this test is only for the one - // triggered by (A). - secondDB.adminCommand({fsync: 1}); + if (IndexBuildTest.supportsTwoPhaseIndexBuild(master)) { + // Pause the index build on the secondary to wait for it to start. + IndexBuildTest.pauseIndexBuilds(secondDB); + IndexBuildTest.startIndexBuild(master, coll.getFullName(), {i: 1}); + + // Wait for build to start on the secondary. + jsTestLog("waiting for index build to start on secondary"); + IndexBuildTest.waitForIndexBuildToStart(secondDB); + IndexBuildTest.resumeIndexBuilds(secondDB); + } else { + coll.createIndex({i: 1}, {background: true}); + masterDB.getLastError(2); + assert.eq(2, coll.getIndexes().length); + + // Make sure all writes are durable on the secondary so that we can restart it knowing that + // the index build will be found on startup. + // Waiting for durable is important for both (A) the record that we started the index build + // so it is rebuild on restart, and (B) the update to minvalid to show that we've already + // applied the oplog entry so it isn't replayed. If (A) is present without (B), then there + // are two ways that the index can be rebuilt on startup and this test is only for the one + // triggered by (A). + secondDB.adminCommand({fsync: 1}); + } } finally { assert.commandWorked(secondDB.adminCommand( {configureFailPoint: 'leaveIndexBuildUnfinishedForShutdown', mode: 'off'})); diff --git a/jstests/replsets/libs/rollback_index_builds_test.js b/jstests/replsets/libs/rollback_index_builds_test.js index f751d5c92a4..2d563454a9b 100644 --- a/jstests/replsets/libs/rollback_index_builds_test.js +++ b/jstests/replsets/libs/rollback_index_builds_test.js @@ -80,6 +80,12 @@ class RollbackIndexBuildsTest { IndexBuildTest.resumeIndexBuilds(primary); IndexBuildTest.waitForIndexBuildToStop(primaryDB, collName, "a_1"); break; + case "abort": + const opId = IndexBuildTest.getIndexBuildOpId(primaryDB, collName, "a_1"); + assert.commandWorked(primaryDB.killOp(opId)); + IndexBuildTest.resumeIndexBuilds(primary); + IndexBuildTest.waitForIndexBuildToStop(primaryDB, collName, "a_1"); + break; case "drop": collection.dropIndexes(indexSpec); break; diff --git a/jstests/replsets/rollback_index_build_start_abort.js b/jstests/replsets/rollback_index_build_start_abort.js new file mode 100644 index 00000000000..eb65327f838 --- /dev/null +++ b/jstests/replsets/rollback_index_build_start_abort.js @@ -0,0 +1,20 @@ +/** + * Tests different permutations of rolling-back index build start and abort oplog entries. + */ +(function() { +"use strict"; + +// for RollbackIndexBuildTest +load('jstests/replsets/libs/rollback_index_builds_test.js'); + +const rollbackIndexTest = new RollbackIndexBuildsTest(); + +// Build a schedule of operations interleaving rollback and an index build. +const rollbackOps = ["holdStableTimestamp", "transitionToRollback"]; +const indexBuildOps = ["start", "abort"]; + +// This generates 4 choose 2, or 6 schedules. 
+const schedules = RollbackIndexBuildsTest.makeSchedules(rollbackOps, indexBuildOps); +rollbackIndexTest.runSchedules(schedules); +rollbackIndexTest.stop(); +})(); diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp index 4b8b50a2ff9..91f4d9e383d 100644 --- a/src/mongo/db/catalog/multi_index_block.cpp +++ b/src/mongo/db/catalog/multi_index_block.cpp @@ -542,7 +542,7 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx, if (MONGO_unlikely(leaveIndexBuildUnfinishedForShutdown.shouldFail())) { log() << "Index build interrupted due to 'leaveIndexBuildUnfinishedForShutdown' failpoint. " - "Mimicing shutdown error code."; + "Mimicking shutdown error code."; return Status( ErrorCodes::InterruptedAtShutdown, "background index build interrupted due to failpoint. returning a shutdown error."); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 074ce083d53..d2f10674a28 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -428,16 +428,6 @@ Status IndexBuildsCoordinator::_startIndexBuildForRecovery(OperationContext* opC return Status::OK(); } -void IndexBuildsCoordinator::joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID) { - auto replStateResult = _getIndexBuild(buildUUID); - if (!replStateResult.isOK()) { - return; - } - auto replState = replStateResult.getValue(); - auto fut = replState->sharedPromise.getFuture(); - log() << "Index build joined: " << buildUUID << ": " << fut.waitNoThrow(opCtx); -} - void IndexBuildsCoordinator::waitForAllIndexBuildsToStopForShutdown() { stdx::unique_lock lk(_mutex); @@ -492,16 +482,39 @@ void IndexBuildsCoordinator::abortDatabaseIndexBuilds(StringData db, const std:: dbIndexBuilds->waitUntilNoIndexBuildsRemain(lk); } -void IndexBuildsCoordinator::commitIndexBuild(OperationContext* opCtx, - const std::vector& specs, - const UUID& buildUUID) { +void IndexBuildsCoordinator::signalCommitAndWait(OperationContext* opCtx, const UUID& buildUUID) { auto replState = uassertStatusOK(_getIndexBuild(buildUUID)); - stdx::unique_lock lk(replState->mutex); - replState->isCommitReady = true; - replState->commitTimestamp = opCtx->recoveryUnit()->getCommitTimestamp(); - invariant(!replState->commitTimestamp.isNull(), buildUUID.toString()); - replState->condVar.notify_all(); + { + stdx::unique_lock lk(replState->mutex); + replState->isCommitReady = true; + replState->commitTimestamp = opCtx->recoveryUnit()->getCommitTimestamp(); + invariant(!replState->commitTimestamp.isNull(), buildUUID.toString()); + replState->condVar.notify_all(); + } + auto fut = replState->sharedPromise.getFuture(); + log() << "Index build joined after commit: " << buildUUID << ": " << fut.waitNoThrow(opCtx); + + // Throws if there was an error building the index. + fut.get(); +} + +void IndexBuildsCoordinator::signalAbortAndWait(OperationContext* opCtx, + const UUID& buildUUID, + const std::string& reason) noexcept { + abortIndexBuildByBuildUUID(opCtx, buildUUID, reason); + + // Because we replicate abort oplog entries for single-phase builds, it is possible to receive + // an abort for a non-existent index build. Abort should always succeed, so suppress the error. 
+ auto replStateResult = _getIndexBuild(buildUUID); + if (!replStateResult.isOK()) { + log() << "ignoring error while aborting index build " << buildUUID << ": " + << replStateResult.getStatus(); + return; + } + auto replState = replStateResult.getValue(); + auto fut = replState->sharedPromise.getFuture(); + log() << "Index build joined after abort: " << buildUUID << ": " << fut.waitNoThrow(opCtx); } void IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx, @@ -1115,6 +1128,115 @@ void IndexBuildsCoordinator::_runIndexBuild(OperationContext* opCtx, } } +void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterFailure( + OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status) { + if (status == ErrorCodes::InterruptedAtShutdown) { + // Leave it as-if kill -9 happened. Startup recovery will rebuild the index. + _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down"); + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + return; + } + + // If the index build was not completed successfully, we'll need to acquire some locks to + // clean it up. + UninterruptibleLockGuard noInterrupt(opCtx->lockState()); + + NamespaceString nss = collection->ns(); + Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); + + if (indexBuildOptions.replSetAndNotPrimaryAtStart) { + // This build started and failed as a secondary. Single-phase index builds started on + // secondaries may not fail. Do not clean up the index build. It must remain unfinished + // until it is successfully rebuilt on startup. + fassert(31354, + status.withContext(str::stream() << "Index build: " << replState->buildUUID + << "; Database: " << replState->dbName)); + } + + Lock::CollectionLock collLock(opCtx, nss, MODE_X); + + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + if (replCoord->getSettings().usingReplSets() && replCoord->canAcceptWritesFor(opCtx, nss)) { + // We are currently a primary node. + // TODO(SERVER-44723): Stop replicating abortIndexBuild for single-phase index builds. This + // is unnecessary for single-phase builds. + auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, onCleanUpFn); + return; + } + + // If we started the build as a primary and are now unable to accept writes, this build was + // aborted due to a stepdown. + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); +} + +void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterFailure( + OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status) { + + if (status == ErrorCodes::InterruptedAtShutdown) { + // Leave it as-if kill -9 happened. Startup recovery will restart the index build. + _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down"); + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + return; + } + + // If the index build was not completed successfully, we'll need to acquire some locks to + // clean it up. 
+ UninterruptibleLockGuard noInterrupt(opCtx->lockState()); + + NamespaceString nss = collection->ns(); + Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); + + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + if (replCoord->getSettings().usingReplSets() && !replCoord->canAcceptWritesFor(opCtx, nss)) { + // We failed this index build as a secondary node. + + // Failed index builds should fatally assert on the secondary, except when the index build + // was stopped due to an explicit abort oplog entry or rollback. + if (status == ErrorCodes::IndexBuildAborted) { + // On a secondary, we should be able to obtain the timestamp for cleaning up the index + // build from the oplog entry unless the index build did not fail due to processing an + // abortIndexBuild oplog entry. This is the case if we were aborted due to rollback. + stdx::unique_lock lk(replState->mutex); + invariant(replState->aborted, replState->buildUUID.toString()); + Timestamp abortIndexBuildTimestamp = replState->abortTimestamp; + + // Unlock the RSTL to avoid deadlocks with state transitions. See SERVER-42824. + unlockRSTLForIndexCleanup(opCtx); + Lock::CollectionLock collLock(opCtx, nss, MODE_X); + + // TimestampBlock is a no-op if the abort timestamp is unset. + TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp); + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + return; + } + + fassert(51101, + status.withContext(str::stream() << "Index build: " << replState->buildUUID + << "; Database: " << replState->dbName)); + } + + // We are currently a primary node. Notify downstream nodes to abort their index builds with the + // same build UUID. + Lock::CollectionLock collLock(opCtx, nss, MODE_X); + auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; + _indexBuildsManager.tearDownIndexBuild(opCtx, collection, replState->buildUUID, onCleanUpFn); + return; +} + void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, std::shared_ptr replState, const IndexBuildOptions& indexBuildOptions) { @@ -1168,124 +1290,42 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, status = ex.toStatus(); } - if (status == ErrorCodes::InterruptedAtShutdown) { - // Leave it as-if kill -9 happened. This will be handled on restart. - _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down"); - - // On secondaries, a shutdown interruption status is part of normal operation and - // should be suppressed, unlike other errors which should be raised to the administrator's - // attention via a server crash. The server will attempt to recover the index build during - // the next startup. - // On primary and standalone nodes, the failed index build will not be replicated so it is - // okay to propagate the shutdown error to the client. - if (indexBuildOptions.replSetAndNotPrimaryAtStart) { - replState->stats.numIndexesAfter = replState->stats.numIndexesBefore; - status = Status::OK(); - } - } else if (IndexBuildProtocol::kTwoPhase == replState->protocol) { - // TODO (SERVER-40807): disabling the following code for the v4.2 release so it does not - // have downstream impact. - /* - // Only the primary node removes the index build entry, as the secondaries will - // replicate. 
- if (!replSetAndNotPrimary) { - auto removeStatus = removeIndexBuildEntry(opCtx, replState->buildUUID); - if (!removeStatus.isOK()) { - logFailure(removeStatus, nss, replState); - uassertStatusOK(removeStatus); - MONGO_UNREACHABLE; - } - } - */ - } - - NamespaceString nss; - { - // We do not hold a collection lock here, but we are protected against the collection being - // dropped while the index build is still registered for the collection -- until - // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to - // be stale just for logging purposes. - auto collection = - CollectionCatalog::get(opCtx).lookupCollectionByUUID(replState->collectionUUID); - invariant(collection, - str::stream() << "Collection with UUID " << replState->collectionUUID - << " should exist because an index build is in progress: " - << replState->buildUUID); - nss = collection->ns(); - - // If the index build was not completely successfully, we'll need to acquire some locks to - // clean it up. - if (!status.isOK()) { - UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - - Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); - - if (!indexBuildOptions.replSetAndNotPrimaryAtStart) { - auto replCoord = repl::ReplicationCoordinator::get(opCtx); - if (replCoord->getSettings().usingReplSets() && - replCoord->canAcceptWritesFor(opCtx, nss)) { - // We are currently a primary node. Notify downstream nodes to abort their index - // builds with the same build UUID. - Lock::CollectionLock collLock(opCtx, nss, MODE_X); - auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, onCleanUpFn); - } else { - // This index build was aborted because we are stepping down from primary. - unlockRSTLForIndexCleanup(opCtx); - Lock::CollectionLock collLock(opCtx, nss, MODE_X); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } - } else { - // We started this index build during oplog application as a secondary node. - Timestamp abortIndexBuildTimestamp; - if (status == ErrorCodes::IndexBuildAborted) { - // We are on a secondary. We should be able to obtain the timestamp for cleaning - // up the index build from the oplog entry unless the index build did not fail - // due to processing an abortIndexBuild oplog entry. For example, a unique index - // key violation would result in the index build failing on the secondary. - stdx::unique_lock lk(replState->mutex); - invariant(replState->aborted, replState->buildUUID.toString()); - abortIndexBuildTimestamp = replState->abortTimestamp; - } + // We do not hold a collection lock here, but we are protected against the collection being + // dropped while the index build is still registered for the collection -- until + // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to + // be stale just for logging purposes. 
+ auto collection = + CollectionCatalog::get(opCtx).lookupCollectionByUUID(replState->collectionUUID); + invariant(collection, + str::stream() << "Collection with UUID " << replState->collectionUUID + << " should exist because an index build is in progress: " + << replState->buildUUID); + NamespaceString nss = collection->ns(); - unlockRSTLForIndexCleanup(opCtx); - Lock::CollectionLock collLock(opCtx, nss, MODE_X); + if (status.isOK()) { + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - // TimestampBlock is a no-op if the abort timestamp is unset. - TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } - } else { - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } + log() << "Index build completed successfully: " << replState->buildUUID << ": " << nss + << " ( " << replState->collectionUUID + << " ). Index specs built: " << replState->indexSpecs.size() + << ". Indexes in catalog before build: " << replState->stats.numIndexesBefore + << ". Indexes in catalog after build: " << replState->stats.numIndexesAfter; + return; } - if (!status.isOK()) { - logFailure(status, nss, replState); + logFailure(status, nss, replState); - // Failed index builds should abort secondary oplog application, except when the index build - // was stopped due to processing an abortIndexBuild oplog entry. - if (indexBuildOptions.replSetAndNotPrimaryAtStart) { - if (status == ErrorCodes::IndexBuildAborted) { - return; - } - fassert(51101, - status.withContext(str::stream() << "Index build: " << replState->buildUUID - << "; Database: " << replState->dbName)); - } - - uassertStatusOK(status); - MONGO_UNREACHABLE; + if (IndexBuildProtocol::kSinglePhase == replState->protocol) { + _cleanUpSinglePhaseAfterFailure(opCtx, collection, replState, indexBuildOptions, status); + } else { + invariant(IndexBuildProtocol::kTwoPhase == replState->protocol, + str::stream() << replState->buildUUID); + _cleanUpTwoPhaseAfterFailure(opCtx, collection, replState, indexBuildOptions, status); } - log() << "Index build completed successfully: " << replState->buildUUID << ": " << nss << " ( " - << replState->collectionUUID << " ). Index specs built: " << replState->indexSpecs.size() - << ". Indexes in catalog before build: " << replState->stats.numIndexesBefore - << ". Indexes in catalog after build: " << replState->stats.numIndexesAfter; + // Any error that escapes at this point is not fatal and can be handled by the caller. + uassertStatusOK(status); } void IndexBuildsCoordinator::_buildIndex( diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h index 1f2b811d8a9..ab4fae20711 100644 --- a/src/mongo/db/index_builds_coordinator.h +++ b/src/mongo/db/index_builds_coordinator.h @@ -155,16 +155,18 @@ public: const UUID& buildUUID); /** - * Waits for the index build identified by 'buildUUID' to complete. + * Signals the index build identified by 'buildUUID' to commit, and waits for its thread to + * complete. Throws if there were any errors building the index. */ - void joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID); + void signalCommitAndWait(OperationContext* opCtx, const UUID& buildUUID); /** - * Commits the index build identified by 'buildUUID'. 
+ * Signals the index build identified by 'buildUUID' to abort, and waits for its thread to + * complete. */ - void commitIndexBuild(OperationContext* opCtx, - const std::vector& specs, - const UUID& buildUUID); + void signalAbortAndWait(OperationContext* opCtx, + const UUID& buildUUID, + const std::string& reason) noexcept; /** * Waits for all index builds to stop after they have been interrupted during shutdown. @@ -443,6 +445,24 @@ protected: std::shared_ptr replState, const IndexBuildOptions& indexBuildOptions); + /** + * Cleans up a single-phase index build after a failure. + */ + void _cleanUpSinglePhaseAfterFailure(OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status); + + /** + * Cleans up a two-phase index build after a failure. + */ + void _cleanUpTwoPhaseAfterFailure(OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status); + /** * Modularizes the _indexBuildsManager calls part of _runIndexBuildInner. Throws on error. */ diff --git a/src/mongo/db/repair_database_and_check_version.cpp b/src/mongo/db/repair_database_and_check_version.cpp index e5a0f51b588..bf8d07ed074 100644 --- a/src/mongo/db/repair_database_and_check_version.cpp +++ b/src/mongo/db/repair_database_and_check_version.cpp @@ -46,6 +46,7 @@ #include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/db_raii.h" #include "mongo/db/dbhelpers.h" +#include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" #include "mongo/db/repair_database.h" @@ -265,8 +266,6 @@ void checkForCappedOplog(OperationContext* opCtx, Database* db) { void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) { auto reconcileResult = fassert(40593, storageEngine->reconcileCatalogAndIdents(opCtx)); - // TODO (SERVER-44467): Restart two-phase index builds during startup - invariant(reconcileResult.indexBuildsToRestart.empty()); if (!reconcileResult.indexesToRebuild.empty() && serverGlobalParams.indexBuildRetry) { log() << "note: restart the server with --noIndexBuildRetry " @@ -315,6 +314,12 @@ void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) { std::vector indexSpecs = entry.second.second; fassert(40592, rebuildIndexesOnCollection(opCtx, collection, indexSpecs)); } + + // Once all unfinished indexes have been rebuilt, restart any unfinished index builds. This will + // not build any indexes to completion, but rather start the background thread to build the + // index, and wait for a replicated commit or abort oplog entry. 
+ IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery( + opCtx, reconcileResult.indexBuildsToRestart); } /** diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 3b83398df30..8f345e4e023 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -213,8 +213,7 @@ Status commitIndexBuild(OperationContext* opCtx, return statusWithIndexes.getStatus(); } auto indexBuildsCoord = IndexBuildsCoordinator::get(opCtx); - indexBuildsCoord->commitIndexBuild(opCtx, statusWithIndexes.getValue(), indexBuildUUID); - indexBuildsCoord->joinIndexBuild(opCtx, indexBuildUUID); + indexBuildsCoord->signalCommitAndWait(opCtx, indexBuildUUID); return Status::OK(); } @@ -223,11 +222,10 @@ Status abortIndexBuild(OperationContext* opCtx, const Status& cause, OplogApplication::Mode mode) { // Wait until the index build finishes aborting. - IndexBuildsCoordinator::get(opCtx)->abortIndexBuildByBuildUUID( + IndexBuildsCoordinator::get(opCtx)->signalAbortAndWait( opCtx, indexBuildUUID, str::stream() << "abortIndexBuild oplog entry encountered: " << cause); - IndexBuildsCoordinator::get(opCtx)->joinIndexBuild(opCtx, indexBuildUUID); return Status::OK(); } diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp index 43039c8f48e..f1c12d1e68a 100644 --- a/src/mongo/db/repl/oplog_applier_impl.cpp +++ b/src/mongo/db/repl/oplog_applier_impl.cpp @@ -449,9 +449,17 @@ void OplogApplierImpl::_run(OplogBuffer* oplogBuffer) { // Apply the operations in this batch. '_applyOplogBatch' returns the optime of the // last op that was applied, which should be the last optime in the batch. - auto lastOpTimeAppliedInBatch = - fassertNoTrace(34437, _applyOplogBatch(&opCtx, ops.releaseBatch())); - invariant(lastOpTimeAppliedInBatch == lastOpTimeInBatch); + auto swLastOpTimeAppliedInBatch = _applyOplogBatch(&opCtx, ops.releaseBatch()); + if (swLastOpTimeAppliedInBatch.getStatus().code() == ErrorCodes::InterruptedAtShutdown) { + // If an operation was interrupted at shutdown, fail the batch without advancing + // appliedThrough as if this were an unclean shutdown. This ensures the stable timestamp + // does not advance, and a checkpoint cannot be taken at a timestamp that includes this + // batch. On startup, we will recover from an earlier stable checkpoint and apply the + // operations from this batch again. + return; + } + fassertNoTrace(34437, swLastOpTimeAppliedInBatch); + invariant(swLastOpTimeAppliedInBatch.getValue() == lastOpTimeInBatch); // Update various things that care about our last applied optime. Tests rely on 1 happening // before 2 even though it isn't strictly necessary. diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index f9797af98d2..51c5e4f96fe 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -366,6 +366,11 @@ void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx) // _taskExecutor pointer never changes. _taskExecutor->join(); + // Ensure that all writes are visible before reading. If we failed mid-batch, it would be + // possible to read from a kLastApplied ReadSource where not all writes to the minValid document + // are visible, generating a writeConflict that would not resolve. 
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx); if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() && loadLastOpTimeAndWallTimeResult.isOK() && diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index 213c3c50937..4763ddbdf13 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -456,7 +456,7 @@ StatusWith StorageEngineImpl::reconcileCatalogAn invariant(collUUID); auto buildUUID = *indexMetaData.buildUUID; - log() << "Found index from unfinished build. Collection: " << coll << "(" + log() << "Found index from unfinished build. Collection: " << coll << " (" << *collUUID << "), index: " << indexName << ", build UUID: " << buildUUID; // Insert in the map if a build has not already been registered. -- cgit v1.2.1
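
Note on the commit/abort handoff introduced above: signalCommitAndWait and signalAbortAndWait reduce to a signal-then-join pattern, where the oplog applier flips a commit-ready flag under the build's mutex, notifies the build thread, and then blocks on the build thread's shared promise so the commitIndexBuild entry is not treated as applied until the build has actually finished (and any build error is rethrown to the applier). The following is a minimal, self-contained C++17 sketch of that pattern only; IndexBuildState, runIndexBuild, and the plain integer timestamp are illustrative stand-ins, not MongoDB's actual types.

// Self-contained sketch (standard C++17 only; these names are stand-ins, not
// MongoDB's real types) of the signal-and-wait pattern used by
// signalCommitAndWait(): mark the build commit-ready, wake the build thread,
// then block on its promise so the oplog entry is not acknowledged until the
// build thread has finished.
#include <condition_variable>
#include <cstdint>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>

struct IndexBuildState {
    std::mutex mutex;
    std::condition_variable condVar;
    bool isCommitReady = false;
    std::uint64_t commitTimestamp = 0;  // stand-in for a real Timestamp
    std::promise<void> donePromise;     // fulfilled when the build thread exits
};

// Build thread: does the work, then parks until a commit signal arrives.
void runIndexBuild(IndexBuildState& state) {
    // ... scan collection, insert keys, drain side writes ...
    std::unique_lock<std::mutex> lk(state.mutex);
    state.condVar.wait(lk, [&] { return state.isCommitReady; });
    // ... mark the index ready in the catalog at state.commitTimestamp ...
    state.donePromise.set_value();      // or set_exception() on failure
}

// Oplog-applier side: signal commit, then join the build before returning.
void signalCommitAndWait(IndexBuildState& state, std::uint64_t commitTs) {
    {
        std::lock_guard<std::mutex> lk(state.mutex);
        state.isCommitReady = true;
        state.commitTimestamp = commitTs;
        state.condVar.notify_all();
    }
    state.donePromise.get_future().get();  // rethrows any build error
}

int main() {
    IndexBuildState state;
    std::thread builder(runIndexBuild, std::ref(state));
    signalCommitAndWait(state, /*commitTs=*/42);
    builder.join();
    std::cout << "index build committed at ts " << state.commitTimestamp << "\n";
}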
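
Note on the new rollback_index_build_start_abort.js test: it relies on RollbackIndexBuildsTest.makeSchedules producing every interleaving of the rollback steps with the index-build steps, which is where the "4 choose 2, or 6 schedules" comment comes from. The standalone C++ sketch below (the real helper is JavaScript and its implementation may differ) enumerates those order-preserving merges of the two sequences and prints the six schedules.

// Standalone sketch (not the actual makeSchedules helper) showing why
// interleaving two rollback ops with two index-build ops yields C(4,2) = 6
// schedules: each schedule is a merge of the two sequences that preserves the
// relative order within each sequence.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

using Schedule = std::vector<std::string>;

void interleave(const std::vector<std::string>& a, std::size_t i,
                const std::vector<std::string>& b, std::size_t j,
                Schedule& current, std::vector<Schedule>& out) {
    if (i == a.size() && j == b.size()) {
        out.push_back(current);
        return;
    }
    if (i < a.size()) {
        current.push_back(a[i]);
        interleave(a, i + 1, b, j, current, out);
        current.pop_back();
    }
    if (j < b.size()) {
        current.push_back(b[j]);
        interleave(a, i, b, j + 1, current, out);
        current.pop_back();
    }
}

int main() {
    const std::vector<std::string> rollbackOps = {"holdStableTimestamp", "transitionToRollback"};
    const std::vector<std::string> indexBuildOps = {"start", "abort"};

    std::vector<Schedule> schedules;
    Schedule current;
    interleave(rollbackOps, 0, indexBuildOps, 0, current, schedules);

    std::cout << schedules.size() << " schedules\n";  // prints "6 schedules"
    for (const auto& s : schedules) {
        for (const auto& op : s)
            std::cout << op << ' ';
        std::cout << '\n';
    }
}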