author     Louis Williams <louis.williams@mongodb.com>    2019-11-21 22:36:46 +0000
committer  evergreen <evergreen@mongodb.com>               2019-11-21 22:36:46 +0000
commit     300458bb1ed89940649560e92ff84c81a4a77678 (patch)
tree       88f1e90d56f9d67b3335d7162a123d40b3d05996
parent     92d0ebf34f39f82f430869273ae751c1d97ec960 (diff)
download   mongo-300458bb1ed89940649560e92ff84c81a4a77678.tar.gz
SERVER-44467 Implement startup recovery for two-phase index builds
-rw-r--r--  etc/evergreen.yml                                                     6
-rw-r--r--  jstests/noPassthrough/backup_restore_fsync_lock.js                    3
-rw-r--r--  jstests/noPassthrough/backup_restore_rolling.js                       3
-rw-r--r--  jstests/noPassthrough/backup_restore_stop_start.js                    3
-rw-r--r--  jstests/noPassthrough/characterize_index_builds_on_restart.js         2
-rw-r--r--  jstests/noPassthrough/indexbg_shutdown.js                             3
-rw-r--r--  jstests/noPassthroughWithMongod/indexbg_restart_secondary.js         42
-rw-r--r--  jstests/replsets/libs/rollback_index_builds_test.js                   6
-rw-r--r--  jstests/replsets/rollback_index_build_start_abort.js                 20
-rw-r--r--  src/mongo/db/catalog/multi_index_block.cpp                            2
-rw-r--r--  src/mongo/db/index_builds_coordinator.cpp                           298
-rw-r--r--  src/mongo/db/index_builds_coordinator.h                              32
-rw-r--r--  src/mongo/db/repair_database_and_check_version.cpp                    9
-rw-r--r--  src/mongo/db/repl/oplog.cpp                                           6
-rw-r--r--  src/mongo/db/repl/oplog_applier_impl.cpp                             14
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp     5
-rw-r--r--  src/mongo/db/storage/storage_engine_impl.cpp                          2
17 files changed, 278 insertions(+), 178 deletions(-)
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 5303a7080a2..57c58aae543 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -10174,14 +10174,14 @@ buildvariants:
- name: compile_TG
distros:
- rhel62-large
- - name: .concurrency !.large !.ubsan !.no_txns !.stepdowns !.kill_terminate !.debug_only
+ - name: .concurrency !.large !.ubsan !.no_txns !.debug_only
- name: .jscore .common
- name: noPassthrough_gen
- name: noPassthroughWithMongod_gen
- - name: replica_sets
+ - name: .replica_sets !.encrypt !.auth
distros:
- rhel62-large
- - name: rollback_fuzzer_gen
+ - name: .rollbackfuzzer
- name: sharding_gen
- name: enterprise-rhel-62-64-bit-coverage
diff --git a/jstests/noPassthrough/backup_restore_fsync_lock.js b/jstests/noPassthrough/backup_restore_fsync_lock.js
index 57b12a6ab24..f6f720377aa 100644
--- a/jstests/noPassthrough/backup_restore_fsync_lock.js
+++ b/jstests/noPassthrough/backup_restore_fsync_lock.js
@@ -11,12 +11,9 @@
*
* Some methods for backup used in this test checkpoint the files in the dbpath. This technique will
* not work for ephemeral storage engines, as they do not store any data in the dbpath.
- * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works
- * for two-phase index builds.
* @tags: [
* requires_persistence,
* requires_replication,
- * two_phase_index_builds_unsupported,
* ]
*/
diff --git a/jstests/noPassthrough/backup_restore_rolling.js b/jstests/noPassthrough/backup_restore_rolling.js
index 358f33b0194..63f6b1a7cd2 100644
--- a/jstests/noPassthrough/backup_restore_rolling.js
+++ b/jstests/noPassthrough/backup_restore_rolling.js
@@ -11,12 +11,9 @@
*
* Some methods for backup used in this test checkpoint the files in the dbpath. This technique will
* not work for ephemeral storage engines, as they do not store any data in the dbpath.
- * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works
- * for two-phase index builds.
* @tags: [
* requires_persistence,
* requires_wiredtiger,
- * two_phase_index_builds_unsupported,
* ]
*/
diff --git a/jstests/noPassthrough/backup_restore_stop_start.js b/jstests/noPassthrough/backup_restore_stop_start.js
index 71ad88c7dd5..2598b8670ab 100644
--- a/jstests/noPassthrough/backup_restore_stop_start.js
+++ b/jstests/noPassthrough/backup_restore_stop_start.js
@@ -11,12 +11,9 @@
*
* Some methods for backup used in this test checkpoint the files in the dbpath. This technique will
* not work for ephemeral storage engines, as they do not store any data in the dbpath.
- * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works
- * for two-phase index builds.
* @tags: [
* requires_persistence,
* requires_replication,
- * two_phase_index_builds_unsupported,
* ]
*/
diff --git a/jstests/noPassthrough/characterize_index_builds_on_restart.js b/jstests/noPassthrough/characterize_index_builds_on_restart.js
index 64c2ba5fe91..2f500684201 100644
--- a/jstests/noPassthrough/characterize_index_builds_on_restart.js
+++ b/jstests/noPassthrough/characterize_index_builds_on_restart.js
@@ -99,7 +99,7 @@ function startIndexBuildOnSecondaryAndLeaveUnfinished(primaryDB, writeConcern, s
checkLog.containsWithCount(
secondaryDB,
"Index build interrupted due to \'leaveIndexBuildUnfinishedForShutdown\' " +
- "failpoint. Mimicing shutdown error code.",
+ "failpoint. Mimicking shutdown error code.",
expectedFailPointMessageCount);
// Wait until the secondary has a recovery timestamp beyond the index oplog entry. On
diff --git a/jstests/noPassthrough/indexbg_shutdown.js b/jstests/noPassthrough/indexbg_shutdown.js
index 4e9bc9cf016..a8a5781b3ed 100644
--- a/jstests/noPassthrough/indexbg_shutdown.js
+++ b/jstests/noPassthrough/indexbg_shutdown.js
@@ -4,11 +4,8 @@
* shuts down cleanly, without an fassert.
* Also confirms that killOp has no effect on the background index build on the secondary.
*
- * TODO(SERVER-44467): Remove two_phase_index_builds_unsupported tag when startup recovery works
- * for two-phase index builds.
* @tags: [
* requires_replication,
- * two_phase_index_builds_unsupported,
* ]
*/
diff --git a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js
index e12973b7ff8..462236d8c63 100644
--- a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js
+++ b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js
@@ -3,20 +3,18 @@
* Kills the secondary once the index build starts with a failpoint.
* The index build should resume when the secondary is restarted.
*
- * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works
- * for two-phase index builds.
- *
* @tags: [
* requires_persistence,
* requires_journaling,
* requires_replication,
- * two_phase_index_builds_unsupported,
* ]
*/
(function() {
'use strict';
+load('jstests/noPassthrough/libs/index_build.js');
+
// Set up replica set
var replTest = new ReplSetTest({name: 'bgIndex', nodes: 3});
var nodes = replTest.nodeList();
@@ -57,19 +55,31 @@ replTest.awaitReplication();
assert.commandWorked(secondDB.adminCommand(
{configureFailPoint: 'leaveIndexBuildUnfinishedForShutdown', mode: 'alwaysOn'}));
+
try {
- coll.createIndex({i: 1}, {background: true});
- masterDB.getLastError(2);
- assert.eq(2, coll.getIndexes().length);
-
- // Make sure all writes are durable on the secondary so that we can restart it knowing that
- // the index build will be found on startup.
- // Waiting for durable is important for both (A) the record that we started the index build
- // so it is rebuild on restart, and (B) the update to minvalid to show that we've already
- // applied the oplog entry so it isn't replayed. If (A) is present without (B), then there
- // are two ways that the index can be rebuilt on startup and this test is only for the one
- // triggered by (A).
- secondDB.adminCommand({fsync: 1});
+ if (IndexBuildTest.supportsTwoPhaseIndexBuild(master)) {
+ // Pause the index build on the secondary to wait for it to start.
+ IndexBuildTest.pauseIndexBuilds(secondDB);
+ IndexBuildTest.startIndexBuild(master, coll.getFullName(), {i: 1});
+
+ // Wait for build to start on the secondary.
+ jsTestLog("waiting for index build to start on secondary");
+ IndexBuildTest.waitForIndexBuildToStart(secondDB);
+ IndexBuildTest.resumeIndexBuilds(secondDB);
+ } else {
+ coll.createIndex({i: 1}, {background: true});
+ masterDB.getLastError(2);
+ assert.eq(2, coll.getIndexes().length);
+
+ // Make sure all writes are durable on the secondary so that we can restart it knowing that
+ // the index build will be found on startup.
+ // Waiting for durable is important for both (A) the record that we started the index build
+ // so it is rebuild on restart, and (B) the update to minvalid to show that we've already
+ // applied the oplog entry so it isn't replayed. If (A) is present without (B), then there
+ // are two ways that the index can be rebuilt on startup and this test is only for the one
+ // triggered by (A).
+ secondDB.adminCommand({fsync: 1});
+ }
} finally {
assert.commandWorked(secondDB.adminCommand(
{configureFailPoint: 'leaveIndexBuildUnfinishedForShutdown', mode: 'off'}));
diff --git a/jstests/replsets/libs/rollback_index_builds_test.js b/jstests/replsets/libs/rollback_index_builds_test.js
index f751d5c92a4..2d563454a9b 100644
--- a/jstests/replsets/libs/rollback_index_builds_test.js
+++ b/jstests/replsets/libs/rollback_index_builds_test.js
@@ -80,6 +80,12 @@ class RollbackIndexBuildsTest {
IndexBuildTest.resumeIndexBuilds(primary);
IndexBuildTest.waitForIndexBuildToStop(primaryDB, collName, "a_1");
break;
+ case "abort":
+ const opId = IndexBuildTest.getIndexBuildOpId(primaryDB, collName, "a_1");
+ assert.commandWorked(primaryDB.killOp(opId));
+ IndexBuildTest.resumeIndexBuilds(primary);
+ IndexBuildTest.waitForIndexBuildToStop(primaryDB, collName, "a_1");
+ break;
case "drop":
collection.dropIndexes(indexSpec);
break;
diff --git a/jstests/replsets/rollback_index_build_start_abort.js b/jstests/replsets/rollback_index_build_start_abort.js
new file mode 100644
index 00000000000..eb65327f838
--- /dev/null
+++ b/jstests/replsets/rollback_index_build_start_abort.js
@@ -0,0 +1,20 @@
+/**
+ * Tests different permutations of rolling-back index build start and abort oplog entries.
+ */
+(function() {
+"use strict";
+
+// for RollbackIndexBuildTest
+load('jstests/replsets/libs/rollback_index_builds_test.js');
+
+const rollbackIndexTest = new RollbackIndexBuildsTest();
+
+// Build a schedule of operations interleaving rollback and an index build.
+const rollbackOps = ["holdStableTimestamp", "transitionToRollback"];
+const indexBuildOps = ["start", "abort"];
+
+// This generates 4 choose 2, or 6 schedules.
+const schedules = RollbackIndexBuildsTest.makeSchedules(rollbackOps, indexBuildOps);
+rollbackIndexTest.runSchedules(schedules);
+rollbackIndexTest.stop();
+})();
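A quick sanity check on the schedule count (not part of the test, and assuming makeSchedules preserves the relative order within each of the two lists, as the "4 choose 2" comment implies): interleaving two ordered sequences of lengths m and n yields C(m+n, n) schedules, so

    C(4, 2) = 4! / (2! * 2!) = 6

ranging from performing both rollback operations before the index build starts, to starting and aborting the build before holdStableTimestamp runs.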
diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp
index 4b8b50a2ff9..91f4d9e383d 100644
--- a/src/mongo/db/catalog/multi_index_block.cpp
+++ b/src/mongo/db/catalog/multi_index_block.cpp
@@ -542,7 +542,7 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx,
if (MONGO_unlikely(leaveIndexBuildUnfinishedForShutdown.shouldFail())) {
log() << "Index build interrupted due to 'leaveIndexBuildUnfinishedForShutdown' failpoint. "
- "Mimicing shutdown error code.";
+ "Mimicking shutdown error code.";
return Status(
ErrorCodes::InterruptedAtShutdown,
"background index build interrupted due to failpoint. returning a shutdown error.");
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index 074ce083d53..d2f10674a28 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -428,16 +428,6 @@ Status IndexBuildsCoordinator::_startIndexBuildForRecovery(OperationContext* opC
return Status::OK();
}
-void IndexBuildsCoordinator::joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID) {
- auto replStateResult = _getIndexBuild(buildUUID);
- if (!replStateResult.isOK()) {
- return;
- }
- auto replState = replStateResult.getValue();
- auto fut = replState->sharedPromise.getFuture();
- log() << "Index build joined: " << buildUUID << ": " << fut.waitNoThrow(opCtx);
-}
-
void IndexBuildsCoordinator::waitForAllIndexBuildsToStopForShutdown() {
stdx::unique_lock<Latch> lk(_mutex);
@@ -492,16 +482,39 @@ void IndexBuildsCoordinator::abortDatabaseIndexBuilds(StringData db, const std::
dbIndexBuilds->waitUntilNoIndexBuildsRemain(lk);
}
-void IndexBuildsCoordinator::commitIndexBuild(OperationContext* opCtx,
- const std::vector<BSONObj>& specs,
- const UUID& buildUUID) {
+void IndexBuildsCoordinator::signalCommitAndWait(OperationContext* opCtx, const UUID& buildUUID) {
auto replState = uassertStatusOK(_getIndexBuild(buildUUID));
- stdx::unique_lock<Latch> lk(replState->mutex);
- replState->isCommitReady = true;
- replState->commitTimestamp = opCtx->recoveryUnit()->getCommitTimestamp();
- invariant(!replState->commitTimestamp.isNull(), buildUUID.toString());
- replState->condVar.notify_all();
+ {
+ stdx::unique_lock<Latch> lk(replState->mutex);
+ replState->isCommitReady = true;
+ replState->commitTimestamp = opCtx->recoveryUnit()->getCommitTimestamp();
+ invariant(!replState->commitTimestamp.isNull(), buildUUID.toString());
+ replState->condVar.notify_all();
+ }
+ auto fut = replState->sharedPromise.getFuture();
+ log() << "Index build joined after commit: " << buildUUID << ": " << fut.waitNoThrow(opCtx);
+
+ // Throws if there was an error building the index.
+ fut.get();
+}
+
+void IndexBuildsCoordinator::signalAbortAndWait(OperationContext* opCtx,
+ const UUID& buildUUID,
+ const std::string& reason) noexcept {
+ abortIndexBuildByBuildUUID(opCtx, buildUUID, reason);
+
+ // Because we replicate abort oplog entries for single-phase builds, it is possible to receive
+ // an abort for a non-existent index build. Abort should always succeed, so suppress the error.
+ auto replStateResult = _getIndexBuild(buildUUID);
+ if (!replStateResult.isOK()) {
+ log() << "ignoring error while aborting index build " << buildUUID << ": "
+ << replStateResult.getStatus();
+ return;
+ }
+ auto replState = replStateResult.getValue();
+ auto fut = replState->sharedPromise.getFuture();
+ log() << "Index build joined after abort: " << buildUUID << ": " << fut.waitNoThrow(opCtx);
}
void IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx,
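Both new methods follow the same signal-then-join shape: flip the shared state under the ReplIndexBuildState mutex, notify the builder thread, then block on the future backed by the builder's shared promise. A minimal standalone sketch of that pattern using plain C++ primitives (these stand-ins are not MongoDB's actual types):

    #include <condition_variable>
    #include <future>
    #include <mutex>
    #include <thread>

    // Stand-in for ReplIndexBuildState: a commit flag the builder waits on, plus a
    // promise the builder thread fulfills when it finishes (with a value or an error).
    struct BuildState {
        std::mutex mutex;
        std::condition_variable condVar;
        bool isCommitReady = false;
        std::promise<void> donePromise;
    };

    void signalCommitAndWaitSketch(BuildState& state) {
        {
            std::lock_guard<std::mutex> lk(state.mutex);
            state.isCommitReady = true;   // the real code also records the commit timestamp here
            state.condVar.notify_all();   // wake the builder thread
        }
        // Join the builder: get() rethrows any exception it stored, which mirrors
        // "throws if there was an error building the index" on the commit path.
        state.donePromise.get_future().get();
    }

    int main() {
        BuildState state;
        std::thread builder([&] {
            std::unique_lock<std::mutex> lk(state.mutex);
            state.condVar.wait(lk, [&] { return state.isCommitReady; });
            state.donePromise.set_value();   // or set_exception(...) if the build failed
        });
        signalCommitAndWaitSketch(state);
        builder.join();
    }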
@@ -1115,6 +1128,115 @@ void IndexBuildsCoordinator::_runIndexBuild(OperationContext* opCtx,
}
}
+void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterFailure(
+ OperationContext* opCtx,
+ Collection* collection,
+ std::shared_ptr<ReplIndexBuildState> replState,
+ const IndexBuildOptions& indexBuildOptions,
+ const Status& status) {
+ if (status == ErrorCodes::InterruptedAtShutdown) {
+ // Leave it as-if kill -9 happened. Startup recovery will rebuild the index.
+ _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down");
+ _indexBuildsManager.tearDownIndexBuild(
+ opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
+ return;
+ }
+
+ // If the index build was not completed successfully, we'll need to acquire some locks to
+ // clean it up.
+ UninterruptibleLockGuard noInterrupt(opCtx->lockState());
+
+ NamespaceString nss = collection->ns();
+ Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX);
+
+ if (indexBuildOptions.replSetAndNotPrimaryAtStart) {
+ // This build started and failed as a secondary. Single-phase index builds started on
+ // secondaries may not fail. Do not clean up the index build. It must remain unfinished
+ // until it is successfully rebuilt on startup.
+ fassert(31354,
+ status.withContext(str::stream() << "Index build: " << replState->buildUUID
+ << "; Database: " << replState->dbName));
+ }
+
+ Lock::CollectionLock collLock(opCtx, nss, MODE_X);
+
+ auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+ if (replCoord->getSettings().usingReplSets() && replCoord->canAcceptWritesFor(opCtx, nss)) {
+ // We are currently a primary node.
+ // TODO(SERVER-44723): Stop replicating abortIndexBuild for single-phase index builds. This
+ // is unnecessary for single-phase builds.
+ auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); };
+ _indexBuildsManager.tearDownIndexBuild(
+ opCtx, collection, replState->buildUUID, onCleanUpFn);
+ return;
+ }
+
+ // If we started the build as a primary and are now unable to accept writes, this build was
+ // aborted due to a stepdown.
+ _indexBuildsManager.tearDownIndexBuild(
+ opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
+}
+
+void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterFailure(
+ OperationContext* opCtx,
+ Collection* collection,
+ std::shared_ptr<ReplIndexBuildState> replState,
+ const IndexBuildOptions& indexBuildOptions,
+ const Status& status) {
+
+ if (status == ErrorCodes::InterruptedAtShutdown) {
+ // Leave it as-if kill -9 happened. Startup recovery will restart the index build.
+ _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down");
+ _indexBuildsManager.tearDownIndexBuild(
+ opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
+ return;
+ }
+
+ // If the index build was not completed successfully, we'll need to acquire some locks to
+ // clean it up.
+ UninterruptibleLockGuard noInterrupt(opCtx->lockState());
+
+ NamespaceString nss = collection->ns();
+ Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX);
+
+ auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+ if (replCoord->getSettings().usingReplSets() && !replCoord->canAcceptWritesFor(opCtx, nss)) {
+ // We failed this index build as a secondary node.
+
+ // Failed index builds should fatally assert on the secondary, except when the index build
+ // was stopped due to an explicit abort oplog entry or rollback.
+ if (status == ErrorCodes::IndexBuildAborted) {
+ // On a secondary, we should be able to obtain the timestamp for cleaning up the index
+ // build from the oplog entry unless the index build did not fail due to processing an
+ // abortIndexBuild oplog entry. This is the case if we were aborted due to rollback.
+ stdx::unique_lock<Latch> lk(replState->mutex);
+ invariant(replState->aborted, replState->buildUUID.toString());
+ Timestamp abortIndexBuildTimestamp = replState->abortTimestamp;
+
+ // Unlock the RSTL to avoid deadlocks with state transitions. See SERVER-42824.
+ unlockRSTLForIndexCleanup(opCtx);
+ Lock::CollectionLock collLock(opCtx, nss, MODE_X);
+
+ // TimestampBlock is a no-op if the abort timestamp is unset.
+ TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp);
+ _indexBuildsManager.tearDownIndexBuild(
+ opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
+ return;
+ }
+
+ fassert(51101,
+ status.withContext(str::stream() << "Index build: " << replState->buildUUID
+ << "; Database: " << replState->dbName));
+ }
+
+ // We are currently a primary node. Notify downstream nodes to abort their index builds with the
+ // same build UUID.
+ Lock::CollectionLock collLock(opCtx, nss, MODE_X);
+ auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); };
+ _indexBuildsManager.tearDownIndexBuild(opCtx, collection, replState->buildUUID, onCleanUpFn);
+ return;
+}
+
void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx,
std::shared_ptr<ReplIndexBuildState> replState,
const IndexBuildOptions& indexBuildOptions) {
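Condensed for orientation only: the decision these two new cleanup helpers encode, restated with illustrative enums in place of the real IndexBuildProtocol, Status codes, and replication-role checks. The stepdown/standalone quiet-teardown path is omitted, and none of these names come from the MongoDB source.

    enum class Protocol { kSinglePhase, kTwoPhase };
    enum class Role { kPrimary, kSecondary };   // stepdown/standalone paths omitted in this sketch
    enum class Failure { kInterruptedAtShutdown, kIndexBuildAborted, kOther };
    enum class Cleanup {
        kLeaveUnfinishedForStartupRecovery,  // keep the unfinished build on disk, as if kill -9
        kTearDownAndReplicateAbort,          // primary: write abortIndexBuild for downstream nodes
        kTearDownAtAbortTimestamp,           // secondary: reuse the abort oplog entry's timestamp
        kFassert                             // unrecoverable on this node
    };

    Cleanup cleanupAfterFailure(Protocol protocol, Role role, Failure failure) {
        if (failure == Failure::kInterruptedAtShutdown)
            return Cleanup::kLeaveUnfinishedForStartupRecovery;  // both protocols
        if (protocol == Protocol::kSinglePhase) {
            // Single-phase builds started on secondaries may not fail; primaries replicate the abort.
            return role == Role::kSecondary ? Cleanup::kFassert : Cleanup::kTearDownAndReplicateAbort;
        }
        if (role == Role::kSecondary) {
            // Only an explicit abortIndexBuild entry (or rollback) lets a secondary clean up.
            return failure == Failure::kIndexBuildAborted ? Cleanup::kTearDownAtAbortTimestamp
                                                          : Cleanup::kFassert;
        }
        return Cleanup::kTearDownAndReplicateAbort;
    }

    int main() {
        // Example: a two-phase build on a secondary stopped by an abortIndexBuild oplog entry.
        return cleanupAfterFailure(Protocol::kTwoPhase, Role::kSecondary, Failure::kIndexBuildAborted)
                       == Cleanup::kTearDownAtAbortTimestamp
                   ? 0
                   : 1;
    }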
@@ -1168,124 +1290,42 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx,
status = ex.toStatus();
}
- if (status == ErrorCodes::InterruptedAtShutdown) {
- // Leave it as-if kill -9 happened. This will be handled on restart.
- _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down");
-
- // On secondaries, a shutdown interruption status is part of normal operation and
- // should be suppressed, unlike other errors which should be raised to the administrator's
- // attention via a server crash. The server will attempt to recover the index build during
- // the next startup.
- // On primary and standalone nodes, the failed index build will not be replicated so it is
- // okay to propagate the shutdown error to the client.
- if (indexBuildOptions.replSetAndNotPrimaryAtStart) {
- replState->stats.numIndexesAfter = replState->stats.numIndexesBefore;
- status = Status::OK();
- }
- } else if (IndexBuildProtocol::kTwoPhase == replState->protocol) {
- // TODO (SERVER-40807): disabling the following code for the v4.2 release so it does not
- // have downstream impact.
- /*
- // Only the primary node removes the index build entry, as the secondaries will
- // replicate.
- if (!replSetAndNotPrimary) {
- auto removeStatus = removeIndexBuildEntry(opCtx, replState->buildUUID);
- if (!removeStatus.isOK()) {
- logFailure(removeStatus, nss, replState);
- uassertStatusOK(removeStatus);
- MONGO_UNREACHABLE;
- }
- }
- */
- }
-
- NamespaceString nss;
- {
- // We do not hold a collection lock here, but we are protected against the collection being
- // dropped while the index build is still registered for the collection -- until
- // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to
- // be stale just for logging purposes.
- auto collection =
- CollectionCatalog::get(opCtx).lookupCollectionByUUID(replState->collectionUUID);
- invariant(collection,
- str::stream() << "Collection with UUID " << replState->collectionUUID
- << " should exist because an index build is in progress: "
- << replState->buildUUID);
- nss = collection->ns();
-
- // If the index build was not completely successfully, we'll need to acquire some locks to
- // clean it up.
- if (!status.isOK()) {
- UninterruptibleLockGuard noInterrupt(opCtx->lockState());
-
- Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX);
-
- if (!indexBuildOptions.replSetAndNotPrimaryAtStart) {
- auto replCoord = repl::ReplicationCoordinator::get(opCtx);
- if (replCoord->getSettings().usingReplSets() &&
- replCoord->canAcceptWritesFor(opCtx, nss)) {
- // We are currently a primary node. Notify downstream nodes to abort their index
- // builds with the same build UUID.
- Lock::CollectionLock collLock(opCtx, nss, MODE_X);
- auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); };
- _indexBuildsManager.tearDownIndexBuild(
- opCtx, collection, replState->buildUUID, onCleanUpFn);
- } else {
- // This index build was aborted because we are stepping down from primary.
- unlockRSTLForIndexCleanup(opCtx);
- Lock::CollectionLock collLock(opCtx, nss, MODE_X);
- _indexBuildsManager.tearDownIndexBuild(
- opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
- }
- } else {
- // We started this index build during oplog application as a secondary node.
- Timestamp abortIndexBuildTimestamp;
- if (status == ErrorCodes::IndexBuildAborted) {
- // We are on a secondary. We should be able to obtain the timestamp for cleaning
- // up the index build from the oplog entry unless the index build did not fail
- // due to processing an abortIndexBuild oplog entry. For example, a unique index
- // key violation would result in the index build failing on the secondary.
- stdx::unique_lock<Latch> lk(replState->mutex);
- invariant(replState->aborted, replState->buildUUID.toString());
- abortIndexBuildTimestamp = replState->abortTimestamp;
- }
+ // We do not hold a collection lock here, but we are protected against the collection being
+ // dropped while the index build is still registered for the collection -- until
+ // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to
+ // be stale just for logging purposes.
+ auto collection =
+ CollectionCatalog::get(opCtx).lookupCollectionByUUID(replState->collectionUUID);
+ invariant(collection,
+ str::stream() << "Collection with UUID " << replState->collectionUUID
+ << " should exist because an index build is in progress: "
+ << replState->buildUUID);
+ NamespaceString nss = collection->ns();
- unlockRSTLForIndexCleanup(opCtx);
- Lock::CollectionLock collLock(opCtx, nss, MODE_X);
+ if (status.isOK()) {
+ _indexBuildsManager.tearDownIndexBuild(
+ opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
- // TimestampBlock is a no-op if the abort timestamp is unset.
- TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp);
- _indexBuildsManager.tearDownIndexBuild(
- opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
- }
- } else {
- _indexBuildsManager.tearDownIndexBuild(
- opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
- }
+ log() << "Index build completed successfully: " << replState->buildUUID << ": " << nss
+ << " ( " << replState->collectionUUID
+ << " ). Index specs built: " << replState->indexSpecs.size()
+ << ". Indexes in catalog before build: " << replState->stats.numIndexesBefore
+ << ". Indexes in catalog after build: " << replState->stats.numIndexesAfter;
+ return;
}
- if (!status.isOK()) {
- logFailure(status, nss, replState);
+ logFailure(status, nss, replState);
- // Failed index builds should abort secondary oplog application, except when the index build
- // was stopped due to processing an abortIndexBuild oplog entry.
- if (indexBuildOptions.replSetAndNotPrimaryAtStart) {
- if (status == ErrorCodes::IndexBuildAborted) {
- return;
- }
- fassert(51101,
- status.withContext(str::stream() << "Index build: " << replState->buildUUID
- << "; Database: " << replState->dbName));
- }
-
- uassertStatusOK(status);
- MONGO_UNREACHABLE;
+ if (IndexBuildProtocol::kSinglePhase == replState->protocol) {
+ _cleanUpSinglePhaseAfterFailure(opCtx, collection, replState, indexBuildOptions, status);
+ } else {
+ invariant(IndexBuildProtocol::kTwoPhase == replState->protocol,
+ str::stream() << replState->buildUUID);
+ _cleanUpTwoPhaseAfterFailure(opCtx, collection, replState, indexBuildOptions, status);
}
- log() << "Index build completed successfully: " << replState->buildUUID << ": " << nss << " ( "
- << replState->collectionUUID << " ). Index specs built: " << replState->indexSpecs.size()
- << ". Indexes in catalog before build: " << replState->stats.numIndexesBefore
- << ". Indexes in catalog after build: " << replState->stats.numIndexesAfter;
+ // Any error that escapes at this point is not fatal and can be handled by the caller.
+ uassertStatusOK(status);
}
void IndexBuildsCoordinator::_buildIndex(
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index 1f2b811d8a9..ab4fae20711 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -155,16 +155,18 @@ public:
const UUID& buildUUID);
/**
- * Waits for the index build identified by 'buildUUID' to complete.
+ * Signals the index build identified by 'buildUUID' to commit, and waits for its thread to
+ * complete. Throws if there were any errors building the index.
*/
- void joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID);
+ void signalCommitAndWait(OperationContext* opCtx, const UUID& buildUUID);
/**
- * Commits the index build identified by 'buildUUID'.
+ * Signals the index build identified by 'buildUUID' to abort, and waits for its thread to
+ * complete.
*/
- void commitIndexBuild(OperationContext* opCtx,
- const std::vector<BSONObj>& specs,
- const UUID& buildUUID);
+ void signalAbortAndWait(OperationContext* opCtx,
+ const UUID& buildUUID,
+ const std::string& reason) noexcept;
/**
* Waits for all index builds to stop after they have been interrupted during shutdown.
@@ -444,6 +446,24 @@ protected:
const IndexBuildOptions& indexBuildOptions);
/**
+ * Cleans up a single-phase index build after a failure.
+ */
+ void _cleanUpSinglePhaseAfterFailure(OperationContext* opCtx,
+ Collection* collection,
+ std::shared_ptr<ReplIndexBuildState> replState,
+ const IndexBuildOptions& indexBuildOptions,
+ const Status& status);
+
+ /**
+ * Cleans up a two-phase index build after a failure.
+ */
+ void _cleanUpTwoPhaseAfterFailure(OperationContext* opCtx,
+ Collection* collection,
+ std::shared_ptr<ReplIndexBuildState> replState,
+ const IndexBuildOptions& indexBuildOptions,
+ const Status& status);
+
+ /**
* Modularizes the _indexBuildsManager calls part of _runIndexBuildInner. Throws on error.
*/
void _buildIndex(OperationContext* opCtx,
diff --git a/src/mongo/db/repair_database_and_check_version.cpp b/src/mongo/db/repair_database_and_check_version.cpp
index e5a0f51b588..bf8d07ed074 100644
--- a/src/mongo/db/repair_database_and_check_version.cpp
+++ b/src/mongo/db/repair_database_and_check_version.cpp
@@ -46,6 +46,7 @@
#include "mongo/db/concurrency/write_conflict_exception.h"
#include "mongo/db/db_raii.h"
#include "mongo/db/dbhelpers.h"
+#include "mongo/db/index_builds_coordinator.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repair_database.h"
@@ -265,8 +266,6 @@ void checkForCappedOplog(OperationContext* opCtx, Database* db) {
void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) {
auto reconcileResult = fassert(40593, storageEngine->reconcileCatalogAndIdents(opCtx));
- // TODO (SERVER-44467): Restart two-phase index builds during startup
- invariant(reconcileResult.indexBuildsToRestart.empty());
if (!reconcileResult.indexesToRebuild.empty() && serverGlobalParams.indexBuildRetry) {
log() << "note: restart the server with --noIndexBuildRetry "
@@ -315,6 +314,12 @@ void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) {
std::vector<BSONObj> indexSpecs = entry.second.second;
fassert(40592, rebuildIndexesOnCollection(opCtx, collection, indexSpecs));
}
+
+ // Once all unfinished indexes have been rebuilt, restart any unfinished index builds. This will
+ // not build any indexes to completion, but rather start the background thread to build the
+ // index, and wait for a replicated commit or abort oplog entry.
+ IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery(
+ opCtx, reconcileResult.indexBuildsToRestart);
}
/**
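The hunk above only shows the call site; a rough, self-contained sketch of the startup-recovery ordering it establishes, with placeholder types and helpers (these names are illustrative and do not appear in the MongoDB source):

    #include <string>
    #include <vector>

    // Placeholder for StorageEngine::ReconcileResult.
    struct ReconcileResultSketch {
        std::vector<std::string> indexesToRebuild;      // finished indexes with missing/damaged tables
        std::vector<std::string> indexBuildsToRestart;  // unfinished two-phase builds, by build UUID
    };

    void rebuildIndexToCompletion(const std::string&) { /* foreground rebuild */ }
    void restartTwoPhaseBuild(const std::string&) { /* spawn the builder; it waits for commit/abort */ }

    void rebuildIndexesSketch(const ReconcileResultSketch& reconcileResult) {
        // 1. Indexes that were complete before shutdown are rebuilt to completion now.
        for (const auto& index : reconcileResult.indexesToRebuild)
            rebuildIndexToCompletion(index);

        // 2. New in this commit: unfinished two-phase builds are restarted, not finished.
        //    Each restart only launches the background builder; the build completes later,
        //    when oplog application delivers a replicated commitIndexBuild or abortIndexBuild.
        for (const auto& buildUUID : reconcileResult.indexBuildsToRestart)
            restartTwoPhaseBuild(buildUUID);
    }

    int main() {
        ReconcileResultSketch result;
        result.indexesToRebuild = {"a_1 on test.coll"};
        result.indexBuildsToRestart = {"<build UUID>"};
        rebuildIndexesSketch(result);
    }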
diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp
index 3b83398df30..8f345e4e023 100644
--- a/src/mongo/db/repl/oplog.cpp
+++ b/src/mongo/db/repl/oplog.cpp
@@ -213,8 +213,7 @@ Status commitIndexBuild(OperationContext* opCtx,
return statusWithIndexes.getStatus();
}
auto indexBuildsCoord = IndexBuildsCoordinator::get(opCtx);
- indexBuildsCoord->commitIndexBuild(opCtx, statusWithIndexes.getValue(), indexBuildUUID);
- indexBuildsCoord->joinIndexBuild(opCtx, indexBuildUUID);
+ indexBuildsCoord->signalCommitAndWait(opCtx, indexBuildUUID);
return Status::OK();
}
@@ -223,11 +222,10 @@ Status abortIndexBuild(OperationContext* opCtx,
const Status& cause,
OplogApplication::Mode mode) {
// Wait until the index build finishes aborting.
- IndexBuildsCoordinator::get(opCtx)->abortIndexBuildByBuildUUID(
+ IndexBuildsCoordinator::get(opCtx)->signalAbortAndWait(
opCtx,
indexBuildUUID,
str::stream() << "abortIndexBuild oplog entry encountered: " << cause);
- IndexBuildsCoordinator::get(opCtx)->joinIndexBuild(opCtx, indexBuildUUID);
return Status::OK();
}
diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp
index 43039c8f48e..f1c12d1e68a 100644
--- a/src/mongo/db/repl/oplog_applier_impl.cpp
+++ b/src/mongo/db/repl/oplog_applier_impl.cpp
@@ -449,9 +449,17 @@ void OplogApplierImpl::_run(OplogBuffer* oplogBuffer) {
// Apply the operations in this batch. '_applyOplogBatch' returns the optime of the
// last op that was applied, which should be the last optime in the batch.
- auto lastOpTimeAppliedInBatch =
- fassertNoTrace(34437, _applyOplogBatch(&opCtx, ops.releaseBatch()));
- invariant(lastOpTimeAppliedInBatch == lastOpTimeInBatch);
+ auto swLastOpTimeAppliedInBatch = _applyOplogBatch(&opCtx, ops.releaseBatch());
+ if (swLastOpTimeAppliedInBatch.getStatus().code() == ErrorCodes::InterruptedAtShutdown) {
+ // If an operation was interrupted at shutdown, fail the batch without advancing
+ // appliedThrough as if this were an unclean shutdown. This ensures the stable timestamp
+ // does not advance, and a checkpoint cannot be taken at a timestamp that includes this
+ // batch. On startup, we will recover from an earlier stable checkpoint and apply the
+ // operations from this batch again.
+ return;
+ }
+ fassertNoTrace(34437, swLastOpTimeAppliedInBatch);
+ invariant(swLastOpTimeAppliedInBatch.getValue() == lastOpTimeInBatch);
// Update various things that care about our last applied optime. Tests rely on 1 happening
// before 2 even though it isn't strictly necessary.
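A self-contained sketch of the control flow the new batch handling follows (made-up types; the real code uses StatusWith<OpTime> and fassertNoTrace): a shutdown interruption means the batch is treated as never applied, so appliedThrough and the stable timestamp stay behind it, while every other failure remains fatal.

    #include <cstdlib>
    #include <optional>

    enum class ApplyError { kNone, kInterruptedAtShutdown, kOther };

    struct BatchResult {
        ApplyError error = ApplyError::kNone;
        long long lastOpTimeApplied = -1;   // stand-in for the batch's last optime
    };

    // Returns the optime to record as appliedThrough, or nullopt if the batch must be
    // treated as never applied (so a later checkpoint cannot include it).
    std::optional<long long> handleBatchResult(const BatchResult& result) {
        if (result.error == ApplyError::kInterruptedAtShutdown) {
            // Fail the batch without advancing appliedThrough, exactly as if the node had been
            // killed mid-batch; startup recovery replays these operations from an earlier
            // stable checkpoint.
            return std::nullopt;
        }
        if (result.error != ApplyError::kNone) {
            std::abort();  // stand-in for fassertNoTrace(34437, ...): any other failure is fatal
        }
        return result.lastOpTimeApplied;
    }

    int main() {
        return handleBatchResult({ApplyError::kInterruptedAtShutdown, 42}).has_value() ? 1 : 0;
    }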
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index f9797af98d2..51c5e4f96fe 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -366,6 +366,11 @@ void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx)
// _taskExecutor pointer never changes.
_taskExecutor->join();
+ // Ensure that all writes are visible before reading. If we failed mid-batch, it would be
+ // possible to read from a kLastApplied ReadSource where not all writes to the minValid document
+ // are visible, generating a writeConflict that would not resolve.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+
auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx);
if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() &&
loadLastOpTimeAndWallTimeResult.isOK() &&
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index 213c3c50937..4763ddbdf13 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -456,7 +456,7 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn
invariant(collUUID);
auto buildUUID = *indexMetaData.buildUUID;
- log() << "Found index from unfinished build. Collection: " << coll << "("
+ log() << "Found index from unfinished build. Collection: " << coll << " ("
<< *collUUID << "), index: " << indexName << ", build UUID: " << buildUUID;
// Insert in the map if a build has not already been registered.