From 300458bb1ed89940649560e92ff84c81a4a77678 Mon Sep 17 00:00:00 2001 From: Louis Williams Date: Thu, 21 Nov 2019 22:36:46 +0000 Subject: SERVER-44467 Implement startup recovery for two-phase index builds --- etc/evergreen.yml | 6 +- jstests/noPassthrough/backup_restore_fsync_lock.js | 3 - jstests/noPassthrough/backup_restore_rolling.js | 3 - jstests/noPassthrough/backup_restore_stop_start.js | 3 - .../characterize_index_builds_on_restart.js | 2 +- jstests/noPassthrough/indexbg_shutdown.js | 3 - .../indexbg_restart_secondary.js | 42 +-- .../replsets/libs/rollback_index_builds_test.js | 6 + .../replsets/rollback_index_build_start_abort.js | 20 ++ src/mongo/db/catalog/multi_index_block.cpp | 2 +- src/mongo/db/index_builds_coordinator.cpp | 298 ++++++++++++--------- src/mongo/db/index_builds_coordinator.h | 32 ++- src/mongo/db/repair_database_and_check_version.cpp | 9 +- src/mongo/db/repl/oplog.cpp | 6 +- src/mongo/db/repl/oplog_applier_impl.cpp | 14 +- ...replication_coordinator_external_state_impl.cpp | 5 + src/mongo/db/storage/storage_engine_impl.cpp | 2 +- 17 files changed, 278 insertions(+), 178 deletions(-) create mode 100644 jstests/replsets/rollback_index_build_start_abort.js diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 5303a7080a2..57c58aae543 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -10174,14 +10174,14 @@ buildvariants: - name: compile_TG distros: - rhel62-large - - name: .concurrency !.large !.ubsan !.no_txns !.stepdowns !.kill_terminate !.debug_only + - name: .concurrency !.large !.ubsan !.no_txns !.debug_only - name: .jscore .common - name: noPassthrough_gen - name: noPassthroughWithMongod_gen - - name: replica_sets + - name: .replica_sets !.encrypt !.auth distros: - rhel62-large - - name: rollback_fuzzer_gen + - name: .rollbackfuzzer - name: sharding_gen - name: enterprise-rhel-62-64-bit-coverage diff --git a/jstests/noPassthrough/backup_restore_fsync_lock.js b/jstests/noPassthrough/backup_restore_fsync_lock.js index 57b12a6ab24..f6f720377aa 100644 --- a/jstests/noPassthrough/backup_restore_fsync_lock.js +++ b/jstests/noPassthrough/backup_restore_fsync_lock.js @@ -11,12 +11,9 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. * @tags: [ * requires_persistence, * requires_replication, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/backup_restore_rolling.js b/jstests/noPassthrough/backup_restore_rolling.js index 358f33b0194..63f6b1a7cd2 100644 --- a/jstests/noPassthrough/backup_restore_rolling.js +++ b/jstests/noPassthrough/backup_restore_rolling.js @@ -11,12 +11,9 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. 
* @tags: [ * requires_persistence, * requires_wiredtiger, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/backup_restore_stop_start.js b/jstests/noPassthrough/backup_restore_stop_start.js index 71ad88c7dd5..2598b8670ab 100644 --- a/jstests/noPassthrough/backup_restore_stop_start.js +++ b/jstests/noPassthrough/backup_restore_stop_start.js @@ -11,12 +11,9 @@ * * Some methods for backup used in this test checkpoint the files in the dbpath. This technique will * not work for ephemeral storage engines, as they do not store any data in the dbpath. - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. * @tags: [ * requires_persistence, * requires_replication, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthrough/characterize_index_builds_on_restart.js b/jstests/noPassthrough/characterize_index_builds_on_restart.js index 64c2ba5fe91..2f500684201 100644 --- a/jstests/noPassthrough/characterize_index_builds_on_restart.js +++ b/jstests/noPassthrough/characterize_index_builds_on_restart.js @@ -99,7 +99,7 @@ function startIndexBuildOnSecondaryAndLeaveUnfinished(primaryDB, writeConcern, s checkLog.containsWithCount( secondaryDB, "Index build interrupted due to \'leaveIndexBuildUnfinishedForShutdown\' " + - "failpoint. Mimicing shutdown error code.", + "failpoint. Mimicking shutdown error code.", expectedFailPointMessageCount); // Wait until the secondary has a recovery timestamp beyond the index oplog entry. On diff --git a/jstests/noPassthrough/indexbg_shutdown.js b/jstests/noPassthrough/indexbg_shutdown.js index 4e9bc9cf016..a8a5781b3ed 100644 --- a/jstests/noPassthrough/indexbg_shutdown.js +++ b/jstests/noPassthrough/indexbg_shutdown.js @@ -4,11 +4,8 @@ * shuts down cleanly, without an fassert. * Also confirms that killOp has no effect on the background index build on the secondary. * - * TODO(SERVER-44467): Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. * @tags: [ * requires_replication, - * two_phase_index_builds_unsupported, * ] */ diff --git a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js index e12973b7ff8..462236d8c63 100644 --- a/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js +++ b/jstests/noPassthroughWithMongod/indexbg_restart_secondary.js @@ -3,20 +3,18 @@ * Kills the secondary once the index build starts with a failpoint. * The index build should resume when the secondary is restarted. * - * TODO: (SERVER-44467) Remove two_phase_index_builds_unsupported tag when startup recovery works - * for two-phase index builds. - * * @tags: [ * requires_persistence, * requires_journaling, * requires_replication, - * two_phase_index_builds_unsupported, * ] */ (function() { 'use strict'; +load('jstests/noPassthrough/libs/index_build.js'); + // Set up replica set var replTest = new ReplSetTest({name: 'bgIndex', nodes: 3}); var nodes = replTest.nodeList(); @@ -57,19 +55,31 @@ replTest.awaitReplication(); assert.commandWorked(secondDB.adminCommand( {configureFailPoint: 'leaveIndexBuildUnfinishedForShutdown', mode: 'alwaysOn'})); + try { - coll.createIndex({i: 1}, {background: true}); - masterDB.getLastError(2); - assert.eq(2, coll.getIndexes().length); - - // Make sure all writes are durable on the secondary so that we can restart it knowing that - // the index build will be found on startup. 
- // Waiting for durable is important for both (A) the record that we started the index build - // so it is rebuild on restart, and (B) the update to minvalid to show that we've already - // applied the oplog entry so it isn't replayed. If (A) is present without (B), then there - // are two ways that the index can be rebuilt on startup and this test is only for the one - // triggered by (A). - secondDB.adminCommand({fsync: 1}); + if (IndexBuildTest.supportsTwoPhaseIndexBuild(master)) { + // Pause the index build on the secondary to wait for it to start. + IndexBuildTest.pauseIndexBuilds(secondDB); + IndexBuildTest.startIndexBuild(master, coll.getFullName(), {i: 1}); + + // Wait for build to start on the secondary. + jsTestLog("waiting for index build to start on secondary"); + IndexBuildTest.waitForIndexBuildToStart(secondDB); + IndexBuildTest.resumeIndexBuilds(secondDB); + } else { + coll.createIndex({i: 1}, {background: true}); + masterDB.getLastError(2); + assert.eq(2, coll.getIndexes().length); + + // Make sure all writes are durable on the secondary so that we can restart it knowing that + // the index build will be found on startup. + // Waiting for durable is important for both (A) the record that we started the index build + // so it is rebuild on restart, and (B) the update to minvalid to show that we've already + // applied the oplog entry so it isn't replayed. If (A) is present without (B), then there + // are two ways that the index can be rebuilt on startup and this test is only for the one + // triggered by (A). + secondDB.adminCommand({fsync: 1}); + } } finally { assert.commandWorked(secondDB.adminCommand( {configureFailPoint: 'leaveIndexBuildUnfinishedForShutdown', mode: 'off'})); diff --git a/jstests/replsets/libs/rollback_index_builds_test.js b/jstests/replsets/libs/rollback_index_builds_test.js index f751d5c92a4..2d563454a9b 100644 --- a/jstests/replsets/libs/rollback_index_builds_test.js +++ b/jstests/replsets/libs/rollback_index_builds_test.js @@ -80,6 +80,12 @@ class RollbackIndexBuildsTest { IndexBuildTest.resumeIndexBuilds(primary); IndexBuildTest.waitForIndexBuildToStop(primaryDB, collName, "a_1"); break; + case "abort": + const opId = IndexBuildTest.getIndexBuildOpId(primaryDB, collName, "a_1"); + assert.commandWorked(primaryDB.killOp(opId)); + IndexBuildTest.resumeIndexBuilds(primary); + IndexBuildTest.waitForIndexBuildToStop(primaryDB, collName, "a_1"); + break; case "drop": collection.dropIndexes(indexSpec); break; diff --git a/jstests/replsets/rollback_index_build_start_abort.js b/jstests/replsets/rollback_index_build_start_abort.js new file mode 100644 index 00000000000..eb65327f838 --- /dev/null +++ b/jstests/replsets/rollback_index_build_start_abort.js @@ -0,0 +1,20 @@ +/** + * Tests different permutations of rolling-back index build start and abort oplog entries. + */ +(function() { +"use strict"; + +// for RollbackIndexBuildTest +load('jstests/replsets/libs/rollback_index_builds_test.js'); + +const rollbackIndexTest = new RollbackIndexBuildsTest(); + +// Build a schedule of operations interleaving rollback and an index build. +const rollbackOps = ["holdStableTimestamp", "transitionToRollback"]; +const indexBuildOps = ["start", "abort"]; + +// This generates 4 choose 2, or 6 schedules. 
+const schedules = RollbackIndexBuildsTest.makeSchedules(rollbackOps, indexBuildOps); +rollbackIndexTest.runSchedules(schedules); +rollbackIndexTest.stop(); +})(); diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp index 4b8b50a2ff9..91f4d9e383d 100644 --- a/src/mongo/db/catalog/multi_index_block.cpp +++ b/src/mongo/db/catalog/multi_index_block.cpp @@ -542,7 +542,7 @@ Status MultiIndexBlock::insertAllDocumentsInCollection(OperationContext* opCtx, if (MONGO_unlikely(leaveIndexBuildUnfinishedForShutdown.shouldFail())) { log() << "Index build interrupted due to 'leaveIndexBuildUnfinishedForShutdown' failpoint. " - "Mimicing shutdown error code."; + "Mimicking shutdown error code."; return Status( ErrorCodes::InterruptedAtShutdown, "background index build interrupted due to failpoint. returning a shutdown error."); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 074ce083d53..d2f10674a28 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -428,16 +428,6 @@ Status IndexBuildsCoordinator::_startIndexBuildForRecovery(OperationContext* opC return Status::OK(); } -void IndexBuildsCoordinator::joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID) { - auto replStateResult = _getIndexBuild(buildUUID); - if (!replStateResult.isOK()) { - return; - } - auto replState = replStateResult.getValue(); - auto fut = replState->sharedPromise.getFuture(); - log() << "Index build joined: " << buildUUID << ": " << fut.waitNoThrow(opCtx); -} - void IndexBuildsCoordinator::waitForAllIndexBuildsToStopForShutdown() { stdx::unique_lock lk(_mutex); @@ -492,16 +482,39 @@ void IndexBuildsCoordinator::abortDatabaseIndexBuilds(StringData db, const std:: dbIndexBuilds->waitUntilNoIndexBuildsRemain(lk); } -void IndexBuildsCoordinator::commitIndexBuild(OperationContext* opCtx, - const std::vector& specs, - const UUID& buildUUID) { +void IndexBuildsCoordinator::signalCommitAndWait(OperationContext* opCtx, const UUID& buildUUID) { auto replState = uassertStatusOK(_getIndexBuild(buildUUID)); - stdx::unique_lock lk(replState->mutex); - replState->isCommitReady = true; - replState->commitTimestamp = opCtx->recoveryUnit()->getCommitTimestamp(); - invariant(!replState->commitTimestamp.isNull(), buildUUID.toString()); - replState->condVar.notify_all(); + { + stdx::unique_lock lk(replState->mutex); + replState->isCommitReady = true; + replState->commitTimestamp = opCtx->recoveryUnit()->getCommitTimestamp(); + invariant(!replState->commitTimestamp.isNull(), buildUUID.toString()); + replState->condVar.notify_all(); + } + auto fut = replState->sharedPromise.getFuture(); + log() << "Index build joined after commit: " << buildUUID << ": " << fut.waitNoThrow(opCtx); + + // Throws if there was an error building the index. + fut.get(); +} + +void IndexBuildsCoordinator::signalAbortAndWait(OperationContext* opCtx, + const UUID& buildUUID, + const std::string& reason) noexcept { + abortIndexBuildByBuildUUID(opCtx, buildUUID, reason); + + // Because we replicate abort oplog entries for single-phase builds, it is possible to receive + // an abort for a non-existent index build. Abort should always succeed, so suppress the error. 
+ auto replStateResult = _getIndexBuild(buildUUID); + if (!replStateResult.isOK()) { + log() << "ignoring error while aborting index build " << buildUUID << ": " + << replStateResult.getStatus(); + return; + } + auto replState = replStateResult.getValue(); + auto fut = replState->sharedPromise.getFuture(); + log() << "Index build joined after abort: " << buildUUID << ": " << fut.waitNoThrow(opCtx); } void IndexBuildsCoordinator::abortIndexBuildByBuildUUID(OperationContext* opCtx, @@ -1115,6 +1128,115 @@ void IndexBuildsCoordinator::_runIndexBuild(OperationContext* opCtx, } } +void IndexBuildsCoordinator::_cleanUpSinglePhaseAfterFailure( + OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status) { + if (status == ErrorCodes::InterruptedAtShutdown) { + // Leave it as-if kill -9 happened. Startup recovery will rebuild the index. + _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down"); + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + return; + } + + // If the index build was not completed successfully, we'll need to acquire some locks to + // clean it up. + UninterruptibleLockGuard noInterrupt(opCtx->lockState()); + + NamespaceString nss = collection->ns(); + Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); + + if (indexBuildOptions.replSetAndNotPrimaryAtStart) { + // This build started and failed as a secondary. Single-phase index builds started on + // secondaries may not fail. Do not clean up the index build. It must remain unfinished + // until it is successfully rebuilt on startup. + fassert(31354, + status.withContext(str::stream() << "Index build: " << replState->buildUUID + << "; Database: " << replState->dbName)); + } + + Lock::CollectionLock collLock(opCtx, nss, MODE_X); + + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + if (replCoord->getSettings().usingReplSets() && replCoord->canAcceptWritesFor(opCtx, nss)) { + // We are currently a primary node. + // TODO(SERVER-44723): Stop replicating abortIndexBuild for single-phase index builds. This + // is unnecessary for single-phase builds. + auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, onCleanUpFn); + return; + } + + // If we started the build as a primary and are now unable to accept writes, this build was + // aborted due to a stepdown. + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); +} + +void IndexBuildsCoordinator::_cleanUpTwoPhaseAfterFailure( + OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status) { + + if (status == ErrorCodes::InterruptedAtShutdown) { + // Leave it as-if kill -9 happened. Startup recovery will restart the index build. + _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down"); + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + return; + } + + // If the index build was not completed successfully, we'll need to acquire some locks to + // clean it up. 
+ UninterruptibleLockGuard noInterrupt(opCtx->lockState()); + + NamespaceString nss = collection->ns(); + Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); + + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + if (replCoord->getSettings().usingReplSets() && !replCoord->canAcceptWritesFor(opCtx, nss)) { + // We failed this index build as a secondary node. + + // Failed index builds should fatally assert on the secondary, except when the index build + // was stopped due to an explicit abort oplog entry or rollback. + if (status == ErrorCodes::IndexBuildAborted) { + // On a secondary, we should be able to obtain the timestamp for cleaning up the index + // build from the oplog entry unless the index build did not fail due to processing an + // abortIndexBuild oplog entry. This is the case if we were aborted due to rollback. + stdx::unique_lock lk(replState->mutex); + invariant(replState->aborted, replState->buildUUID.toString()); + Timestamp abortIndexBuildTimestamp = replState->abortTimestamp; + + // Unlock the RSTL to avoid deadlocks with state transitions. See SERVER-42824. + unlockRSTLForIndexCleanup(opCtx); + Lock::CollectionLock collLock(opCtx, nss, MODE_X); + + // TimestampBlock is a no-op if the abort timestamp is unset. + TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp); + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); + return; + } + + fassert(51101, + status.withContext(str::stream() << "Index build: " << replState->buildUUID + << "; Database: " << replState->dbName)); + } + + // We are currently a primary node. Notify downstream nodes to abort their index builds with the + // same build UUID. + Lock::CollectionLock collLock(opCtx, nss, MODE_X); + auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; + _indexBuildsManager.tearDownIndexBuild(opCtx, collection, replState->buildUUID, onCleanUpFn); + return; +} + void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, std::shared_ptr replState, const IndexBuildOptions& indexBuildOptions) { @@ -1168,124 +1290,42 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, status = ex.toStatus(); } - if (status == ErrorCodes::InterruptedAtShutdown) { - // Leave it as-if kill -9 happened. This will be handled on restart. - _indexBuildsManager.interruptIndexBuild(opCtx, replState->buildUUID, "shutting down"); - - // On secondaries, a shutdown interruption status is part of normal operation and - // should be suppressed, unlike other errors which should be raised to the administrator's - // attention via a server crash. The server will attempt to recover the index build during - // the next startup. - // On primary and standalone nodes, the failed index build will not be replicated so it is - // okay to propagate the shutdown error to the client. - if (indexBuildOptions.replSetAndNotPrimaryAtStart) { - replState->stats.numIndexesAfter = replState->stats.numIndexesBefore; - status = Status::OK(); - } - } else if (IndexBuildProtocol::kTwoPhase == replState->protocol) { - // TODO (SERVER-40807): disabling the following code for the v4.2 release so it does not - // have downstream impact. - /* - // Only the primary node removes the index build entry, as the secondaries will - // replicate. 
- if (!replSetAndNotPrimary) { - auto removeStatus = removeIndexBuildEntry(opCtx, replState->buildUUID); - if (!removeStatus.isOK()) { - logFailure(removeStatus, nss, replState); - uassertStatusOK(removeStatus); - MONGO_UNREACHABLE; - } - } - */ - } - - NamespaceString nss; - { - // We do not hold a collection lock here, but we are protected against the collection being - // dropped while the index build is still registered for the collection -- until - // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to - // be stale just for logging purposes. - auto collection = - CollectionCatalog::get(opCtx).lookupCollectionByUUID(replState->collectionUUID); - invariant(collection, - str::stream() << "Collection with UUID " << replState->collectionUUID - << " should exist because an index build is in progress: " - << replState->buildUUID); - nss = collection->ns(); - - // If the index build was not completely successfully, we'll need to acquire some locks to - // clean it up. - if (!status.isOK()) { - UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - - Lock::DBLock dbLock(opCtx, nss.db(), MODE_IX); - - if (!indexBuildOptions.replSetAndNotPrimaryAtStart) { - auto replCoord = repl::ReplicationCoordinator::get(opCtx); - if (replCoord->getSettings().usingReplSets() && - replCoord->canAcceptWritesFor(opCtx, nss)) { - // We are currently a primary node. Notify downstream nodes to abort their index - // builds with the same build UUID. - Lock::CollectionLock collLock(opCtx, nss, MODE_X); - auto onCleanUpFn = [&] { onAbortIndexBuild(opCtx, nss, *replState, status); }; - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, onCleanUpFn); - } else { - // This index build was aborted because we are stepping down from primary. - unlockRSTLForIndexCleanup(opCtx); - Lock::CollectionLock collLock(opCtx, nss, MODE_X); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } - } else { - // We started this index build during oplog application as a secondary node. - Timestamp abortIndexBuildTimestamp; - if (status == ErrorCodes::IndexBuildAborted) { - // We are on a secondary. We should be able to obtain the timestamp for cleaning - // up the index build from the oplog entry unless the index build did not fail - // due to processing an abortIndexBuild oplog entry. For example, a unique index - // key violation would result in the index build failing on the secondary. - stdx::unique_lock lk(replState->mutex); - invariant(replState->aborted, replState->buildUUID.toString()); - abortIndexBuildTimestamp = replState->abortTimestamp; - } + // We do not hold a collection lock here, but we are protected against the collection being + // dropped while the index build is still registered for the collection -- until + // tearDownIndexBuild is called. The collection can be renamed, but it is OK for the name to + // be stale just for logging purposes. 
+ auto collection = + CollectionCatalog::get(opCtx).lookupCollectionByUUID(replState->collectionUUID); + invariant(collection, + str::stream() << "Collection with UUID " << replState->collectionUUID + << " should exist because an index build is in progress: " + << replState->buildUUID); + NamespaceString nss = collection->ns(); - unlockRSTLForIndexCleanup(opCtx); - Lock::CollectionLock collLock(opCtx, nss, MODE_X); + if (status.isOK()) { + _indexBuildsManager.tearDownIndexBuild( + opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - // TimestampBlock is a no-op if the abort timestamp is unset. - TimestampBlock tsBlock(opCtx, abortIndexBuildTimestamp); - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } - } else { - _indexBuildsManager.tearDownIndexBuild( - opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); - } + log() << "Index build completed successfully: " << replState->buildUUID << ": " << nss + << " ( " << replState->collectionUUID + << " ). Index specs built: " << replState->indexSpecs.size() + << ". Indexes in catalog before build: " << replState->stats.numIndexesBefore + << ". Indexes in catalog after build: " << replState->stats.numIndexesAfter; + return; } - if (!status.isOK()) { - logFailure(status, nss, replState); + logFailure(status, nss, replState); - // Failed index builds should abort secondary oplog application, except when the index build - // was stopped due to processing an abortIndexBuild oplog entry. - if (indexBuildOptions.replSetAndNotPrimaryAtStart) { - if (status == ErrorCodes::IndexBuildAborted) { - return; - } - fassert(51101, - status.withContext(str::stream() << "Index build: " << replState->buildUUID - << "; Database: " << replState->dbName)); - } - - uassertStatusOK(status); - MONGO_UNREACHABLE; + if (IndexBuildProtocol::kSinglePhase == replState->protocol) { + _cleanUpSinglePhaseAfterFailure(opCtx, collection, replState, indexBuildOptions, status); + } else { + invariant(IndexBuildProtocol::kTwoPhase == replState->protocol, + str::stream() << replState->buildUUID); + _cleanUpTwoPhaseAfterFailure(opCtx, collection, replState, indexBuildOptions, status); } - log() << "Index build completed successfully: " << replState->buildUUID << ": " << nss << " ( " - << replState->collectionUUID << " ). Index specs built: " << replState->indexSpecs.size() - << ". Indexes in catalog before build: " << replState->stats.numIndexesBefore - << ". Indexes in catalog after build: " << replState->stats.numIndexesAfter; + // Any error that escapes at this point is not fatal and can be handled by the caller. + uassertStatusOK(status); } void IndexBuildsCoordinator::_buildIndex( diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h index 1f2b811d8a9..ab4fae20711 100644 --- a/src/mongo/db/index_builds_coordinator.h +++ b/src/mongo/db/index_builds_coordinator.h @@ -155,16 +155,18 @@ public: const UUID& buildUUID); /** - * Waits for the index build identified by 'buildUUID' to complete. + * Signals the index build identified by 'buildUUID' to commit, and waits for its thread to + * complete. Throws if there were any errors building the index. */ - void joinIndexBuild(OperationContext* opCtx, const UUID& buildUUID); + void signalCommitAndWait(OperationContext* opCtx, const UUID& buildUUID); /** - * Commits the index build identified by 'buildUUID'. 
+ * Signals the index build identified by 'buildUUID' to abort, and waits for its thread to + * complete. */ - void commitIndexBuild(OperationContext* opCtx, - const std::vector& specs, - const UUID& buildUUID); + void signalAbortAndWait(OperationContext* opCtx, + const UUID& buildUUID, + const std::string& reason) noexcept; /** * Waits for all index builds to stop after they have been interrupted during shutdown. @@ -443,6 +445,24 @@ protected: std::shared_ptr replState, const IndexBuildOptions& indexBuildOptions); + /** + * Cleans up a single-phase index build after a failure. + */ + void _cleanUpSinglePhaseAfterFailure(OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status); + + /** + * Cleans up a two-phase index build after a failure. + */ + void _cleanUpTwoPhaseAfterFailure(OperationContext* opCtx, + Collection* collection, + std::shared_ptr replState, + const IndexBuildOptions& indexBuildOptions, + const Status& status); + /** * Modularizes the _indexBuildsManager calls part of _runIndexBuildInner. Throws on error. */ diff --git a/src/mongo/db/repair_database_and_check_version.cpp b/src/mongo/db/repair_database_and_check_version.cpp index e5a0f51b588..bf8d07ed074 100644 --- a/src/mongo/db/repair_database_and_check_version.cpp +++ b/src/mongo/db/repair_database_and_check_version.cpp @@ -46,6 +46,7 @@ #include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/db_raii.h" #include "mongo/db/dbhelpers.h" +#include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" #include "mongo/db/repair_database.h" @@ -265,8 +266,6 @@ void checkForCappedOplog(OperationContext* opCtx, Database* db) { void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) { auto reconcileResult = fassert(40593, storageEngine->reconcileCatalogAndIdents(opCtx)); - // TODO (SERVER-44467): Restart two-phase index builds during startup - invariant(reconcileResult.indexBuildsToRestart.empty()); if (!reconcileResult.indexesToRebuild.empty() && serverGlobalParams.indexBuildRetry) { log() << "note: restart the server with --noIndexBuildRetry " @@ -315,6 +314,12 @@ void rebuildIndexes(OperationContext* opCtx, StorageEngine* storageEngine) { std::vector indexSpecs = entry.second.second; fassert(40592, rebuildIndexesOnCollection(opCtx, collection, indexSpecs)); } + + // Once all unfinished indexes have been rebuilt, restart any unfinished index builds. This will + // not build any indexes to completion, but rather start the background thread to build the + // index, and wait for a replicated commit or abort oplog entry. 
+ IndexBuildsCoordinator::get(opCtx)->restartIndexBuildsForRecovery( + opCtx, reconcileResult.indexBuildsToRestart); } /** diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 3b83398df30..8f345e4e023 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -213,8 +213,7 @@ Status commitIndexBuild(OperationContext* opCtx, return statusWithIndexes.getStatus(); } auto indexBuildsCoord = IndexBuildsCoordinator::get(opCtx); - indexBuildsCoord->commitIndexBuild(opCtx, statusWithIndexes.getValue(), indexBuildUUID); - indexBuildsCoord->joinIndexBuild(opCtx, indexBuildUUID); + indexBuildsCoord->signalCommitAndWait(opCtx, indexBuildUUID); return Status::OK(); } @@ -223,11 +222,10 @@ Status abortIndexBuild(OperationContext* opCtx, const Status& cause, OplogApplication::Mode mode) { // Wait until the index build finishes aborting. - IndexBuildsCoordinator::get(opCtx)->abortIndexBuildByBuildUUID( + IndexBuildsCoordinator::get(opCtx)->signalAbortAndWait( opCtx, indexBuildUUID, str::stream() << "abortIndexBuild oplog entry encountered: " << cause); - IndexBuildsCoordinator::get(opCtx)->joinIndexBuild(opCtx, indexBuildUUID); return Status::OK(); } diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp index 43039c8f48e..f1c12d1e68a 100644 --- a/src/mongo/db/repl/oplog_applier_impl.cpp +++ b/src/mongo/db/repl/oplog_applier_impl.cpp @@ -449,9 +449,17 @@ void OplogApplierImpl::_run(OplogBuffer* oplogBuffer) { // Apply the operations in this batch. '_applyOplogBatch' returns the optime of the // last op that was applied, which should be the last optime in the batch. - auto lastOpTimeAppliedInBatch = - fassertNoTrace(34437, _applyOplogBatch(&opCtx, ops.releaseBatch())); - invariant(lastOpTimeAppliedInBatch == lastOpTimeInBatch); + auto swLastOpTimeAppliedInBatch = _applyOplogBatch(&opCtx, ops.releaseBatch()); + if (swLastOpTimeAppliedInBatch.getStatus().code() == ErrorCodes::InterruptedAtShutdown) { + // If an operation was interrupted at shutdown, fail the batch without advancing + // appliedThrough as if this were an unclean shutdown. This ensures the stable timestamp + // does not advance, and a checkpoint cannot be taken at a timestamp that includes this + // batch. On startup, we will recover from an earlier stable checkpoint and apply the + // operations from this batch again. + return; + } + fassertNoTrace(34437, swLastOpTimeAppliedInBatch); + invariant(swLastOpTimeAppliedInBatch.getValue() == lastOpTimeInBatch); // Update various things that care about our last applied optime. Tests rely on 1 happening // before 2 even though it isn't strictly necessary. diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index f9797af98d2..51c5e4f96fe 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -366,6 +366,11 @@ void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx) // _taskExecutor pointer never changes. _taskExecutor->join(); + // Ensure that all writes are visible before reading. If we failed mid-batch, it would be + // possible to read from a kLastApplied ReadSource where not all writes to the minValid document + // are visible, generating a writeConflict that would not resolve. 
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx); if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() && loadLastOpTimeAndWallTimeResult.isOK() && diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index 213c3c50937..4763ddbdf13 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -456,7 +456,7 @@ StatusWith StorageEngineImpl::reconcileCatalogAn invariant(collUUID); auto buildUUID = *indexMetaData.buildUUID; - log() << "Found index from unfinished build. Collection: " << coll << "(" + log() << "Found index from unfinished build. Collection: " << coll << " (" << *collUUID << "), index: " << indexName << ", build UUID: " << buildUUID; // Insert in the map if a build has not already been registered. -- cgit v1.2.1
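
Note on the commit/abort handoff introduced above: signalCommitAndWait and signalAbortAndWait reduce to a signal-then-join pattern, where the oplog applier flips a commit-ready flag under the build's mutex, notifies the build thread, and then blocks on the build thread's shared promise so the commitIndexBuild entry is not treated as applied until the build has actually finished (and any build error is rethrown to the applier). The following is a minimal, self-contained C++17 sketch of that pattern only; IndexBuildState, runIndexBuild, and the plain integer timestamp are illustrative stand-ins, not MongoDB's actual types.

// Self-contained sketch (standard C++17 only; these names are stand-ins, not
// MongoDB's real types) of the signal-and-wait pattern used by
// signalCommitAndWait(): mark the build commit-ready, wake the build thread,
// then block on its promise so the oplog entry is not acknowledged until the
// build thread has finished.
#include <condition_variable>
#include <cstdint>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>

struct IndexBuildState {
    std::mutex mutex;
    std::condition_variable condVar;
    bool isCommitReady = false;
    std::uint64_t commitTimestamp = 0;  // stand-in for a real Timestamp
    std::promise<void> donePromise;     // fulfilled when the build thread exits
};

// Build thread: does the work, then parks until a commit signal arrives.
void runIndexBuild(IndexBuildState& state) {
    // ... scan collection, insert keys, drain side writes ...
    std::unique_lock<std::mutex> lk(state.mutex);
    state.condVar.wait(lk, [&] { return state.isCommitReady; });
    // ... mark the index ready in the catalog at state.commitTimestamp ...
    state.donePromise.set_value();      // or set_exception() on failure
}

// Oplog-applier side: signal commit, then join the build before returning.
void signalCommitAndWait(IndexBuildState& state, std::uint64_t commitTs) {
    {
        std::lock_guard<std::mutex> lk(state.mutex);
        state.isCommitReady = true;
        state.commitTimestamp = commitTs;
        state.condVar.notify_all();
    }
    state.donePromise.get_future().get();  // rethrows any build error
}

int main() {
    IndexBuildState state;
    std::thread builder(runIndexBuild, std::ref(state));
    signalCommitAndWait(state, /*commitTs=*/42);
    builder.join();
    std::cout << "index build committed at ts " << state.commitTimestamp << "\n";
}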
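
Note on the new rollback_index_build_start_abort.js test: it relies on RollbackIndexBuildsTest.makeSchedules producing every interleaving of the rollback steps with the index-build steps, which is where the "4 choose 2, or 6 schedules" comment comes from. The standalone C++ sketch below (the real helper is JavaScript and its implementation may differ) enumerates those order-preserving merges of the two sequences and prints the six schedules.

// Standalone sketch (not the actual makeSchedules helper) showing why
// interleaving two rollback ops with two index-build ops yields C(4,2) = 6
// schedules: each schedule is a merge of the two sequences that preserves the
// relative order within each sequence.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

using Schedule = std::vector<std::string>;

void interleave(const std::vector<std::string>& a, std::size_t i,
                const std::vector<std::string>& b, std::size_t j,
                Schedule& current, std::vector<Schedule>& out) {
    if (i == a.size() && j == b.size()) {
        out.push_back(current);
        return;
    }
    if (i < a.size()) {
        current.push_back(a[i]);
        interleave(a, i + 1, b, j, current, out);
        current.pop_back();
    }
    if (j < b.size()) {
        current.push_back(b[j]);
        interleave(a, i, b, j + 1, current, out);
        current.pop_back();
    }
}

int main() {
    const std::vector<std::string> rollbackOps = {"holdStableTimestamp", "transitionToRollback"};
    const std::vector<std::string> indexBuildOps = {"start", "abort"};

    std::vector<Schedule> schedules;
    Schedule current;
    interleave(rollbackOps, 0, indexBuildOps, 0, current, schedules);

    std::cout << schedules.size() << " schedules\n";  // prints "6 schedules"
    for (const auto& s : schedules) {
        for (const auto& op : s)
            std::cout << op << ' ';
        std::cout << '\n';
    }
}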