1 files changed, 116 insertions, 133 deletions
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index 1b6fd2482fb..647988123a1 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -226,111 +226,22 @@ Status RollbackImpl::runRollback(OperationContext* opCtx) {
         return status;
     }
     _rollbackStats.rollbackId = _replicationProcess->getRollbackID();
+    _listener->onRollbackIDIncremented();
 
-    // Before computing record store counts, abort all active transactions. This ensures that the
-    // count adjustments are based on correct values where no prepared transactions are active and
-    // all in-memory counts have been rolled-back.
-    // Before calling recoverToStableTimestamp, we must abort the storage transaction of any
-    // prepared transaction. This will require us to scan all sessions and call
-    // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction.
-    killSessionsAbortAllPreparedTransactions(opCtx);
-
-    // Ask the record store for the pre-rollback counts of any collections whose counts will change
-    // and create a map with the adjusted counts for post-rollback. While finding the common
-    // point, we keep track of how much each collection's count will change during the rollback.
-    // Note: these numbers are relative to the common point, not the stable timestamp, and thus
-    // must be set after recovering from the oplog.
-    // TODO (SERVER-40614): This error should be fatal.
-    status = _findRecordStoreCounts(opCtx);
+    // Execute the critical section in rollback. It is illegal to exit rollback cleanly between
+    // aborting prepared transactions and reconstructing them. During this window, no interruptions
+    // are allowed and all errors should be made fatal.
+    status = _runRollbackCriticalSection(opCtx, commonPoint);
     if (!status.isOK()) {
-        return status;
+        fassertFailedWithStatus(31049, status.withContext("Error in rollback critical section"));
     }
+    _listener->onPreparedTransactionsReconstructed();
 
-    if (shouldCreateDataFiles()) {
-        // Write a rollback file for each namespace that has documents that would be deleted by
-        // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk
-        // unecessary prepare conflicts when trying to read documents that were modified by those
-        // prepared transactions, which we know we will abort anyway.
-        // TODO (SERVER-40614): This error should be fatal.
-        status = _writeRollbackFiles(opCtx);
-        if (!status.isOK()) {
-            return status;
-        }
-    } else {
-        log() << "Not writing rollback files. 'createRollbackDataFiles' set to false.";
-    }
-
-    // If there were rolled back operations on any session, invalidate all sessions.
-    // We invalidate sessions before we recover so that we avoid invalidating sessions that had
-    // just recovered prepared transactions.
-    if (_observerInfo.rollbackSessionIds.size() > 0) {
-        MongoDSessionCatalog::invalidateSessions(opCtx, boost::none);
-    }
-
-    // Recover to the stable timestamp.
-    auto stableTimestampSW = _recoverToStableTimestamp(opCtx);
-    // TODO (SERVER-40614): This error should be fatal.
-    if (!stableTimestampSW.isOK()) {
-        return stableTimestampSW.getStatus();
-    }
-    _rollbackStats.stableTimestamp = stableTimestampSW.getValue();
-    _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue());
-
-    // Log the total number of insert and update operations that have been rolled back as a result
-    // of recovering to the stable timestamp.
-    log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName]
-          << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName]
-          << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
-          << " delete operations.";
-
-    // During replication recovery, we truncate all oplog entries with timestamps greater than or
-    // equal to the oplog truncate after point. As a result, we must find the oplog entry after
-    // the common point so we do not truncate the common point itself. If we entered rollback,
-    // we are guaranteed to have at least one oplog entry after the common point.
-    Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPointSW.getValue());
-
-    // We cannot have an interrupt point between setting the oplog truncation point and fixing the
-    // record store counts or else a clean shutdown could produce incorrect counts. We explicitly
-    // check for shutdown here to safely maximize interruptibility.
-    // TODO (SERVER-40614): This interrupt point should be removed.
+    // We can now accept interruptions again.
     if (_isInShutdown()) {
         return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
     }
 
-    // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
-    // that the replication recovery logic knows where to truncate the oplog. We save this value
-    // durably to match the behavior during startup recovery. This must occur after we successfully
-    // recover to a stable timestamp. If recovering to a stable timestamp fails and we still
-    // truncate the oplog then the oplog will not match the data files. If we crash at any earlier
-    // point, we will recover, find a new sync source, and restart roll back (if necessary on the
-    // new sync source). This is safe because a crash before this point would recover to a stable
-    // checkpoint anyways at or earlier than the stable timestamp.
-    //
-    // Note that storage engine timestamp recovery only restores the database *data* to a stable
-    // timestamp, but does not revert the oplog, which must be done as part of the rollback process.
-    _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint);
-    _rollbackStats.truncateTimestamp = truncatePoint;
-    _listener->onSetOplogTruncateAfterPoint(truncatePoint);
-
-    // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those
-    // being consistent.
-    _resetDropPendingState(opCtx);
-
-    // Run the recovery process.
-    _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx,
-                                                                    stableTimestampSW.getValue());
-    _listener->onRecoverFromOplog();
-
-    // Sets the correct post-rollback counts on any collections whose counts changed during the
-    // rollback.
-    _correctRecordStoreCounts(opCtx);
-
-    // Reconstruct prepared transactions after counts have been adjusted. Since prepared
-    // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing
-    // collection counts, reconstruct the prepared transactions now, adding on any additional counts
-    // to the now corrected record store.
-    reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
-
     // At this point, the last applied and durable optimes on this node still point to ops on
     // the divergent branch of history. We therefore update the last optimes to the top of the
     // oplog, which should now be at the common point.
@@ -497,6 +408,110 @@ StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const Oplog
     return namespaces;
 }
 
+Status RollbackImpl::_runRollbackCriticalSection(
+    OperationContext* opCtx,
+    RollBackLocalOperations::RollbackCommonPoint commonPoint) noexcept try {
+    // Before computing record store counts, abort all active transactions. This ensures that
+    // the count adjustments are based on correct values where no prepared transactions are
+    // active and all in-memory counts have been rolled-back.
+    // Before calling recoverToStableTimestamp, we must abort the storage transaction of any
+    // prepared transaction. This will require us to scan all sessions and call
+    // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction.
+    killSessionsAbortAllPreparedTransactions(opCtx);
+
+    // Ask the record store for the pre-rollback counts of any collections whose counts will
+    // change and create a map with the adjusted counts for post-rollback. While finding the
+    // common point, we keep track of how much each collection's count will change during the
+    // rollback. Note: these numbers are relative to the common point, not the stable timestamp,
+    // and thus must be set after recovering from the oplog.
+    auto status = _findRecordStoreCounts(opCtx);
+    if (!status.isOK()) {
+        return status.withContext("Error while finding record store counts");
+    }
+
+    if (shouldCreateDataFiles()) {
+        // Write a rollback file for each namespace that has documents that would be deleted by
+        // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk
+        // unecessary prepare conflicts when trying to read documents that were modified by
+        // those prepared transactions, which we know we will abort anyway.
+        status = _writeRollbackFiles(opCtx);
+        if (!status.isOK()) {
+            return status.withContext("Error while writing out rollback files");
+        }
+    } else {
+        log() << "Not writing rollback files. 'createRollbackDataFiles' set to false.";
+    }
+
+    // If there were rolled back operations on any session, invalidate all sessions.
+    // We invalidate sessions before we recover so that we avoid invalidating sessions that had
+    // just recovered prepared transactions.
+    if (_observerInfo.rollbackSessionIds.size() > 0) {
+        MongoDSessionCatalog::invalidateSessions(opCtx, boost::none);
+    }
+
+    // Recover to the stable timestamp.
+    auto stableTimestampSW = _recoverToStableTimestamp(opCtx);
+    if (!stableTimestampSW.isOK()) {
+        auto status = stableTimestampSW.getStatus();
+        return status.withContext("Error while recovering to stable timestamp");
+    }
+    _rollbackStats.stableTimestamp = stableTimestampSW.getValue();
+    _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue());
+
+    // Log the total number of insert and update operations that have been rolled back as a
+    // result of recovering to the stable timestamp.
+    log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName]
+          << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName]
+          << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
+          << " delete operations.";
+
+    // During replication recovery, we truncate all oplog entries with timestamps greater than
+    // or equal to the oplog truncate after point. As a result, we must find the oplog entry
+    // after the common point so we do not truncate the common point itself. If we entered
+    // rollback, we are guaranteed to have at least one oplog entry after the common point.
+    Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPoint);
+
+    // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
+    // that the replication recovery logic knows where to truncate the oplog. We save this value
+    // durably to match the behavior during startup recovery. This must occur after we successfully
+    // recover to a stable timestamp. If recovering to a stable timestamp fails and we still
+    // truncate the oplog then the oplog will not match the data files. If we crash at any earlier
+    // point, we will recover, find a new sync source, and restart roll back (if necessary on the
+    // new sync source). This is safe because a crash before this point would recover to a stable
+    // checkpoint anyways at or earlier than the stable timestamp.
+    //
+    // Note that storage engine timestamp recovery only restores the database *data* to a stable
+    // timestamp, but does not revert the oplog, which must be done as part of the rollback process.
+    _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint);
+    _rollbackStats.truncateTimestamp = truncatePoint;
+    _listener->onSetOplogTruncateAfterPoint(truncatePoint);
+
+    // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those
+    // being consistent.
+    _resetDropPendingState(opCtx);
+
+    // Run the recovery process.
+    _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx,
+                                                                    stableTimestampSW.getValue());
+    _listener->onRecoverFromOplog();
+
+    // Sets the correct post-rollback counts on any collections whose counts changed during the
+    // rollback.
+    _correctRecordStoreCounts(opCtx);
+
+    // Reconstruct prepared transactions after counts have been adjusted. Since prepared
+    // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing
+    // collection counts, reconstruct the prepared transactions now, adding on any additional counts
+    // to the now corrected record store.
+    reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
+
+    return Status::OK();
+} catch (...) {
+    // Any exceptions here should be made fatal.
+    severe() << "Caught exception during critical section in rollback: " << exceptionToStatus();
+    std::terminate();
+}
+
 void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) {
     // This function explicitly does not check for shutdown since a clean shutdown post oplog
     // truncation is not allowed to occur until the record store counts are corrected.
@@ -572,10 +587,6 @@ void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) {
 }
 
 Status RollbackImpl::_findRecordStoreCounts(OperationContext* opCtx) {
-    // TODO (SERVER-40614): This interrupt point should be removed.
-    if (_isInShutdown()) {
-        return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
-    }
     const auto& catalog = CollectionCatalog::get(opCtx);
     auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
 
@@ -1026,20 +1037,9 @@ Status RollbackImpl::_writeRollbackFiles(OperationContext* opCtx) {
                   str::stream() << "The collection with UUID " << uuid
                                 << " is unexpectedly missing in the CollectionCatalog");
 
-        if (_isInShutdown()) {
-            log() << "Rollback shutting down; not writing rollback file for namespace " << nss->ns()
-                  << " with uuid " << uuid;
-            continue;
-        }
-
         _writeRollbackFileForNamespace(opCtx, uuid, *nss, entry.second);
     }
 
-    // TODO (SERVER-40614): This interrupt point should be removed.
-    if (_isInShutdown()) {
-        return {ErrorCodes::ShutdownInProgress, "rollback shutting down"};
-    }
-
     return Status::OK();
 }
 
@@ -1088,27 +1088,10 @@ void RollbackImpl::_writeRollbackFileForNamespace(OperationContext* opCtx,
 }
 
 StatusWith<Timestamp> RollbackImpl::_recoverToStableTimestamp(OperationContext* opCtx) {
-    // TODO (SERVER-40614): This interrupt point should be removed.
-    if (_isInShutdown()) {
-        return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
-    }
-    // Recover to the stable timestamp while holding the global exclusive lock.
-    {
-        Lock::GlobalWrite globalWrite(opCtx);
-        try {
-            auto stableTimestampSW = _storageInterface->recoverToStableTimestamp(opCtx);
-            if (!stableTimestampSW.isOK()) {
-                severe() << "RecoverToStableTimestamp failed. "
-                         << causedBy(stableTimestampSW.getStatus());
-                // TODO (SERVER-40614): fassert here instead of depending on the caller to do it
-                return {ErrorCodes::UnrecoverableRollbackError,
-                        "Recover to stable timestamp failed."};
-            }
-            return stableTimestampSW;
-        } catch (...) {
-            return exceptionToStatus();
-        }
-    }
+    // Recover to the stable timestamp while holding the global exclusive lock. This may throw,
+    // which the caller must handle.
+    Lock::GlobalWrite globalWrite(opCtx);
+    return _storageInterface->recoverToStableTimestamp(opCtx);
 }
 
 Status RollbackImpl::_triggerOpObserver(OperationContext* opCtx) {