author    Vesselina Ratcheva <vesselina.ratcheva@10gen.com>  2019-04-29 18:06:07 -0400
committer Vesselina Ratcheva <vesselina.ratcheva@10gen.com>  2019-05-09 12:42:37 -0400
commit    29b29b2af6883b99f58c7a90a95f57221874214f (patch)
tree      9447cc1cf2d2d6529b31bf74bfd0f2239f3da1fe
parent    bf47260ea0cbc58d3744d8964b2eb036b9a1a19e (diff)
SERVER-40614 Make rollback errors fatal between aborting and reconstructing prepared transactions
 src/mongo/db/repl/rollback_impl.cpp      | 249
 src/mongo/db/repl/rollback_impl.h        |  20
 src/mongo/db/repl/rollback_impl_test.cpp | 141
 3 files changed, 198 insertions(+), 212 deletions(-)
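The diff below moves the rollback steps between aborting and reconstructing prepared transactions into a single `_runRollbackCriticalSection()` helper, and makes any failure in that window fatal: the caller calls `fassertFailedWithStatus(31049, ...)` on a non-OK status, and a function-level `catch (...)` calls `std::terminate()` on any escaping exception. The standalone sketch below illustrates only that error-handling shape; `Status`, `fatalAssert`, and `runCriticalSection` are illustrative stand-ins, not the MongoDB implementation.

```cpp
// Minimal, self-contained sketch of the error-handling shape this commit introduces.
// Status, fatalAssert, and runCriticalSection are stand-ins for MongoDB's Status,
// fassertFailedWithStatus, and RollbackImpl::_runRollbackCriticalSection.
#include <cstdlib>
#include <exception>
#include <iostream>
#include <string>

struct Status {
    bool ok;
    std::string reason;
};

// Stand-in for fassertFailedWithStatus: log the failure and abort the process.
[[noreturn]] void fatalAssert(int code, const Status& status) {
    std::cerr << "Fatal assertion " << code << ": " << status.reason << '\n';
    std::abort();
}

// Stand-in for the critical section. The function-level 'noexcept ... try' block
// guarantees that an exception cannot unwind out of this window; it terminates
// the process instead.
Status runCriticalSection() noexcept try {
    // ... abort prepared transactions, recover to stable, reconstruct ...
    return Status{true, ""};
} catch (...) {
    std::cerr << "Caught exception during critical section\n";
    std::terminate();
}

int main() {
    // The caller no longer returns early on error: a non-OK status is fatal,
    // mirroring fassertFailedWithStatus(31049, ...) in runRollback().
    Status status = runCriticalSection();
    if (!status.ok) {
        fatalAssert(31049, status);
    }
    std::cout << "critical section completed\n";
    return 0;
}
```

The combination of the fatal assert in the caller and the terminating catch-all means rollback can no longer exit cleanly part-way through this window, which is the invariant the commit message describes.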
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index 1b6fd2482fb..647988123a1 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -226,111 +226,22 @@ Status RollbackImpl::runRollback(OperationContext* opCtx) {
         return status;
     }
     _rollbackStats.rollbackId = _replicationProcess->getRollbackID();
+    _listener->onRollbackIDIncremented();
 
-    // Before computing record store counts, abort all active transactions. This ensures that the
-    // count adjustments are based on correct values where no prepared transactions are active and
-    // all in-memory counts have been rolled-back.
-    // Before calling recoverToStableTimestamp, we must abort the storage transaction of any
-    // prepared transaction. This will require us to scan all sessions and call
-    // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction.
-    killSessionsAbortAllPreparedTransactions(opCtx);
-
-    // Ask the record store for the pre-rollback counts of any collections whose counts will change
-    // and create a map with the adjusted counts for post-rollback. While finding the common
-    // point, we keep track of how much each collection's count will change during the rollback.
-    // Note: these numbers are relative to the common point, not the stable timestamp, and thus
-    // must be set after recovering from the oplog.
-    // TODO (SERVER-40614): This error should be fatal.
-    status = _findRecordStoreCounts(opCtx);
+    // Execute the critical section in rollback. It is illegal to exit rollback cleanly between
+    // aborting prepared transactions and reconstructing them. During this window, no interruptions
+    // are allowed and all errors should be made fatal.
+    status = _runRollbackCriticalSection(opCtx, commonPoint);
     if (!status.isOK()) {
-        return status;
+        fassertFailedWithStatus(31049, status.withContext("Error in rollback critical section"));
     }
+    _listener->onPreparedTransactionsReconstructed();
 
-    if (shouldCreateDataFiles()) {
-        // Write a rollback file for each namespace that has documents that would be deleted by
-        // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk
-        // unecessary prepare conflicts when trying to read documents that were modified by those
-        // prepared transactions, which we know we will abort anyway.
-        // TODO (SERVER-40614): This error should be fatal.
-        status = _writeRollbackFiles(opCtx);
-        if (!status.isOK()) {
-            return status;
-        }
-    } else {
-        log() << "Not writing rollback files. 'createRollbackDataFiles' set to false.";
-    }
-
-    // If there were rolled back operations on any session, invalidate all sessions.
-    // We invalidate sessions before we recover so that we avoid invalidating sessions that had
-    // just recovered prepared transactions.
-    if (_observerInfo.rollbackSessionIds.size() > 0) {
-        MongoDSessionCatalog::invalidateSessions(opCtx, boost::none);
-    }
-
-    // Recover to the stable timestamp.
-    auto stableTimestampSW = _recoverToStableTimestamp(opCtx);
-    // TODO (SERVER-40614): This error should be fatal.
-    if (!stableTimestampSW.isOK()) {
-        return stableTimestampSW.getStatus();
-    }
-    _rollbackStats.stableTimestamp = stableTimestampSW.getValue();
-    _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue());
-
-    // Log the total number of insert and update operations that have been rolled back as a result
-    // of recovering to the stable timestamp.
-    log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName]
-          << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName]
-          << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
-          << " delete operations.";
-
-    // During replication recovery, we truncate all oplog entries with timestamps greater than or
-    // equal to the oplog truncate after point. As a result, we must find the oplog entry after
-    // the common point so we do not truncate the common point itself. If we entered rollback,
-    // we are guaranteed to have at least one oplog entry after the common point.
-    Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPointSW.getValue());
-
-    // We cannot have an interrupt point between setting the oplog truncation point and fixing the
-    // record store counts or else a clean shutdown could produce incorrect counts. We explicitly
-    // check for shutdown here to safely maximize interruptibility.
-    // TODO (SERVER-40614): This interrupt point should be removed.
+    // We can now accept interruptions again.
     if (_isInShutdown()) {
         return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
     }
 
-    // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
-    // that the replication recovery logic knows where to truncate the oplog. We save this value
-    // durably to match the behavior during startup recovery. This must occur after we successfully
-    // recover to a stable timestamp. If recovering to a stable timestamp fails and we still
-    // truncate the oplog then the oplog will not match the data files. If we crash at any earlier
-    // point, we will recover, find a new sync source, and restart roll back (if necessary on the
-    // new sync source). This is safe because a crash before this point would recover to a stable
-    // checkpoint anyways at or earlier than the stable timestamp.
-    //
-    // Note that storage engine timestamp recovery only restores the database *data* to a stable
-    // timestamp, but does not revert the oplog, which must be done as part of the rollback process.
-    _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint);
-    _rollbackStats.truncateTimestamp = truncatePoint;
-    _listener->onSetOplogTruncateAfterPoint(truncatePoint);
-
-    // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those
-    // being consistent.
-    _resetDropPendingState(opCtx);
-
-    // Run the recovery process.
-    _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx,
-                                                                    stableTimestampSW.getValue());
-    _listener->onRecoverFromOplog();
-
-    // Sets the correct post-rollback counts on any collections whose counts changed during the
-    // rollback.
-    _correctRecordStoreCounts(opCtx);
-
-    // Reconstruct prepared transactions after counts have been adjusted. Since prepared
-    // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing
-    // collection counts, reconstruct the prepared transactions now, adding on any additional counts
-    // to the now corrected record store.
-    reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
-
     // At this point, the last applied and durable optimes on this node still point to ops on
     // the divergent branch of history. We therefore update the last optimes to the top of the
     // oplog, which should now be at the common point.
@@ -497,6 +408,110 @@ StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const Oplog
     return namespaces;
 }
 
+Status RollbackImpl::_runRollbackCriticalSection(
+    OperationContext* opCtx,
+    RollBackLocalOperations::RollbackCommonPoint commonPoint) noexcept try {
+    // Before computing record store counts, abort all active transactions. This ensures that
+    // the count adjustments are based on correct values where no prepared transactions are
+    // active and all in-memory counts have been rolled-back.
+    // Before calling recoverToStableTimestamp, we must abort the storage transaction of any
+    // prepared transaction. This will require us to scan all sessions and call
+    // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction.
+    killSessionsAbortAllPreparedTransactions(opCtx);
+
+    // Ask the record store for the pre-rollback counts of any collections whose counts will
+    // change and create a map with the adjusted counts for post-rollback. While finding the
+    // common point, we keep track of how much each collection's count will change during the
+    // rollback. Note: these numbers are relative to the common point, not the stable timestamp,
+    // and thus must be set after recovering from the oplog.
+    auto status = _findRecordStoreCounts(opCtx);
+    if (!status.isOK()) {
+        return status.withContext("Error while finding record store counts");
+    }
+
+    if (shouldCreateDataFiles()) {
+        // Write a rollback file for each namespace that has documents that would be deleted by
+        // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk
+        // unecessary prepare conflicts when trying to read documents that were modified by
+        // those prepared transactions, which we know we will abort anyway.
+        status = _writeRollbackFiles(opCtx);
+        if (!status.isOK()) {
+            return status.withContext("Error while writing out rollback files");
+        }
+    } else {
+        log() << "Not writing rollback files. 'createRollbackDataFiles' set to false.";
+    }
+
+    // If there were rolled back operations on any session, invalidate all sessions.
+    // We invalidate sessions before we recover so that we avoid invalidating sessions that had
+    // just recovered prepared transactions.
+    if (_observerInfo.rollbackSessionIds.size() > 0) {
+        MongoDSessionCatalog::invalidateSessions(opCtx, boost::none);
+    }
+
+    // Recover to the stable timestamp.
+    auto stableTimestampSW = _recoverToStableTimestamp(opCtx);
+    if (!stableTimestampSW.isOK()) {
+        auto status = stableTimestampSW.getStatus();
+        return status.withContext("Error while recovering to stable timestamp");
+    }
+    _rollbackStats.stableTimestamp = stableTimestampSW.getValue();
+    _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue());
+
+    // Log the total number of insert and update operations that have been rolled back as a
+    // result of recovering to the stable timestamp.
+    log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName]
+          << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName]
+          << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
+          << " delete operations.";
+
+    // During replication recovery, we truncate all oplog entries with timestamps greater than
+    // or equal to the oplog truncate after point. As a result, we must find the oplog entry
+    // after the common point so we do not truncate the common point itself. If we entered
+    // rollback, we are guaranteed to have at least one oplog entry after the common point.
+    Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPoint);
+
+    // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
+    // that the replication recovery logic knows where to truncate the oplog. We save this value
+    // durably to match the behavior during startup recovery. This must occur after we successfully
+    // recover to a stable timestamp. If recovering to a stable timestamp fails and we still
+    // truncate the oplog then the oplog will not match the data files. If we crash at any earlier
+    // point, we will recover, find a new sync source, and restart roll back (if necessary on the
+    // new sync source). This is safe because a crash before this point would recover to a stable
+    // checkpoint anyways at or earlier than the stable timestamp.
+    //
+    // Note that storage engine timestamp recovery only restores the database *data* to a stable
+    // timestamp, but does not revert the oplog, which must be done as part of the rollback process.
+    _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint);
+    _rollbackStats.truncateTimestamp = truncatePoint;
+    _listener->onSetOplogTruncateAfterPoint(truncatePoint);
+
+    // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those
+    // being consistent.
+    _resetDropPendingState(opCtx);
+
+    // Run the recovery process.
+    _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx,
+                                                                    stableTimestampSW.getValue());
+    _listener->onRecoverFromOplog();
+
+    // Sets the correct post-rollback counts on any collections whose counts changed during the
+    // rollback.
+    _correctRecordStoreCounts(opCtx);
+
+    // Reconstruct prepared transactions after counts have been adjusted. Since prepared
+    // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing
+    // collection counts, reconstruct the prepared transactions now, adding on any additional counts
+    // to the now corrected record store.
+    reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
+
+    return Status::OK();
+} catch (...) {
+    // Any exceptions here should be made fatal.
+    severe() << "Caught exception during critical section in rollback: " << exceptionToStatus();
+    std::terminate();
+}
+
 void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) {
     // This function explicitly does not check for shutdown since a clean shutdown post oplog
     // truncation is not allowed to occur until the record store counts are corrected.
@@ -572,10 +587,6 @@ void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) {
 }
 
 Status RollbackImpl::_findRecordStoreCounts(OperationContext* opCtx) {
-    // TODO (SERVER-40614): This interrupt point should be removed.
-    if (_isInShutdown()) {
-        return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
-    }
     const auto& catalog = CollectionCatalog::get(opCtx);
     auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
 
@@ -1026,20 +1037,9 @@ Status RollbackImpl::_writeRollbackFiles(OperationContext* opCtx) {
                   str::stream() << "The collection with UUID " << uuid
                                 << " is unexpectedly missing in the CollectionCatalog");
 
-        if (_isInShutdown()) {
-            log() << "Rollback shutting down; not writing rollback file for namespace " << nss->ns()
-                  << " with uuid " << uuid;
-            continue;
-        }
-
         _writeRollbackFileForNamespace(opCtx, uuid, *nss, entry.second);
     }
 
-    // TODO (SERVER-40614): This interrupt point should be removed.
-    if (_isInShutdown()) {
-        return {ErrorCodes::ShutdownInProgress, "rollback shutting down"};
-    }
-
     return Status::OK();
 }
 
@@ -1088,27 +1088,10 @@ void RollbackImpl::_writeRollbackFileForNamespace(OperationContext* opCtx,
 }
 
 StatusWith<Timestamp> RollbackImpl::_recoverToStableTimestamp(OperationContext* opCtx) {
-    // TODO (SERVER-40614): This interrupt point should be removed.
-    if (_isInShutdown()) {
-        return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
-    }
-    // Recover to the stable timestamp while holding the global exclusive lock.
-    {
-        Lock::GlobalWrite globalWrite(opCtx);
-        try {
-            auto stableTimestampSW = _storageInterface->recoverToStableTimestamp(opCtx);
-            if (!stableTimestampSW.isOK()) {
-                severe() << "RecoverToStableTimestamp failed. "
-                         << causedBy(stableTimestampSW.getStatus());
-                // TODO (SERVER-40614): fassert here instead of depending on the caller to do it
-                return {ErrorCodes::UnrecoverableRollbackError,
-                        "Recover to stable timestamp failed."};
-            }
-            return stableTimestampSW;
-        } catch (...) {
-            return exceptionToStatus();
-        }
-    }
+    // Recover to the stable timestamp while holding the global exclusive lock. This may throw,
+    // which the caller must handle.
+    Lock::GlobalWrite globalWrite(opCtx);
+    return _storageInterface->recoverToStableTimestamp(opCtx);
 }
 
 Status RollbackImpl::_triggerOpObserver(OperationContext* opCtx) {
diff --git a/src/mongo/db/repl/rollback_impl.h b/src/mongo/db/repl/rollback_impl.h
index e219d2b1fd1..ba9cbd95fa7 100644
--- a/src/mongo/db/repl/rollback_impl.h
+++ b/src/mongo/db/repl/rollback_impl.h
@@ -187,6 +187,11 @@ public:
     virtual void onCommonPointFound(Timestamp commonPoint) noexcept {}
 
     /**
+     * Function called after we have incremented the rollback ID.
+     */
+    virtual void onRollbackIDIncremented() noexcept {}
+
+    /**
      * Function called after a rollback file has been written for each namespace with inserts or
      * updates that are being rolled back.
      */
@@ -194,8 +199,9 @@ public:
 
     /**
      * Function called after we recover to the stable timestamp.
+     * NOTE: This may throw, for testing purposes.
      */
-    virtual void onRecoverToStableTimestamp(Timestamp stableTimestamp) noexcept {}
+    virtual void onRecoverToStableTimestamp(Timestamp stableTimestamp) {}
 
     /**
      * Function called after we set the oplog truncate after point.
@@ -208,6 +214,11 @@ public:
     virtual void onRecoverFromOplog() noexcept {}
 
     /**
+     * Function called after we reconstruct prepared transactions.
+     */
+    virtual void onPreparedTransactionsReconstructed() noexcept {}
+
+    /**
      * Function called after we have triggered the 'onRollback' OpObserver method.
      */
     virtual void onRollbackOpObserver(const OpObserver::RollbackObserverInfo& rbInfo) noexcept {
@@ -375,6 +386,13 @@ private:
     Status _findRecordStoreCounts(OperationContext* opCtx);
 
     /**
+     * Executes the critical section in rollback, defined as the window between aborting and
+     * reconstructing prepared transactions.
+     */
+    Status _runRollbackCriticalSection(
+        OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) noexcept;
+
+    /**
      * Sets the record store counts to be the values stored in _newCounts.
      */
     void _correctRecordStoreCounts(OperationContext* opCtx);
 
diff --git a/src/mongo/db/repl/rollback_impl_test.cpp b/src/mongo/db/repl/rollback_impl_test.cpp
index 328b1f1572d..7df54ceea37 100644
--- a/src/mongo/db/repl/rollback_impl_test.cpp
+++ b/src/mongo/db/repl/rollback_impl_test.cpp
@@ -258,6 +258,14 @@ protected:
     bool _recoveredFromOplog = false;
     stdx::function<void()> _onRecoverFromOplogFn = [this]() { _recoveredFromOplog = true; };
 
+    bool _incrementedRollbackID = false;
+    stdx::function<void()> _onRollbackIDIncrementedFn = [this]() { _incrementedRollbackID = true; };
+
+    bool _reconstructedPreparedTransactions = false;
+    stdx::function<void()> _onPreparedTransactionsReconstructedFn = [this]() {
+        _reconstructedPreparedTransactions = true;
+    };
+
     Timestamp _commonPointFound;
     stdx::function<void(Timestamp commonPoint)> _onCommonPointFoundFn =
         [this](Timestamp commonPoint) { _commonPointFound = commonPoint; };
@@ -318,11 +326,15 @@ public:
         _test->_onCommonPointFoundFn(commonPoint);
     }
 
+    void onRollbackIDIncremented() noexcept override {
+        _test->_onRollbackIDIncrementedFn();
+    }
+
     void onRollbackFileWrittenForNamespace(UUID uuid, NamespaceString nss) noexcept final {
         _test->_onRollbackFileWrittenForNamespaceFn(std::move(uuid), std::move(nss));
     }
 
-    void onRecoverToStableTimestamp(Timestamp stableTimestamp) noexcept override {
+    void onRecoverToStableTimestamp(Timestamp stableTimestamp) override {
         _test->_onRecoverToStableTimestampFn(stableTimestamp);
     }
 
@@ -334,6 +346,10 @@ public:
         _test->_onRecoverFromOplogFn();
     }
 
+    void onPreparedTransactionsReconstructed() noexcept override {
+        _test->_onPreparedTransactionsReconstructedFn();
+    }
+
     void onRollbackOpObserver(const OpObserver::RollbackObserverInfo& rbInfo) noexcept override {
         _test->_onRollbackOpObserverFn(rbInfo);
     }
@@ -584,7 +600,9 @@ TEST_F(RollbackImplTest, RollbackCallsRecoverToStableTimestamp) {
     ASSERT_EQUALS(stableTimestamp, _stableTimestamp);
 }
 
-TEST_F(RollbackImplTest, RollbackReturnsBadStatusIfRecoverToStableTimestampFails) {
+DEATH_TEST_F(RollbackImplTest,
+             RollbackFassertsIfRecoverToStableTimestampFails,
+             "Fatal assertion 31049") {
     auto op = makeOpAndRecordId(1);
     _remoteOplog->setOperations({op});
     ASSERT_OK(_insertOplogEntry(op.first));
@@ -609,24 +627,8 @@ TEST_F(RollbackImplTest, RollbackReturnsBadStatusIfRecoverToStableTimestampFails
     ASSERT_EQUALS(currTimestamp, _storageInterface->getCurrentTimestamp());
     ASSERT_EQUALS(Timestamp(), _stableTimestamp);
 
-    // Run rollback.
-    auto rollbackStatus = _rollback->runRollback(_opCtx.get());
-
-    // Make sure rollback failed with an UnrecoverableRollbackError, and didn't execute the
-    // recover to timestamp logic.
-    ASSERT_EQUALS(ErrorCodes::UnrecoverableRollbackError, rollbackStatus.code());
-    ASSERT_EQUALS(currTimestamp, _storageInterface->getCurrentTimestamp());
-    ASSERT_EQUALS(Timestamp(), _stableTimestamp);
-
-    // Make sure we transitioned back to SECONDARY state.
-    ASSERT_EQUALS(_coordinator->getMemberState(), MemberState::RS_SECONDARY);
-
-    // Don't set the truncate after point if we fail early.
-    _assertDocsInOplog(_opCtx.get(), {1, 2});
-    truncateAfterPoint =
-        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get());
-    ASSERT_EQUALS(Timestamp(), truncateAfterPoint);
-    ASSERT_EQUALS(_truncatePoint, Timestamp());
+    // Run rollback. It should fassert.
+    _rollback->runRollback(_opCtx.get()).ignore();
 }
 
 TEST_F(RollbackImplTest, RollbackReturnsBadStatusIfIncrementRollbackIDFails) {
@@ -676,41 +678,61 @@ TEST_F(RollbackImplTest, RollbackCallsRecoverFromOplog) {
     ASSERT(_recoveredFromOplog);
 }
 
-TEST_F(RollbackImplTest, RollbackSkipsRecoverFromOplogWhenShutdownDuringRTT) {
+TEST_F(RollbackImplTest,
+       RollbackCannotBeShutDownBetweenAbortingAndReconstructingPreparedTransactions) {
     auto op = makeOpAndRecordId(1);
     _remoteOplog->setOperations({op});
     ASSERT_OK(_insertOplogEntry(op.first));
     ASSERT_OK(_insertOplogEntry(makeOp(2)));
 
     _assertDocsInOplog(_opCtx.get(), {1, 2});
-    auto truncateAfterPoint =
-        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get());
-    ASSERT_EQUALS(Timestamp(), truncateAfterPoint);
-    _onRecoverToStableTimestampFn = [this](Timestamp stableTimestamp) {
-        _recoveredToStableTimestamp = true;
-        _stableTimestamp = stableTimestamp;
+    _storageInterface->setStableTimestamp(nullptr, Timestamp(1, 1));
+
+    // Called before aborting prepared transactions. We request the shutdown here.
+    _onRollbackIDIncrementedFn = [this]() {
+        _incrementedRollbackID = true;
         _rollback->shutdown();
     };
 
-    // Run rollback.
-    auto status = _rollback->runRollback(_opCtx.get());
+    // Called after reconstructing prepared transactions.
+    _onPreparedTransactionsReconstructedFn = [this]() {
+        ASSERT(_incrementedRollbackID);
+        _reconstructedPreparedTransactions = true;
+    };
 
-    // Make sure shutdown occurred before oplog recovery.
-    ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, _rollback->runRollback(_opCtx.get()));
-    ASSERT(_recoveredToStableTimestamp);
-    ASSERT_FALSE(_recoveredFromOplog);
-    ASSERT_FALSE(_coordinator->lastOpTimesWereReset());
+    // Shutting down is still allowed but it must occur after that window.
+    ASSERT_EQ(ErrorCodes::ShutdownInProgress, _rollback->runRollback(_opCtx.get()));
+    ASSERT(_incrementedRollbackID);
+    ASSERT(_reconstructedPreparedTransactions);
+}
 
-    // Make sure we transitioned back to SECONDARY state.
-    ASSERT_EQUALS(_coordinator->getMemberState(), MemberState::RS_SECONDARY);
-    ASSERT(_stableTimestamp.isNull());
+DEATH_TEST_F(RollbackImplTest,
+             RollbackUassertsAreFatalBetweenAbortingAndReconstructingPreparedTransactions,
+             "Caught exception during critical section in rollback") {
+    auto op = makeOpAndRecordId(1);
+    _remoteOplog->setOperations({op});
+    ASSERT_OK(_insertOplogEntry(op.first));
+    ASSERT_OK(_insertOplogEntry(makeOp(2)));
 
     _assertDocsInOplog(_opCtx.get(), {1, 2});
-    truncateAfterPoint =
-        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get());
-    ASSERT_EQUALS(Timestamp(), truncateAfterPoint);
-    ASSERT_EQUALS(_truncatePoint, Timestamp());
+
+    _storageInterface->setStableTimestamp(nullptr, Timestamp(1, 1));
+
+    // Called before aborting prepared transactions.
+    _onRollbackIDIncrementedFn = [this]() { _incrementedRollbackID = true; };
+
+    // Called during the critical section.
+    _onRecoverToStableTimestampFn = [this](Timestamp stableTimestamp) {
+        _recoveredToStableTimestamp = true;
+        uasserted(ErrorCodes::UnknownError, "error for test");
+    };
+
+    // Called after reconstructing prepared transactions. We should not be getting here.
+    _onPreparedTransactionsReconstructedFn = [this]() { ASSERT(false); };
+
+    // We expect to crash when we hit the exception.
+    _rollback->runRollback(_opCtx.get()).ignore();
 }
 
 TEST_F(RollbackImplTest,
@@ -1141,43 +1163,6 @@ TEST_F(RollbackImplTest, RollbackProperlySavesFilesWhenCreateCollAndInsertsAreRo
                       SimpleBSONObjComparator::kInstance.makeEqualTo()));
 }
 
-TEST_F(RollbackImplTest, RollbackStopsWritingRollbackFilesWhenShutdownIsInProgress) {
-    const auto commonOp = makeOpAndRecordId(1);
-    _remoteOplog->setOperations({commonOp});
-    ASSERT_OK(_insertOplogEntry(commonOp.first));
-    _storageInterface->setStableTimestamp(nullptr, Timestamp(1, 1));
-
-    const auto nss1 = NamespaceString("db.people");
-    const auto uuid1 = UUID::gen();
-    const auto coll1 = _initializeCollection(_opCtx.get(), uuid1, nss1);
-    const auto obj1 = BSON("_id" << 0 << "name"
-                                 << "kyle");
-    _insertDocAndGenerateOplogEntry(obj1, uuid1, nss1);
-
-    const auto nss2 = NamespaceString("db.persons");
-    const auto uuid2 = UUID::gen();
-    const auto coll2 = _initializeCollection(_opCtx.get(), uuid2, nss2);
-    const auto obj2 = BSON("_id" << 0 << "name"
-                                 << "jungsoo");
-    _insertDocAndGenerateOplogEntry(obj2, uuid2, nss2);
-
-    // Register a listener that sends rollback into shutdown.
-    std::vector<UUID> collsWithSuccessfullyWrittenDataFiles;
-    _onRollbackFileWrittenForNamespaceFn =
-        [this, &collsWithSuccessfullyWrittenDataFiles](UUID uuid, NamespaceString nss) {
-            collsWithSuccessfullyWrittenDataFiles.emplace_back(std::move(uuid));
-            _rollback->shutdown();
-        };
-
-    ASSERT_EQ(_rollback->runRollback(_opCtx.get()), ErrorCodes::ShutdownInProgress);
-
-    ASSERT_EQ(collsWithSuccessfullyWrittenDataFiles.size(), 1UL);
-    const auto& uuid = collsWithSuccessfullyWrittenDataFiles.front();
-    ASSERT(uuid == uuid1 || uuid == uuid2) << "wrote out a data file for unknown uuid " << uuid
-                                           << "; expected it to be either " << uuid1 << " or "
-                                           << uuid2;
-}
-
 DEATH_TEST_F(RollbackImplTest,
              InvariantFailureIfNamespaceIsMissingWhenWritingRollbackFiles,
              "unexpectedly missing in the CollectionCatalog") {
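The new tests above rely on MongoDB's `DEATH_TEST_F` harness to assert that rollback now crashes the process rather than returning a bad status. As a rough, self-contained illustration of the same idea using plain googletest (an assumption for the sketch only; this is not the MongoDB test framework, and the crashing function below is a stand-in for the real rollback code):

```cpp
// Sketch of verifying fatal behavior with a death test. Requires googletest.
#include <cstdlib>
#include <iostream>
#include <gtest/gtest.h>

// Stand-in for a rollback critical section that hits an error and fasserts.
void criticalSectionThatFails() {
    std::cerr << "Fatal assertion 31049: Error in rollback critical section\n";
    std::abort();
}

TEST(RollbackCriticalSectionDeathTest, FailureIsFatal) {
    // The test passes only if the statement terminates the process and the
    // stderr output matches, mirroring DEATH_TEST_F(..., "Fatal assertion 31049").
    EXPECT_DEATH(criticalSectionThatFails(), "Fatal assertion 31049");
}

int main(int argc, char** argv) {
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}
```

The death-test pattern is what lets the suite keep coverage of the error paths even though those paths no longer return a `Status` to assert on.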