summaryrefslogtreecommitdiff
path: root/src/mongo/db/repl/rollback_impl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/repl/rollback_impl.cpp')
-rw-r--r--src/mongo/db/repl/rollback_impl.cpp249
1 files changed, 116 insertions, 133 deletions
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index 1b6fd2482fb..647988123a1 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -226,111 +226,22 @@ Status RollbackImpl::runRollback(OperationContext* opCtx) {
return status;
}
_rollbackStats.rollbackId = _replicationProcess->getRollbackID();
+ _listener->onRollbackIDIncremented();
- // Before computing record store counts, abort all active transactions. This ensures that the
- // count adjustments are based on correct values where no prepared transactions are active and
- // all in-memory counts have been rolled-back.
- // Before calling recoverToStableTimestamp, we must abort the storage transaction of any
- // prepared transaction. This will require us to scan all sessions and call
- // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction.
- killSessionsAbortAllPreparedTransactions(opCtx);
-
- // Ask the record store for the pre-rollback counts of any collections whose counts will change
- // and create a map with the adjusted counts for post-rollback. While finding the common
- // point, we keep track of how much each collection's count will change during the rollback.
- // Note: these numbers are relative to the common point, not the stable timestamp, and thus
- // must be set after recovering from the oplog.
- // TODO (SERVER-40614): This error should be fatal.
- status = _findRecordStoreCounts(opCtx);
+ // Execute the critical section in rollback. It is illegal to exit rollback cleanly between
+ // aborting prepared transactions and reconstructing them. During this window, no interruptions
+ // are allowed and all errors should be made fatal.
+ status = _runRollbackCriticalSection(opCtx, commonPoint);
if (!status.isOK()) {
- return status;
+ fassertFailedWithStatus(31049, status.withContext("Error in rollback critical section"));
}
+ _listener->onPreparedTransactionsReconstructed();
- if (shouldCreateDataFiles()) {
- // Write a rollback file for each namespace that has documents that would be deleted by
- // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk
- // unecessary prepare conflicts when trying to read documents that were modified by those
- // prepared transactions, which we know we will abort anyway.
- // TODO (SERVER-40614): This error should be fatal.
- status = _writeRollbackFiles(opCtx);
- if (!status.isOK()) {
- return status;
- }
- } else {
- log() << "Not writing rollback files. 'createRollbackDataFiles' set to false.";
- }
-
- // If there were rolled back operations on any session, invalidate all sessions.
- // We invalidate sessions before we recover so that we avoid invalidating sessions that had
- // just recovered prepared transactions.
- if (_observerInfo.rollbackSessionIds.size() > 0) {
- MongoDSessionCatalog::invalidateSessions(opCtx, boost::none);
- }
-
- // Recover to the stable timestamp.
- auto stableTimestampSW = _recoverToStableTimestamp(opCtx);
- // TODO (SERVER-40614): This error should be fatal.
- if (!stableTimestampSW.isOK()) {
- return stableTimestampSW.getStatus();
- }
- _rollbackStats.stableTimestamp = stableTimestampSW.getValue();
- _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue());
-
- // Log the total number of insert and update operations that have been rolled back as a result
- // of recovering to the stable timestamp.
- log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName]
- << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName]
- << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
- << " delete operations.";
-
- // During replication recovery, we truncate all oplog entries with timestamps greater than or
- // equal to the oplog truncate after point. As a result, we must find the oplog entry after
- // the common point so we do not truncate the common point itself. If we entered rollback,
- // we are guaranteed to have at least one oplog entry after the common point.
- Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPointSW.getValue());
-
- // We cannot have an interrupt point between setting the oplog truncation point and fixing the
- // record store counts or else a clean shutdown could produce incorrect counts. We explicitly
- // check for shutdown here to safely maximize interruptibility.
- // TODO (SERVER-40614): This interrupt point should be removed.
+ // We can now accept interruptions again.
if (_isInShutdown()) {
return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
}
- // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
- // that the replication recovery logic knows where to truncate the oplog. We save this value
- // durably to match the behavior during startup recovery. This must occur after we successfully
- // recover to a stable timestamp. If recovering to a stable timestamp fails and we still
- // truncate the oplog then the oplog will not match the data files. If we crash at any earlier
- // point, we will recover, find a new sync source, and restart roll back (if necessary on the
- // new sync source). This is safe because a crash before this point would recover to a stable
- // checkpoint anyways at or earlier than the stable timestamp.
- //
- // Note that storage engine timestamp recovery only restores the database *data* to a stable
- // timestamp, but does not revert the oplog, which must be done as part of the rollback process.
- _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint);
- _rollbackStats.truncateTimestamp = truncatePoint;
- _listener->onSetOplogTruncateAfterPoint(truncatePoint);
-
- // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those
- // being consistent.
- _resetDropPendingState(opCtx);
-
- // Run the recovery process.
- _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx,
- stableTimestampSW.getValue());
- _listener->onRecoverFromOplog();
-
- // Sets the correct post-rollback counts on any collections whose counts changed during the
- // rollback.
- _correctRecordStoreCounts(opCtx);
-
- // Reconstruct prepared transactions after counts have been adjusted. Since prepared
- // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing
- // collection counts, reconstruct the prepared transactions now, adding on any additional counts
- // to the now corrected record store.
- reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
-
// At this point, the last applied and durable optimes on this node still point to ops on
// the divergent branch of history. We therefore update the last optimes to the top of the
// oplog, which should now be at the common point.
@@ -497,6 +408,110 @@ StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const Oplog
return namespaces;
}
+Status RollbackImpl::_runRollbackCriticalSection(
+ OperationContext* opCtx,
+ RollBackLocalOperations::RollbackCommonPoint commonPoint) noexcept try {
+ // Before computing record store counts, abort all active transactions. This ensures that
+ // the count adjustments are based on correct values where no prepared transactions are
+ // active and all in-memory counts have been rolled-back.
+ // Before calling recoverToStableTimestamp, we must abort the storage transaction of any
+ // prepared transaction. This will require us to scan all sessions and call
+ // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction.
+ killSessionsAbortAllPreparedTransactions(opCtx);
+
+ // Ask the record store for the pre-rollback counts of any collections whose counts will
+ // change and create a map with the adjusted counts for post-rollback. While finding the
+ // common point, we keep track of how much each collection's count will change during the
+ // rollback. Note: these numbers are relative to the common point, not the stable timestamp,
+ // and thus must be set after recovering from the oplog.
+ auto status = _findRecordStoreCounts(opCtx);
+ if (!status.isOK()) {
+ return status.withContext("Error while finding record store counts");
+ }
+
+ if (shouldCreateDataFiles()) {
+ // Write a rollback file for each namespace that has documents that would be deleted by
+ // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk
+        // unnecessary prepare conflicts when trying to read documents that were modified by
+ // those prepared transactions, which we know we will abort anyway.
+ status = _writeRollbackFiles(opCtx);
+ if (!status.isOK()) {
+ return status.withContext("Error while writing out rollback files");
+ }
+ } else {
+ log() << "Not writing rollback files. 'createRollbackDataFiles' set to false.";
+ }
+
+ // If there were rolled back operations on any session, invalidate all sessions.
+ // We invalidate sessions before we recover so that we avoid invalidating sessions that had
+ // just recovered prepared transactions.
+ if (_observerInfo.rollbackSessionIds.size() > 0) {
+ MongoDSessionCatalog::invalidateSessions(opCtx, boost::none);
+ }
+
+ // Recover to the stable timestamp.
+ auto stableTimestampSW = _recoverToStableTimestamp(opCtx);
+ if (!stableTimestampSW.isOK()) {
+ auto status = stableTimestampSW.getStatus();
+ return status.withContext("Error while recovering to stable timestamp");
+ }
+ _rollbackStats.stableTimestamp = stableTimestampSW.getValue();
+ _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue());
+
+ // Log the total number of insert and update operations that have been rolled back as a
+ // result of recovering to the stable timestamp.
+ log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName]
+ << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName]
+ << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
+ << " delete operations.";
+
+ // During replication recovery, we truncate all oplog entries with timestamps greater than
+ // or equal to the oplog truncate after point. As a result, we must find the oplog entry
+ // after the common point so we do not truncate the common point itself. If we entered
+ // rollback, we are guaranteed to have at least one oplog entry after the common point.
+ Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPoint);
+
+ // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
+ // that the replication recovery logic knows where to truncate the oplog. We save this value
+ // durably to match the behavior during startup recovery. This must occur after we successfully
+ // recover to a stable timestamp. If recovering to a stable timestamp fails and we still
+ // truncate the oplog then the oplog will not match the data files. If we crash at any earlier
+ // point, we will recover, find a new sync source, and restart roll back (if necessary on the
+ // new sync source). This is safe because a crash before this point would recover to a stable
+    // checkpoint anyway at or earlier than the stable timestamp.
+ //
+ // Note that storage engine timestamp recovery only restores the database *data* to a stable
+ // timestamp, but does not revert the oplog, which must be done as part of the rollback process.
+ _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint);
+ _rollbackStats.truncateTimestamp = truncatePoint;
+ _listener->onSetOplogTruncateAfterPoint(truncatePoint);
+
+ // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those
+ // being consistent.
+ _resetDropPendingState(opCtx);
+
+ // Run the recovery process.
+ _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx,
+ stableTimestampSW.getValue());
+ _listener->onRecoverFromOplog();
+
+ // Sets the correct post-rollback counts on any collections whose counts changed during the
+ // rollback.
+ _correctRecordStoreCounts(opCtx);
+
+ // Reconstruct prepared transactions after counts have been adjusted. Since prepared
+ // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing
+ // collection counts, reconstruct the prepared transactions now, adding on any additional counts
+ // to the now corrected record store.
+ reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
+
+ return Status::OK();
+} catch (...) {
+ // Any exceptions here should be made fatal.
+ severe() << "Caught exception during critical section in rollback: " << exceptionToStatus();
+ std::terminate();
+}
+
void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) {
// This function explicitly does not check for shutdown since a clean shutdown post oplog
// truncation is not allowed to occur until the record store counts are corrected.
@@ -572,10 +587,6 @@ void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) {
}
Status RollbackImpl::_findRecordStoreCounts(OperationContext* opCtx) {
- // TODO (SERVER-40614): This interrupt point should be removed.
- if (_isInShutdown()) {
- return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
- }
const auto& catalog = CollectionCatalog::get(opCtx);
auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
@@ -1026,20 +1037,9 @@ Status RollbackImpl::_writeRollbackFiles(OperationContext* opCtx) {
str::stream() << "The collection with UUID " << uuid
<< " is unexpectedly missing in the CollectionCatalog");
- if (_isInShutdown()) {
- log() << "Rollback shutting down; not writing rollback file for namespace " << nss->ns()
- << " with uuid " << uuid;
- continue;
- }
-
_writeRollbackFileForNamespace(opCtx, uuid, *nss, entry.second);
}
- // TODO (SERVER-40614): This interrupt point should be removed.
- if (_isInShutdown()) {
- return {ErrorCodes::ShutdownInProgress, "rollback shutting down"};
- }
-
return Status::OK();
}
@@ -1088,27 +1088,10 @@ void RollbackImpl::_writeRollbackFileForNamespace(OperationContext* opCtx,
}
StatusWith<Timestamp> RollbackImpl::_recoverToStableTimestamp(OperationContext* opCtx) {
- // TODO (SERVER-40614): This interrupt point should be removed.
- if (_isInShutdown()) {
- return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
- }
- // Recover to the stable timestamp while holding the global exclusive lock.
- {
- Lock::GlobalWrite globalWrite(opCtx);
- try {
- auto stableTimestampSW = _storageInterface->recoverToStableTimestamp(opCtx);
- if (!stableTimestampSW.isOK()) {
- severe() << "RecoverToStableTimestamp failed. "
- << causedBy(stableTimestampSW.getStatus());
- // TODO (SERVER-40614): fassert here instead of depending on the caller to do it
- return {ErrorCodes::UnrecoverableRollbackError,
- "Recover to stable timestamp failed."};
- }
- return stableTimestampSW;
- } catch (...) {
- return exceptionToStatus();
- }
- }
+ // Recover to the stable timestamp while holding the global exclusive lock. This may throw,
+ // which the caller must handle.
+ Lock::GlobalWrite globalWrite(opCtx);
+ return _storageInterface->recoverToStableTimestamp(opCtx);
}
Status RollbackImpl::_triggerOpObserver(OperationContext* opCtx) {