diff options
Diffstat (limited to 'src/mongo/db/repl/rollback_impl.cpp')
-rw-r--r-- | src/mongo/db/repl/rollback_impl.cpp | 249 |
1 files changed, 116 insertions, 133 deletions
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp index 1b6fd2482fb..647988123a1 100644 --- a/src/mongo/db/repl/rollback_impl.cpp +++ b/src/mongo/db/repl/rollback_impl.cpp @@ -226,111 +226,22 @@ Status RollbackImpl::runRollback(OperationContext* opCtx) { return status; } _rollbackStats.rollbackId = _replicationProcess->getRollbackID(); + _listener->onRollbackIDIncremented(); - // Before computing record store counts, abort all active transactions. This ensures that the - // count adjustments are based on correct values where no prepared transactions are active and - // all in-memory counts have been rolled-back. - // Before calling recoverToStableTimestamp, we must abort the storage transaction of any - // prepared transaction. This will require us to scan all sessions and call - // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction. - killSessionsAbortAllPreparedTransactions(opCtx); - - // Ask the record store for the pre-rollback counts of any collections whose counts will change - // and create a map with the adjusted counts for post-rollback. While finding the common - // point, we keep track of how much each collection's count will change during the rollback. - // Note: these numbers are relative to the common point, not the stable timestamp, and thus - // must be set after recovering from the oplog. - // TODO (SERVER-40614): This error should be fatal. - status = _findRecordStoreCounts(opCtx); + // Execute the critical section in rollback. It is illegal to exit rollback cleanly between + // aborting prepared transactions and reconstructing them. During this window, no interruptions + // are allowed and all errors should be made fatal. + status = _runRollbackCriticalSection(opCtx, commonPoint); if (!status.isOK()) { - return status; + fassertFailedWithStatus(31049, status.withContext("Error in rollback critical section")); } + _listener->onPreparedTransactionsReconstructed(); - if (shouldCreateDataFiles()) { - // Write a rollback file for each namespace that has documents that would be deleted by - // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk - // unecessary prepare conflicts when trying to read documents that were modified by those - // prepared transactions, which we know we will abort anyway. - // TODO (SERVER-40614): This error should be fatal. - status = _writeRollbackFiles(opCtx); - if (!status.isOK()) { - return status; - } - } else { - log() << "Not writing rollback files. 'createRollbackDataFiles' set to false."; - } - - // If there were rolled back operations on any session, invalidate all sessions. - // We invalidate sessions before we recover so that we avoid invalidating sessions that had - // just recovered prepared transactions. - if (_observerInfo.rollbackSessionIds.size() > 0) { - MongoDSessionCatalog::invalidateSessions(opCtx, boost::none); - } - - // Recover to the stable timestamp. - auto stableTimestampSW = _recoverToStableTimestamp(opCtx); - // TODO (SERVER-40614): This error should be fatal. - if (!stableTimestampSW.isOK()) { - return stableTimestampSW.getStatus(); - } - _rollbackStats.stableTimestamp = stableTimestampSW.getValue(); - _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue()); - - // Log the total number of insert and update operations that have been rolled back as a result - // of recovering to the stable timestamp. - log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName] - << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName] - << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName] - << " delete operations."; - - // During replication recovery, we truncate all oplog entries with timestamps greater than or - // equal to the oplog truncate after point. As a result, we must find the oplog entry after - // the common point so we do not truncate the common point itself. If we entered rollback, - // we are guaranteed to have at least one oplog entry after the common point. - Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPointSW.getValue()); - - // We cannot have an interrupt point between setting the oplog truncation point and fixing the - // record store counts or else a clean shutdown could produce incorrect counts. We explicitly - // check for shutdown here to safely maximize interruptibility. - // TODO (SERVER-40614): This interrupt point should be removed. + // We can now accept interruptions again. if (_isInShutdown()) { return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down"); } - // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so - // that the replication recovery logic knows where to truncate the oplog. We save this value - // durably to match the behavior during startup recovery. This must occur after we successfully - // recover to a stable timestamp. If recovering to a stable timestamp fails and we still - // truncate the oplog then the oplog will not match the data files. If we crash at any earlier - // point, we will recover, find a new sync source, and restart roll back (if necessary on the - // new sync source). This is safe because a crash before this point would recover to a stable - // checkpoint anyways at or earlier than the stable timestamp. - // - // Note that storage engine timestamp recovery only restores the database *data* to a stable - // timestamp, but does not revert the oplog, which must be done as part of the rollback process. - _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint); - _rollbackStats.truncateTimestamp = truncatePoint; - _listener->onSetOplogTruncateAfterPoint(truncatePoint); - - // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those - // being consistent. - _resetDropPendingState(opCtx); - - // Run the recovery process. - _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, - stableTimestampSW.getValue()); - _listener->onRecoverFromOplog(); - - // Sets the correct post-rollback counts on any collections whose counts changed during the - // rollback. - _correctRecordStoreCounts(opCtx); - - // Reconstruct prepared transactions after counts have been adjusted. Since prepared - // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing - // collection counts, reconstruct the prepared transactions now, adding on any additional counts - // to the now corrected record store. - reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering); - // At this point, the last applied and durable optimes on this node still point to ops on // the divergent branch of history. We therefore update the last optimes to the top of the // oplog, which should now be at the common point. @@ -497,6 +408,110 @@ StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const Oplog return namespaces; } +Status RollbackImpl::_runRollbackCriticalSection( + OperationContext* opCtx, + RollBackLocalOperations::RollbackCommonPoint commonPoint) noexcept try { + // Before computing record store counts, abort all active transactions. This ensures that + // the count adjustments are based on correct values where no prepared transactions are + // active and all in-memory counts have been rolled-back. + // Before calling recoverToStableTimestamp, we must abort the storage transaction of any + // prepared transaction. This will require us to scan all sessions and call + // abortPreparedTransactionForRollback() on any txnParticipant with a prepared transaction. + killSessionsAbortAllPreparedTransactions(opCtx); + + // Ask the record store for the pre-rollback counts of any collections whose counts will + // change and create a map with the adjusted counts for post-rollback. While finding the + // common point, we keep track of how much each collection's count will change during the + // rollback. Note: these numbers are relative to the common point, not the stable timestamp, + // and thus must be set after recovering from the oplog. + auto status = _findRecordStoreCounts(opCtx); + if (!status.isOK()) { + return status.withContext("Error while finding record store counts"); + } + + if (shouldCreateDataFiles()) { + // Write a rollback file for each namespace that has documents that would be deleted by + // rollback. We need to do this after aborting prepared transactions. Otherwise, we risk + // unecessary prepare conflicts when trying to read documents that were modified by + // those prepared transactions, which we know we will abort anyway. + status = _writeRollbackFiles(opCtx); + if (!status.isOK()) { + return status.withContext("Error while writing out rollback files"); + } + } else { + log() << "Not writing rollback files. 'createRollbackDataFiles' set to false."; + } + + // If there were rolled back operations on any session, invalidate all sessions. + // We invalidate sessions before we recover so that we avoid invalidating sessions that had + // just recovered prepared transactions. + if (_observerInfo.rollbackSessionIds.size() > 0) { + MongoDSessionCatalog::invalidateSessions(opCtx, boost::none); + } + + // Recover to the stable timestamp. + auto stableTimestampSW = _recoverToStableTimestamp(opCtx); + if (!stableTimestampSW.isOK()) { + auto status = stableTimestampSW.getStatus(); + return status.withContext("Error while recovering to stable timestamp"); + } + _rollbackStats.stableTimestamp = stableTimestampSW.getValue(); + _listener->onRecoverToStableTimestamp(stableTimestampSW.getValue()); + + // Log the total number of insert and update operations that have been rolled back as a + // result of recovering to the stable timestamp. + log() << "Rollback reverted " << _observerInfo.rollbackCommandCounts[kInsertCmdName] + << " insert operations, " << _observerInfo.rollbackCommandCounts[kUpdateCmdName] + << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName] + << " delete operations."; + + // During replication recovery, we truncate all oplog entries with timestamps greater than + // or equal to the oplog truncate after point. As a result, we must find the oplog entry + // after the common point so we do not truncate the common point itself. If we entered + // rollback, we are guaranteed to have at least one oplog entry after the common point. + Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPoint); + + // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so + // that the replication recovery logic knows where to truncate the oplog. We save this value + // durably to match the behavior during startup recovery. This must occur after we successfully + // recover to a stable timestamp. If recovering to a stable timestamp fails and we still + // truncate the oplog then the oplog will not match the data files. If we crash at any earlier + // point, we will recover, find a new sync source, and restart roll back (if necessary on the + // new sync source). This is safe because a crash before this point would recover to a stable + // checkpoint anyways at or earlier than the stable timestamp. + // + // Note that storage engine timestamp recovery only restores the database *data* to a stable + // timestamp, but does not revert the oplog, which must be done as part of the rollback process. + _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, truncatePoint); + _rollbackStats.truncateTimestamp = truncatePoint; + _listener->onSetOplogTruncateAfterPoint(truncatePoint); + + // Align the drop pending reaper state with what's on disk. Oplog recovery depends on those + // being consistent. + _resetDropPendingState(opCtx); + + // Run the recovery process. + _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, + stableTimestampSW.getValue()); + _listener->onRecoverFromOplog(); + + // Sets the correct post-rollback counts on any collections whose counts changed during the + // rollback. + _correctRecordStoreCounts(opCtx); + + // Reconstruct prepared transactions after counts have been adjusted. Since prepared + // transactions were aborted (i.e. the in-memory counts were rolled-back) before computing + // collection counts, reconstruct the prepared transactions now, adding on any additional counts + // to the now corrected record store. + reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering); + + return Status::OK(); +} catch (...) { + // Any exceptions here should be made fatal. + severe() << "Caught exception during critical section in rollback: " << exceptionToStatus(); + std::terminate(); +} + void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) { // This function explicitly does not check for shutdown since a clean shutdown post oplog // truncation is not allowed to occur until the record store counts are corrected. @@ -572,10 +587,6 @@ void RollbackImpl::_correctRecordStoreCounts(OperationContext* opCtx) { } Status RollbackImpl::_findRecordStoreCounts(OperationContext* opCtx) { - // TODO (SERVER-40614): This interrupt point should be removed. - if (_isInShutdown()) { - return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down"); - } const auto& catalog = CollectionCatalog::get(opCtx); auto storageEngine = opCtx->getServiceContext()->getStorageEngine(); @@ -1026,20 +1037,9 @@ Status RollbackImpl::_writeRollbackFiles(OperationContext* opCtx) { str::stream() << "The collection with UUID " << uuid << " is unexpectedly missing in the CollectionCatalog"); - if (_isInShutdown()) { - log() << "Rollback shutting down; not writing rollback file for namespace " << nss->ns() - << " with uuid " << uuid; - continue; - } - _writeRollbackFileForNamespace(opCtx, uuid, *nss, entry.second); } - // TODO (SERVER-40614): This interrupt point should be removed. - if (_isInShutdown()) { - return {ErrorCodes::ShutdownInProgress, "rollback shutting down"}; - } - return Status::OK(); } @@ -1088,27 +1088,10 @@ void RollbackImpl::_writeRollbackFileForNamespace(OperationContext* opCtx, } StatusWith<Timestamp> RollbackImpl::_recoverToStableTimestamp(OperationContext* opCtx) { - // TODO (SERVER-40614): This interrupt point should be removed. - if (_isInShutdown()) { - return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down"); - } - // Recover to the stable timestamp while holding the global exclusive lock. - { - Lock::GlobalWrite globalWrite(opCtx); - try { - auto stableTimestampSW = _storageInterface->recoverToStableTimestamp(opCtx); - if (!stableTimestampSW.isOK()) { - severe() << "RecoverToStableTimestamp failed. " - << causedBy(stableTimestampSW.getStatus()); - // TODO (SERVER-40614): fassert here instead of depending on the caller to do it - return {ErrorCodes::UnrecoverableRollbackError, - "Recover to stable timestamp failed."}; - } - return stableTimestampSW; - } catch (...) { - return exceptionToStatus(); - } - } + // Recover to the stable timestamp while holding the global exclusive lock. This may throw, + // which the caller must handle. + Lock::GlobalWrite globalWrite(opCtx); + return _storageInterface->recoverToStableTimestamp(opCtx); } Status RollbackImpl::_triggerOpObserver(OperationContext* opCtx) { |