author     Judah Schvimer <judah@mongodb.com>    2018-03-12 14:47:40 -0400
committer  Judah Schvimer <judah@mongodb.com>    2018-03-12 14:47:40 -0400
commit     b1102c617e04ff751d702435f9d4521727e579e1 (patch)
tree       0878af2c87cc855f833393c4323b623e6e1b3e31 /src/mongo/db/repl/replication_recovery.cpp
parent     ccaad4fb968b8a21a697c00362de5bb618bbb184 (diff)
SERVER-33292 Have storage dictate where replication recovery should begin playing oplog from
Diffstat (limited to 'src/mongo/db/repl/replication_recovery.cpp')
-rw-r--r--  src/mongo/db/repl/replication_recovery.cpp  176
1 file changed, 81 insertions, 95 deletions
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 1c2cef4dadd..6e676942929 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -42,122 +42,101 @@
namespace mongo {
namespace repl {
-namespace {
-
-/**
- * Returns the timestamp at which we should start oplog application. Returns boost::none if
- * there are no oplog entries to apply.
- */
-boost::optional<Timestamp> _getOplogApplicationStartPoint(Timestamp checkpointTimestamp,
-                                                          OpTime appliedThrough) {
-    if (!checkpointTimestamp.isNull() && !appliedThrough.isNull()) {
-        // In versions that support "recover to stable timestamp" you should never see a
-        // non-null appliedThrough in a checkpoint, since we never take checkpoints in the middle
-        // of a secondary batch application, and a node that does not support "recover to stable
-        // timestamp" should never see a non-null checkpointTimestamp.
-        severe() << "checkpointTimestamp (" << checkpointTimestamp.toBSON()
-                 << ") and appliedThrough (" << appliedThrough << ") cannot both be non-null.";
-        fassertFailedNoTrace(40603);
-
-    } else if (!checkpointTimestamp.isNull()) {
-        // If appliedThrough is null and the checkpointTimestamp is not null, then we recovered
-        // to a checkpoint and should use that checkpoint timestamp as the oplog application
-        // start point.
-        log() << "Starting recovery oplog application at the checkpointTimestamp: "
-              << checkpointTimestamp.toBSON();
-        return checkpointTimestamp;
-
-    } else if (!appliedThrough.isNull()) {
-        // If the checkpointTimestamp is null and the appliedThrough is not null, then we did not
-        // recover to a checkpoint and we should use the appliedThrough as the oplog application
-        // start point.
-        log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough;
-        return appliedThrough.getTimestamp();
-
-    } else {
-        log() << "No oplog entries to apply for recovery. appliedThrough and "
-                 "checkpointTimestamp are both null.";
-        // No follow-up work to do.
-        return boost::none;
-    }
-    MONGO_UNREACHABLE;
-}
-
-} // namespace
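
Note: the removed helper above reduces to a three-way decision table over the two
markers. A minimal standalone sketch of that table (toy types, not MongoDB source;
0 stands in for a "null" timestamp):

#include <cstdlib>
#include <optional>

using Timestamp = unsigned long long;  // toy stand-in for mongo::Timestamp

std::optional<Timestamp> oldStartPoint(Timestamp checkpointTimestamp, Timestamp appliedThrough) {
    if (checkpointTimestamp != 0 && appliedThrough != 0)
        std::abort();                // fassertFailedNoTrace(40603): never both non-null
    if (checkpointTimestamp != 0)
        return checkpointTimestamp;  // recovered to a checkpoint
    if (appliedThrough != 0)
        return appliedThrough;       // resume where batch application stopped
    return std::nullopt;             // nothing to apply
}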
-
ReplicationRecoveryImpl::ReplicationRecoveryImpl(StorageInterface* storageInterface,
                                                 ReplicationConsistencyMarkers* consistencyMarkers)
    : _storageInterface(storageInterface), _consistencyMarkers(consistencyMarkers) {}
-void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try {
+void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
+                                               boost::optional<Timestamp> stableTimestamp) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }
    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
-    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
-
    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);
-    }
-    // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
-    // erroneously.
-    _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
-
-    // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist.
-    _consistencyMarkers->removeOldOplogDeleteFromPointField(opCtx);
-
-    auto topOfOplogSW = _getLastAppliedOpTime(opCtx);
-    boost::optional<OpTime> topOfOplog = boost::none;
-    if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty &&
-        topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) {
-        fassert(40290, topOfOplogSW);
-        topOfOplog = topOfOplogSW.getValue();
+        // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
+        // erroneously.
+        _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
    }
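
Note: the ordering inside the branch above matters: truncate first, clear the marker
second. A toy sketch of why that is crash-safe (assumed semantics, not MongoDB source;
the inclusivity of the truncate follows the "starting at" log line, and 0 models an
unset marker):

#include <cstdint>
#include <map>

// Re-running this after a crash between the two steps merely repeats the
// (idempotent) truncate; clearing the marker first could skip a needed truncate.
void recoverTruncate(std::map<std::uint64_t, int>& oplog, std::uint64_t& truncateAfterPoint) {
    if (truncateAfterPoint == 0)
        return;
    oplog.erase(oplog.lower_bound(truncateAfterPoint), oplog.end());
    truncateAfterPoint = 0;  // cleared only once the truncate has completed
}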
-    // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the
-    // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we should
-    // set the initial data timestamp to the top OpTime in the oplog once the data is consistent
-    // there. If there is nothing in the oplog, then we do not set the initial data timestamp.
-    auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx);
-    if (!checkpointTimestamp.isNull()) {
-
-        // If we have a checkpoint timestamp, we set the initial data timestamp now so that
-        // the operations we apply below can be given the proper timestamps.
-        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(), checkpointTimestamp);
-    }
-
-    // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a
-    // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is
-    // nothing to set it to.
-    if (!topOfOplog) {
-        log() << "No oplog entries to apply for recovery. Oplog is empty.";
+    auto topOfOplogSW = _getTopOfOplog(opCtx);
+    if (topOfOplogSW.getStatus() == ErrorCodes::CollectionIsEmpty ||
+        topOfOplogSW.getStatus() == ErrorCodes::NamespaceNotFound) {
+        // Oplog is empty. There are no oplog entries to apply, so we exit recovery and go into
+        // initial sync.
+        log() << "No oplog entries to apply for recovery. Oplog is empty. Entering initial sync.";
        return;
    }
+    fassert(40290, topOfOplogSW);
+    const auto topOfOplog = topOfOplogSW.getValue();
-    if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) {
-        _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp());
+    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
+    invariant(!stableTimestamp || appliedThrough.isNull() ||
+                  *stableTimestamp == appliedThrough.getTimestamp(),
+              str::stream() << "Stable timestamp " << stableTimestamp->toString()
+                            << " does not equal appliedThrough timestamp "
+                            << appliedThrough.toString());
+
+    // If we were passed in a stable timestamp, we are in rollback recovery and should recover from
+    // that stable timestamp. Otherwise, we're recovering at startup. If this storage engine
+    // supports recover to stable timestamp, we ask it for the recovery timestamp. If the storage
+    // engine returns a timestamp, we recover from that point. However, if the storage engine
+    // returns "none", the storage engine does not have a stable checkpoint and we must recover from
+    // an unstable checkpoint instead.
+    const bool supportsRecoverToStableTimestamp =
+        _storageInterface->supportsRecoverToStableTimestamp(opCtx->getServiceContext());
+    if (!stableTimestamp && supportsRecoverToStableTimestamp) {
+        stableTimestamp = _storageInterface->getRecoveryTimestamp(opCtx->getServiceContext());
    }
-    // If we don't have a checkpoint timestamp, then we are either not running a storage engine
-    // that supports "recover to stable timestamp" or we just upgraded from a version that didn't.
-    // In both cases, the data on disk is not consistent until we have applied all oplog entries to
-    // the end of the oplog, since we do not know which ones actually got applied before shutdown.
-    // As a result, we do not set the initial data timestamp until after we have applied to the end
-    // of the oplog.
-    if (checkpointTimestamp.isNull()) {
-        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
-                                                   topOfOplog->getTimestamp());
+    if (stableTimestamp) {
+        invariant(supportsRecoverToStableTimestamp);
+        _recoverFromStableTimestamp(opCtx, *stableTimestamp, appliedThrough, topOfOplog);
+    } else {
+        _recoverFromUnstableCheckpoint(opCtx, appliedThrough, topOfOplog);
    }
-
} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
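
Note: the new control flow above boils down to one question: where does the stable
timestamp come from? A minimal sketch of that selection (toy types and assumed names,
not MongoDB source):

#include <optional>

using Timestamp = unsigned long long;

enum class Source { kStableTimestamp, kUnstableCheckpoint };

// stable: supplied by the caller during rollback recovery, empty at startup.
// engineStable: what a "recover to stable timestamp" storage engine reports
// when it actually has a stable checkpoint.
Source chooseRecoverySource(std::optional<Timestamp>& stable,
                            bool supportsRecoverToStable,
                            std::optional<Timestamp> engineStable) {
    if (!stable && supportsRecoverToStable)
        stable = engineStable;  // startup recovery: ask the storage engine
    return stable ? Source::kStableTimestamp : Source::kUnstableCheckpoint;
}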
+void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCtx,
+                                                          Timestamp stableTimestamp,
+                                                          OpTime appliedThrough,
+                                                          OpTime topOfOplog) {
+    invariant(!stableTimestamp.isNull());
+    invariant(!topOfOplog.isNull());
+    log() << "Recovering from stable timestamp: " << stableTimestamp
+          << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough << ")";
+
+    log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp;
+    _applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp());
+}
+
+void ReplicationRecoveryImpl::_recoverFromUnstableCheckpoint(OperationContext* opCtx,
+                                                             OpTime appliedThrough,
+                                                             OpTime topOfOplog) {
+    invariant(!topOfOplog.isNull());
+    log() << "Recovering from an unstable checkpoint (top of oplog: " << topOfOplog
+          << ", appliedThrough: " << appliedThrough << ")";
+
+    if (appliedThrough.isNull()) {
+        // The appliedThrough would be null if we shut down cleanly or crashed as a primary. Either
+        // way we are consistent at the top of the oplog.
+        log() << "No oplog entries to apply for recovery. appliedThrough is null.";
+    } else {
+        // If the appliedThrough is not null, then we shut down uncleanly during secondary oplog
+        // application and must apply from the appliedThrough to the top of the oplog.
+        log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough
+              << ", through the top of the oplog: " << topOfOplog;
+        _applyToEndOfOplog(opCtx, appliedThrough.getTimestamp(), topOfOplog.getTimestamp());
+    }
+}
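
Note: in the unstable-checkpoint path, appliedThrough alone decides whether anything
replays. A toy sketch of the implied replay range (assumed half-open semantics, not
MongoDB source; e.g. appliedThrough = 5 with top of oplog = 9 replays the entries
after 5 through 9):

#include <optional>
#include <utility>

using Timestamp = unsigned long long;

std::optional<std::pair<Timestamp, Timestamp>> unstableReplayRange(
    std::optional<Timestamp> appliedThrough, Timestamp topOfOplog) {
    if (!appliedThrough)
        return std::nullopt;  // clean shutdown, or crashed as a primary
    return std::make_pair(*appliedThrough, topOfOplog);  // unclean secondary shutdown
}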
+
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
@@ -167,8 +146,7 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
-        log()
-            << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
+        log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
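
Note: the two checks above form a small total-order argument on timestamps: equal
means done, greater means the consistency markers are ahead of the oplog and the node
cannot recover. A toy sketch (not MongoDB source):

#include <cstdlib>

using Timestamp = unsigned long long;

bool shouldReplay(Timestamp startPoint, Timestamp topOfOplog) {
    if (startPoint == topOfOplog)
        return false;  // everything we have is already applied
    if (startPoint > topOfOplog)
        std::abort();  // markers ahead of the oplog: unrecoverable (severe + fassert)
    return true;       // replay from startPoint through topOfOplog
}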
@@ -209,15 +187,23 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
    // Apply remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);
+    BSONObj entry;
    while (cursor->more()) {
-        auto entry = cursor->nextSafe();
+        entry = cursor->nextSafe();
        fassert(40294, SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
-        _consistencyMarkers->setAppliedThrough(opCtx,
-                                               fassert(40295, OpTime::parseFromOplogEntry(entry)));
    }
+
+    // We may crash before setting appliedThrough. If we have a stable checkpoint, we will recover
+    // to that checkpoint at a replication consistent point, and applying the oplog is safe.
+    // If we don't have a stable checkpoint, then we must be in startup recovery, and not rollback
+    // recovery, because we only roll back to a stable timestamp when we have a stable checkpoint.
+    // Startup recovery from an unstable checkpoint only ever applies a single batch and it is safe
+    // to replay the batch from any point.
+    _consistencyMarkers->setAppliedThrough(opCtx,
+                                           fassert(40295, OpTime::parseFromOplogEntry(entry)));
}
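
Note: the hunk above replaces a per-entry appliedThrough update with a single update
after the batch: one durable marker write per recovery pass. A toy sketch of the
resulting shape (assumed idempotent apply, as the comment above argues; not MongoDB
source):

#include <vector>

struct Entry {
    unsigned long long ts;
};

// Stands in for SyncTail::syncApply in kRecovering mode, which must tolerate
// replaying an already-applied entry after a crash mid-batch.
void applyEntry(const Entry&) {}

unsigned long long applyBatch(const std::vector<Entry>& batch) {
    Entry last{0};
    for (const Entry& e : batch) {
        applyEntry(e);
        last = e;
    }
    return last.ts;  // caller persists this as appliedThrough, once per batch
}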
-StatusWith<OpTime> ReplicationRecoveryImpl::_getLastAppliedOpTime(OperationContext* opCtx) const {
+StatusWith<OpTime> ReplicationRecoveryImpl::_getTopOfOplog(OperationContext* opCtx) const {
    const auto docsSW = _storageInterface->findDocuments(opCtx,
                                                         NamespaceString::kRsOplogNamespace,
                                                         boost::none,  // Collection scan
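
Note: the renamed _getTopOfOplog (its findDocuments call is cut off above) reads the
newest oplog entry with a one-document backwards scan. A toy equivalent (assumed
semantics; not the real StorageInterface API):

#include <map>
#include <optional>

using Timestamp = unsigned long long;

// The "top" of a timestamp-ordered oplog is its last key; the empty case stands
// in for CollectionIsEmpty / NamespaceNotFound, where recovery defers to initial sync.
std::optional<Timestamp> getTopOfOplog(const std::map<Timestamp, int>& oplog) {
    if (oplog.empty())
        return std::nullopt;
    return oplog.rbegin()->first;
}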