author     Judah Schvimer <judah@mongodb.com>    2018-03-12 14:47:40 -0400
committer  Judah Schvimer <judah@mongodb.com>    2018-03-12 14:47:40 -0400
commit     b1102c617e04ff751d702435f9d4521727e579e1 (patch)
tree       0878af2c87cc855f833393c4323b623e6e1b3e31 /src/mongo/db/repl/replication_recovery.cpp
parent     ccaad4fb968b8a21a697c00362de5bb618bbb184 (diff)
SERVER-33292 Have storage dictate where replication recovery should begin playing oplog from
Diffstat (limited to 'src/mongo/db/repl/replication_recovery.cpp')
-rw-r--r--  src/mongo/db/repl/replication_recovery.cpp  176
1 file changed, 81 insertions, 95 deletions
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 1c2cef4dadd..6e676942929 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -42,122 +42,101 @@
namespace mongo {
namespace repl {
-namespace {
-
-/**
- * Returns the timestamp at which we should start oplog application. Returns boost::none if
- * there are no oplog entries to apply.
- */
-boost::optional<Timestamp> _getOplogApplicationStartPoint(Timestamp checkpointTimestamp,
-                                                          OpTime appliedThrough) {
-    if (!checkpointTimestamp.isNull() && !appliedThrough.isNull()) {
-        // In versions that support "recover to stable timestamp" you should never see a
-        // non-null appliedThrough in a checkpoint, since we never take checkpoints in the middle
-        // of a secondary batch application, and a node that does not support "recover to stable
-        // timestamp" should never see a non-null checkpointTimestamp.
-        severe() << "checkpointTimestamp (" << checkpointTimestamp.toBSON()
-                 << ") and appliedThrough (" << appliedThrough << ") cannot both be non-null.";
-        fassertFailedNoTrace(40603);
-
-    } else if (!checkpointTimestamp.isNull()) {
-        // If appliedThrough is null and the checkpointTimestamp is not null, then we recovered
-        // to a checkpoint and should use that checkpoint timestamp as the oplog application
-        // start point.
-        log() << "Starting recovery oplog application at the checkpointTimestamp: "
-              << checkpointTimestamp.toBSON();
-        return checkpointTimestamp;
-
-    } else if (!appliedThrough.isNull()) {
-        // If the checkpointTimestamp is null and the appliedThrough is not null, then we did not
-        // recover to a checkpoint and we should use the appliedThrough as the oplog application
-        // start point.
-        log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough;
-        return appliedThrough.getTimestamp();
-
-    } else {
-        log() << "No oplog entries to apply for recovery. appliedThrough and "
-                 "checkpointTimestamp are both null.";
-        // No follow-up work to do.
-        return boost::none;
-    }
-    MONGO_UNREACHABLE;
-}
-
-} // namespace
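
Note: the removed helper above reduces to a three-way decision table over the two
markers. A minimal standalone sketch of that table (toy types, not MongoDB source;
0 stands in for a "null" timestamp):

#include <cstdlib>
#include <optional>

using Timestamp = unsigned long long;  // toy stand-in for mongo::Timestamp

std::optional<Timestamp> oldStartPoint(Timestamp checkpointTimestamp, Timestamp appliedThrough) {
    if (checkpointTimestamp != 0 && appliedThrough != 0)
        std::abort();                // fassertFailedNoTrace(40603): never both non-null
    if (checkpointTimestamp != 0)
        return checkpointTimestamp;  // recovered to a checkpoint
    if (appliedThrough != 0)
        return appliedThrough;       // resume where batch application stopped
    return std::nullopt;             // nothing to apply
}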
-
ReplicationRecoveryImpl::ReplicationRecoveryImpl(StorageInterface* storageInterface,
                                                 ReplicationConsistencyMarkers* consistencyMarkers)
    : _storageInterface(storageInterface), _consistencyMarkers(consistencyMarkers) {}
-void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try {
+void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
+                                               boost::optional<Timestamp> stableTimestamp) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }
    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
-    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
-
    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);
-    }
-    // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
-    // erroneously.
-    _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
-
-    // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist.
-    _consistencyMarkers->removeOldOplogDeleteFromPointField(opCtx);
-
-    auto topOfOplogSW = _getLastAppliedOpTime(opCtx);
-    boost::optional<OpTime> topOfOplog = boost::none;
-    if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty &&
-        topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) {
-        fassert(40290, topOfOplogSW);
-        topOfOplog = topOfOplogSW.getValue();
+        // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
+        // erroneously.
+        _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
    }
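
Note: the ordering inside the branch above matters: truncate first, clear the marker
second. A toy sketch of why that is crash-safe (assumed semantics, not MongoDB source;
the inclusivity of the truncate follows the "starting at" log line, and 0 models an
unset marker):

#include <cstdint>
#include <map>

// Re-running this after a crash between the two steps merely repeats the
// (idempotent) truncate; clearing the marker first could skip a needed truncate.
void recoverTruncate(std::map<std::uint64_t, int>& oplog, std::uint64_t& truncateAfterPoint) {
    if (truncateAfterPoint == 0)
        return;
    oplog.erase(oplog.lower_bound(truncateAfterPoint), oplog.end());
    truncateAfterPoint = 0;  // cleared only once the truncate has completed
}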
-    // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the
-    // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we should
-    // set the initial data timestamp to the top OpTime in the oplog once the data is consistent
-    // there. If there is nothing in the oplog, then we do not set the initial data timestamp.
-    auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx);
-    if (!checkpointTimestamp.isNull()) {
-
-        // If we have a checkpoint timestamp, we set the initial data timestamp now so that
-        // the operations we apply below can be given the proper timestamps.
-        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(), checkpointTimestamp);
-    }
-
-    // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a
-    // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is
-    // nothing to set it to.
-    if (!topOfOplog) {
-        log() << "No oplog entries to apply for recovery. Oplog is empty.";
+    auto topOfOplogSW = _getTopOfOplog(opCtx);
+    if (topOfOplogSW.getStatus() == ErrorCodes::CollectionIsEmpty ||
+        topOfOplogSW.getStatus() == ErrorCodes::NamespaceNotFound) {
+        // Oplog is empty. There are no oplog entries to apply, so we exit recovery and go into
+        // initial sync.
+        log() << "No oplog entries to apply for recovery. Oplog is empty. Entering initial sync.";
        return;
    }
+    fassert(40290, topOfOplogSW);
+    const auto topOfOplog = topOfOplogSW.getValue();
-    if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) {
-        _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp());
+    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
+    invariant(!stableTimestamp || appliedThrough.isNull() ||
+                  *stableTimestamp == appliedThrough.getTimestamp(),
+              str::stream() << "Stable timestamp " << stableTimestamp->toString()
+                            << " does not equal appliedThrough timestamp "
+                            << appliedThrough.toString());
+
+    // If we were passed in a stable timestamp, we are in rollback recovery and should recover from
+    // that stable timestamp. Otherwise, we're recovering at startup. If this storage engine
+    // supports recover to stable timestamp, we ask it for the recovery timestamp. If the storage
+    // engine returns a timestamp, we recover from that point. However, if the storage engine
+    // returns "none", the storage engine does not have a stable checkpoint and we must recover from
+    // an unstable checkpoint instead.
+    const bool supportsRecoverToStableTimestamp =
+        _storageInterface->supportsRecoverToStableTimestamp(opCtx->getServiceContext());
+    if (!stableTimestamp && supportsRecoverToStableTimestamp) {
+        stableTimestamp = _storageInterface->getRecoveryTimestamp(opCtx->getServiceContext());
    }
-    // If we don't have a checkpoint timestamp, then we are either not running a storage engine
-    // that supports "recover to stable timestamp" or we just upgraded from a version that didn't.
-    // In both cases, the data on disk is not consistent until we have applied all oplog entries to
-    // the end of the oplog, since we do not know which ones actually got applied before shutdown.
-    // As a result, we do not set the initial data timestamp until after we have applied to the end
-    // of the oplog.
-    if (checkpointTimestamp.isNull()) {
-        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
-                                                   topOfOplog->getTimestamp());
+    if (stableTimestamp) {
+        invariant(supportsRecoverToStableTimestamp);
+        _recoverFromStableTimestamp(opCtx, *stableTimestamp, appliedThrough, topOfOplog);
+    } else {
+        _recoverFromUnstableCheckpoint(opCtx, appliedThrough, topOfOplog);
    }
-
} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
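
Note: the new control flow above boils down to one question: where does the stable
timestamp come from? A minimal sketch of that selection (toy types and assumed names,
not MongoDB source):

#include <optional>

using Timestamp = unsigned long long;

enum class Source { kStableTimestamp, kUnstableCheckpoint };

// stable: supplied by the caller during rollback recovery, empty at startup.
// engineStable: what a "recover to stable timestamp" storage engine reports
// when it actually has a stable checkpoint.
Source chooseRecoverySource(std::optional<Timestamp>& stable,
                            bool supportsRecoverToStable,
                            std::optional<Timestamp> engineStable) {
    if (!stable && supportsRecoverToStable)
        stable = engineStable;  // startup recovery: ask the storage engine
    return stable ? Source::kStableTimestamp : Source::kUnstableCheckpoint;
}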
+void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCtx,
+                                                          Timestamp stableTimestamp,
+                                                          OpTime appliedThrough,
+                                                          OpTime topOfOplog) {
+    invariant(!stableTimestamp.isNull());
+    invariant(!topOfOplog.isNull());
+    log() << "Recovering from stable timestamp: " << stableTimestamp
+          << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough << ")";
+
+    log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp;
+    _applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp());
+}
+
+void ReplicationRecoveryImpl::_recoverFromUnstableCheckpoint(OperationContext* opCtx,
+                                                             OpTime appliedThrough,
+                                                             OpTime topOfOplog) {
+    invariant(!topOfOplog.isNull());
+    log() << "Recovering from an unstable checkpoint (top of oplog: " << topOfOplog
+          << ", appliedThrough: " << appliedThrough << ")";
+
+    if (appliedThrough.isNull()) {
+        // The appliedThrough would be null if we shut down cleanly or crashed as a primary. Either
+        // way we are consistent at the top of the oplog.
+        log() << "No oplog entries to apply for recovery. appliedThrough is null.";
+    } else {
+        // If the appliedThrough is not null, then we shut down uncleanly during secondary oplog
+        // application and must apply from the appliedThrough to the top of the oplog.
+        log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough
+              << ", through the top of the oplog: " << topOfOplog;
+        _applyToEndOfOplog(opCtx, appliedThrough.getTimestamp(), topOfOplog.getTimestamp());
+    }
+}
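
Note: in the unstable-checkpoint path, appliedThrough alone decides whether anything
replays. A toy sketch of the implied replay range (assumed half-open semantics, not
MongoDB source; e.g. appliedThrough = 5 with top of oplog = 9 replays the entries
after 5 through 9):

#include <optional>
#include <utility>

using Timestamp = unsigned long long;

std::optional<std::pair<Timestamp, Timestamp>> unstableReplayRange(
    std::optional<Timestamp> appliedThrough, Timestamp topOfOplog) {
    if (!appliedThrough)
        return std::nullopt;  // clean shutdown, or crashed as a primary
    return std::make_pair(*appliedThrough, topOfOplog);  // unclean secondary shutdown
}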
+
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
@@ -167,8 +146,7 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
-        log()
-            << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
+        log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
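
Note: the two checks above form a small total-order argument on timestamps: equal
means done, greater means the consistency markers are ahead of the oplog and the node
cannot recover. A toy sketch (not MongoDB source):

#include <cstdlib>

using Timestamp = unsigned long long;

bool shouldReplay(Timestamp startPoint, Timestamp topOfOplog) {
    if (startPoint == topOfOplog)
        return false;  // everything we have is already applied
    if (startPoint > topOfOplog)
        std::abort();  // markers ahead of the oplog: unrecoverable (severe + fassert)
    return true;       // replay from startPoint through topOfOplog
}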
@@ -209,15 +187,23 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
    // Apply remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);
+    BSONObj entry;
    while (cursor->more()) {
-        auto entry = cursor->nextSafe();
+        entry = cursor->nextSafe();
        fassert(40294, SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
-        _consistencyMarkers->setAppliedThrough(opCtx,
-                                               fassert(40295, OpTime::parseFromOplogEntry(entry)));
    }
+
+    // We may crash before setting appliedThrough. If we have a stable checkpoint, we will recover
+    // to that checkpoint at a replication consistent point, and applying the oplog is safe.
+    // If we don't have a stable checkpoint, then we must be in startup recovery, and not rollback
+    // recovery, because we only roll back to a stable timestamp when we have a stable checkpoint.
+    // Startup recovery from an unstable checkpoint only ever applies a single batch and it is safe
+    // to replay the batch from any point.
+    _consistencyMarkers->setAppliedThrough(opCtx,
+                                           fassert(40295, OpTime::parseFromOplogEntry(entry)));
}
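
Note: the hunk above replaces a per-entry appliedThrough update with a single update
after the batch: one durable marker write per recovery pass. A toy sketch of the
resulting shape (assumed idempotent apply, as the comment above argues; not MongoDB
source):

#include <vector>

struct Entry {
    unsigned long long ts;
};

// Stands in for SyncTail::syncApply in kRecovering mode, which must tolerate
// replaying an already-applied entry after a crash mid-batch.
void applyEntry(const Entry&) {}

unsigned long long applyBatch(const std::vector<Entry>& batch) {
    Entry last{0};
    for (const Entry& e : batch) {
        applyEntry(e);
        last = e;
    }
    return last.ts;  // caller persists this as appliedThrough, once per batch
}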
-StatusWith<OpTime> ReplicationRecoveryImpl::_getLastAppliedOpTime(OperationContext* opCtx) const {
+StatusWith<OpTime> ReplicationRecoveryImpl::_getTopOfOplog(OperationContext* opCtx) const {
    const auto docsSW = _storageInterface->findDocuments(opCtx,
                                                         NamespaceString::kRsOplogNamespace,
                                                         boost::none,  // Collection scan
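
Note: the renamed _getTopOfOplog (its findDocuments call is cut off above) reads the
newest oplog entry with a one-document backwards scan. A toy equivalent (assumed
semantics; not the real StorageInterface API):

#include <map>
#include <optional>

using Timestamp = unsigned long long;

// The "top" of a timestamp-ordered oplog is its last key; the empty case stands
// in for CollectionIsEmpty / NamespaceNotFound, where recovery defers to initial sync.
std::optional<Timestamp> getTopOfOplog(const std::map<Timestamp, int>& oplog) {
    if (oplog.empty())
        return std::nullopt;
    return oplog.rbegin()->first;
}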