diff options
author | Judah Schvimer <judah@mongodb.com> | 2018-03-12 14:47:40 -0400 |
---|---|---|
committer | Judah Schvimer <judah@mongodb.com> | 2018-03-12 14:47:40 -0400 |
commit | b1102c617e04ff751d702435f9d4521727e579e1 (patch) | |
tree | 0878af2c87cc855f833393c4323b623e6e1b3e31 /src/mongo/db/repl/replication_recovery.cpp | |
parent | ccaad4fb968b8a21a697c00362de5bb618bbb184 (diff) | |
download | mongo-b1102c617e04ff751d702435f9d4521727e579e1.tar.gz |
SERVER-33292 Have storage dictate where replication recovery should begin playing oplog from
Diffstat (limited to 'src/mongo/db/repl/replication_recovery.cpp')
-rw-r--r-- | src/mongo/db/repl/replication_recovery.cpp | 176 |
1 files changed, 81 insertions, 95 deletions
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp index 1c2cef4dadd..6e676942929 100644 --- a/src/mongo/db/repl/replication_recovery.cpp +++ b/src/mongo/db/repl/replication_recovery.cpp @@ -42,122 +42,101 @@ namespace mongo { namespace repl { -namespace { - -/** - * Returns the timestamp at which we should start oplog application. Returns boost::none if - * there are no oplog entries to apply. - */ -boost::optional<Timestamp> _getOplogApplicationStartPoint(Timestamp checkpointTimestamp, - OpTime appliedThrough) { - if (!checkpointTimestamp.isNull() && !appliedThrough.isNull()) { - // In versions that support "recover to stable timestamp" you should never see a - // non-null appliedThrough in a checkpoint, since we never take checkpoints in the middle - // of a secondary batch application, and a node that does not support "recover to stable - // timestamp" should never see a non-null checkpointTimestamp. - severe() << "checkpointTimestamp (" << checkpointTimestamp.toBSON() - << ") and appliedThrough (" << appliedThrough << ") cannot both be non-null."; - fassertFailedNoTrace(40603); - - } else if (!checkpointTimestamp.isNull()) { - // If appliedThrough is null and the checkpointTimestamp is not null, then we recovered - // to a checkpoint and should use that checkpoint timestamp as the oplog application - // start point. - log() << "Starting recovery oplog application at the checkpointTimestamp: " - << checkpointTimestamp.toBSON(); - return checkpointTimestamp; - - } else if (!appliedThrough.isNull()) { - // If the checkpointTimestamp is null and the appliedThrough is not null, then we did not - // recover to a checkpoint and we should use the appliedThrough as the oplog application - // start point. - log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough; - return appliedThrough.getTimestamp(); - - } else { - log() << "No oplog entries to apply for recovery. appliedThrough and " - "checkpointTimestamp are both null."; - // No follow-up work to do. - return boost::none; - } - MONGO_UNREACHABLE; -} - -} // namespace - ReplicationRecoveryImpl::ReplicationRecoveryImpl(StorageInterface* storageInterface, ReplicationConsistencyMarkers* consistencyMarkers) : _storageInterface(storageInterface), _consistencyMarkers(consistencyMarkers) {} -void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try { +void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx, + boost::optional<Timestamp> stableTimestamp) try { if (_consistencyMarkers->getInitialSyncFlag(opCtx)) { log() << "No recovery needed. Initial sync flag set."; return; // Initial Sync will take over so no cleanup is needed. } const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx); - const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx); - if (!truncateAfterPoint.isNull()) { log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON(); _truncateOplogTo(opCtx, truncateAfterPoint); - } - // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries - // erroneously. - _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {}); - - // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist. - _consistencyMarkers->removeOldOplogDeleteFromPointField(opCtx); - - auto topOfOplogSW = _getLastAppliedOpTime(opCtx); - boost::optional<OpTime> topOfOplog = boost::none; - if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty && - topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) { - fassert(40290, topOfOplogSW); - topOfOplog = topOfOplogSW.getValue(); + // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries + // erroneously. + _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {}); } - // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the - // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we should - // set the initial data timestamp to the top OpTime in the oplog once the data is consistent - // there. If there is nothing in the oplog, then we do not set the initial data timestamp. - auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx); - if (!checkpointTimestamp.isNull()) { - - // If we have a checkpoint timestamp, we set the initial data timestamp now so that - // the operations we apply below can be given the proper timestamps. - _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(), checkpointTimestamp); - } - - // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a - // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is - // nothing to set it to. - if (!topOfOplog) { - log() << "No oplog entries to apply for recovery. Oplog is empty."; + auto topOfOplogSW = _getTopOfOplog(opCtx); + if (topOfOplogSW.getStatus() == ErrorCodes::CollectionIsEmpty || + topOfOplogSW.getStatus() == ErrorCodes::NamespaceNotFound) { + // Oplog is empty. There are no oplog entries to apply, so we exit recovery and go into + // initial sync. + log() << "No oplog entries to apply for recovery. Oplog is empty. Entering initial sync."; return; } + fassert(40290, topOfOplogSW); + const auto topOfOplog = topOfOplogSW.getValue(); - if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) { - _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp()); + const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx); + invariant(!stableTimestamp || appliedThrough.isNull() || + *stableTimestamp == appliedThrough.getTimestamp(), + str::stream() << "Stable timestamp " << stableTimestamp->toString() + << " does not equal appliedThrough timestamp " + << appliedThrough.toString()); + + // If we were passed in a stable timestamp, we are in rollback recovery and should recover from + // that stable timestamp. Otherwise, we're recovering at startup. If this storage engine + // supports recover to stable timestamp, we ask it for the recovery timestamp. If the storage + // engine returns a timestamp, we recover from that point. However, if the storage engine + // returns "none", the storage engine does not have a stable checkpoint and we must recover from + // an unstable checkpoint instead. + const bool supportsRecoverToStableTimestamp = + _storageInterface->supportsRecoverToStableTimestamp(opCtx->getServiceContext()); + if (!stableTimestamp && supportsRecoverToStableTimestamp) { + stableTimestamp = _storageInterface->getRecoveryTimestamp(opCtx->getServiceContext()); } - // If we don't have a checkpoint timestamp, then we are either not running a storage engine - // that supports "recover to stable timestamp" or we just upgraded from a version that didn't. - // In both cases, the data on disk is not consistent until we have applied all oplog entries to - // the end of the oplog, since we do not know which ones actually got applied before shutdown. - // As a result, we do not set the initial data timestamp until after we have applied to the end - // of the oplog. - if (checkpointTimestamp.isNull()) { - _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(), - topOfOplog->getTimestamp()); + if (stableTimestamp) { + invariant(supportsRecoverToStableTimestamp); + _recoverFromStableTimestamp(opCtx, *stableTimestamp, appliedThrough, topOfOplog); + } else { + _recoverFromUnstableCheckpoint(opCtx, appliedThrough, topOfOplog); } - } catch (...) { severe() << "Caught exception during replication recovery: " << exceptionToStatus(); std::terminate(); } +void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCtx, + Timestamp stableTimestamp, + OpTime appliedThrough, + OpTime topOfOplog) { + invariant(!stableTimestamp.isNull()); + invariant(!topOfOplog.isNull()); + log() << "Recovering from stable timestamp: " << stableTimestamp + << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough << ")"; + + log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp; + _applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp()); +} + +void ReplicationRecoveryImpl::_recoverFromUnstableCheckpoint(OperationContext* opCtx, + OpTime appliedThrough, + OpTime topOfOplog) { + invariant(!topOfOplog.isNull()); + log() << "Recovering from an unstable checkpoint (top of oplog: " << topOfOplog + << ", appliedThrough: " << appliedThrough << ")"; + + if (appliedThrough.isNull()) { + // The appliedThrough would be null if we shut down cleanly or crashed as a primary. Either + // way we are consistent at the top of the oplog. + log() << "No oplog entries to apply for recovery. appliedThrough is null."; + } else { + // If the appliedThrough is not null, then we shut down uncleanly during secondary oplog + // application and must apply from the appliedThrough to the top of the oplog. + log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough + << ", through the top of the oplog: " << topOfOplog; + _applyToEndOfOplog(opCtx, appliedThrough.getTimestamp(), topOfOplog.getTimestamp()); + } +} + void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx, Timestamp oplogApplicationStartPoint, Timestamp topOfOplog) { @@ -167,8 +146,7 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx, // Check if we have any unapplied ops in our oplog. It is important that this is done after // deleting the ragged end of the oplog. if (oplogApplicationStartPoint == topOfOplog) { - log() - << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog."; + log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog."; return; // We've applied all the valid oplog we have. } else if (oplogApplicationStartPoint > topOfOplog) { severe() << "Applied op " << oplogApplicationStartPoint.toBSON() @@ -209,15 +187,23 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx, // Apply remaining ops one at at time, but don't log them because they are already logged. UnreplicatedWritesBlock uwb(opCtx); + BSONObj entry; while (cursor->more()) { - auto entry = cursor->nextSafe(); + entry = cursor->nextSafe(); fassert(40294, SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering)); - _consistencyMarkers->setAppliedThrough(opCtx, - fassert(40295, OpTime::parseFromOplogEntry(entry))); } + + // We may crash before setting appliedThrough. If we have a stable checkpoint, we will recover + // to that checkpoint at a replication consistent point, and applying the oplog is safe. + // If we don't have a stable checkpoint, then we must be in startup recovery, and not rollback + // recovery, because we only roll back to a stable timestamp when we have a stable checkpoint. + // Startup recovery from an unstable checkpoint only ever applies a single batch and it is safe + // to replay the batch from any point. + _consistencyMarkers->setAppliedThrough(opCtx, + fassert(40295, OpTime::parseFromOplogEntry(entry))); } -StatusWith<OpTime> ReplicationRecoveryImpl::_getLastAppliedOpTime(OperationContext* opCtx) const { +StatusWith<OpTime> ReplicationRecoveryImpl::_getTopOfOplog(OperationContext* opCtx) const { const auto docsSW = _storageInterface->findDocuments(opCtx, NamespaceString::kRsOplogNamespace, boost::none, // Collection scan |