Diffstat (limited to 'src/mongo/db/repl/replication_recovery.cpp')
-rw-r--r--  src/mongo/db/repl/replication_recovery.cpp  192
1 file changed, 129 insertions(+), 63 deletions(-)
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 3c7b47d3285..f28c9aaa2d1 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -50,6 +50,7 @@
#include "mongo/db/storage/storage_parameters_gen.h"
#include "mongo/db/transaction_history_iterator.h"
#include "mongo/db/transaction_participant.h"
+#include "mongo/logv2/log.h"
#include "mongo/util/log.h"
#include "mongo/util/timer.h"
@@ -90,8 +91,12 @@ public:
void onBatchEnd(const StatusWith<OpTime>&, const std::vector<OplogEntry>&) final {}
void complete(const OpTime& applyThroughOpTime) const {
- log() << "Applied " << _numOpsApplied << " operations in " << _numBatches
- << " batches. Last operation applied with optime: " << applyThroughOpTime;
+ LOGV2(21536,
+ "Applied {numOpsApplied} operations in {numBatches} batches. Last operation applied "
+ "with optime: {applyThroughOpTime}",
+ "numOpsApplied"_attr = _numOpsApplied,
+ "numBatches"_attr = _numBatches,
+ "applyThroughOpTime"_attr = applyThroughOpTime);
}
private:
@@ -134,16 +139,21 @@ public:
ss << " and <= " << _oplogApplicationEndPoint->toBSON();
}
- severe() << "Couldn't find any entries in the oplog" << ss.str()
- << ", which should be impossible.";
+ LOGV2_FATAL(
+ 21559,
+ "Couldn't find any entries in the oplog{ss_str}, which should be impossible.",
+ "ss_str"_attr = ss.str());
fassertFailedNoTrace(40293);
}
auto firstTimestampFound =
fassert(40291, OpTime::parseFromOplogEntry(_cursor->nextSafe())).getTimestamp();
if (firstTimestampFound != _oplogApplicationStartPoint) {
- severe() << "Oplog entry at " << _oplogApplicationStartPoint.toBSON()
- << " is missing; actual entry found is " << firstTimestampFound.toBSON();
+ LOGV2_FATAL(21560,
+ "Oplog entry at {oplogApplicationStartPoint} is missing; actual entry "
+ "found is {firstTimestampFound}",
+ "oplogApplicationStartPoint"_attr = _oplogApplicationStartPoint.toBSON(),
+ "firstTimestampFound"_attr = firstTimestampFound.toBSON());
fassertFailedNoTrace(40292);
}
}
@@ -210,8 +220,9 @@ private:
boost::optional<Timestamp> recoverFromOplogPrecursor(OperationContext* opCtx,
StorageInterface* storageInterface) {
if (!storageInterface->supportsRecoveryTimestamp(opCtx->getServiceContext())) {
- severe() << "Cannot recover from the oplog with a storage engine that does not support "
- << "recover to stable timestamp.";
+ LOGV2_FATAL(21561,
+ "Cannot recover from the oplog with a storage engine that does not support "
+ "recover to stable timestamp.");
fassertFailedNoTrace(50805);
}
@@ -220,7 +231,8 @@ boost::optional<Timestamp> recoverFromOplogPrecursor(OperationContext* opCtx,
// happen.
auto recoveryTS = storageInterface->getRecoveryTimestamp(opCtx->getServiceContext());
if (recoveryTS && recoveryTS->isNull()) {
- severe() << "Cannot recover from the oplog with stable checkpoint at null timestamp.";
+ LOGV2_FATAL(21562,
+ "Cannot recover from the oplog with stable checkpoint at null timestamp.");
fassertFailedNoTrace(50806);
}
@@ -238,36 +250,44 @@ void ReplicationRecoveryImpl::_assertNoRecoveryNeededOnUnstableCheckpoint(Operat
invariant(!_storageInterface->getRecoveryTimestamp(opCtx->getServiceContext()));
if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
- severe() << "Unexpected recovery needed, initial sync flag set.";
+ LOGV2_FATAL(21563, "Unexpected recovery needed, initial sync flag set.");
fassertFailedNoTrace(31362);
}
const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
if (!truncateAfterPoint.isNull()) {
- severe() << "Unexpected recovery needed, oplog requires truncation. Truncate after point: "
- << truncateAfterPoint.toString();
+ LOGV2_FATAL(21564,
+ "Unexpected recovery needed, oplog requires truncation. Truncate after point: "
+ "{truncateAfterPoint}",
+ "truncateAfterPoint"_attr = truncateAfterPoint.toString());
fassertFailedNoTrace(31363);
}
auto topOfOplogSW = _getTopOfOplog(opCtx);
if (!topOfOplogSW.isOK()) {
- severe() << "Recovery not possible, no oplog found: " << topOfOplogSW.getStatus();
+ LOGV2_FATAL(21565,
+ "Recovery not possible, no oplog found: {topOfOplogSW_getStatus}",
+ "topOfOplogSW_getStatus"_attr = topOfOplogSW.getStatus());
fassertFailedNoTrace(31364);
}
const auto topOfOplog = topOfOplogSW.getValue();
const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
if (!appliedThrough.isNull() && appliedThrough != topOfOplog) {
- severe() << "Unexpected recovery needed, appliedThrough is not at top of oplog, indicating "
- "oplog has not been fully applied. appliedThrough: "
- << appliedThrough.toString();
+ LOGV2_FATAL(21566,
+ "Unexpected recovery needed, appliedThrough is not at top of oplog, indicating "
+ "oplog has not been fully applied. appliedThrough: {appliedThrough}",
+ "appliedThrough"_attr = appliedThrough.toString());
fassertFailedNoTrace(31365);
}
const auto minValid = _consistencyMarkers->getMinValid(opCtx);
if (minValid > topOfOplog) {
- severe() << "Unexpected recovery needed, top of oplog is not consistent. topOfOplog: "
- << topOfOplog << ", minValid: " << minValid;
+ LOGV2_FATAL(21567,
+ "Unexpected recovery needed, top of oplog is not consistent. topOfOplog: "
+ "{topOfOplog}, minValid: {minValid}",
+ "topOfOplog"_attr = topOfOplog,
+ "minValid"_attr = minValid);
fassertFailedNoTrace(31366);
}
}
@@ -287,21 +307,25 @@ void ReplicationRecoveryImpl::recoverFromOplogAsStandalone(OperationContext* opC
if (gTakeUnstableCheckpointOnShutdown) {
// Ensure 'recoverFromOplogAsStandalone' with 'takeUnstableCheckpointOnShutdown'
// is safely idempotent when it succeeds.
- log() << "Recovering from unstable checkpoint with 'takeUnstableCheckpointOnShutdown'."
- << " Confirming that no oplog recovery is needed.";
+ LOGV2(21537,
+ "Recovering from unstable checkpoint with 'takeUnstableCheckpointOnShutdown'. "
+ "Confirming that no oplog recovery is needed.");
_assertNoRecoveryNeededOnUnstableCheckpoint(opCtx);
- log() << "Not doing any oplog recovery since there is an unstable checkpoint that is "
- << "up to date.";
+ LOGV2(21538,
+ "Not doing any oplog recovery since there is an unstable checkpoint that is up "
+ "to date.");
} else {
- severe() << "Cannot use 'recoverFromOplogAsStandalone' without a stable checkpoint.";
+ LOGV2_FATAL(21568,
+ "Cannot use 'recoverFromOplogAsStandalone' without a stable checkpoint.");
fassertFailedNoTrace(31229);
}
}
reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
- warning() << "Setting mongod to readOnly mode as a result of specifying "
- "'recoverFromOplogAsStandalone'.";
+ LOGV2_WARNING(21558,
+ "Setting mongod to readOnly mode as a result of specifying "
+ "'recoverFromOplogAsStandalone'.");
storageGlobalParams.readOnly = true;
}
@@ -313,7 +337,7 @@ void ReplicationRecoveryImpl::recoverFromOplogUpTo(OperationContext* opCtx, Time
auto recoveryTS = recoverFromOplogPrecursor(opCtx, _storageInterface);
if (!recoveryTS) {
- severe() << "Cannot use 'recoverToOplogTimestamp' without a stable checkpoint.";
+ LOGV2_FATAL(21569, "Cannot use 'recoverToOplogTimestamp' without a stable checkpoint.");
fassertFailedNoTrace(31399);
}
@@ -322,15 +346,18 @@ void ReplicationRecoveryImpl::recoverFromOplogUpTo(OperationContext* opCtx, Time
Timestamp startPoint = _consistencyMarkers->getAppliedThrough(opCtx).getTimestamp();
if (startPoint.isNull()) {
- log() << "No stored oplog entries to apply for recovery.";
+ LOGV2(21539, "No stored oplog entries to apply for recovery.");
return;
}
invariant(!endPoint.isNull());
if (startPoint == endPoint) {
- log() << "No oplog entries to apply for recovery. Start point '" << startPoint
- << "' is at the end point '" << endPoint << "' in the oplog.";
+ LOGV2(21540,
+ "No oplog entries to apply for recovery. Start point '{startPoint}' is at the end "
+ "point '{endPoint}' in the oplog.",
+ "startPoint"_attr = startPoint,
+ "endPoint"_attr = endPoint);
return;
} else if (startPoint > endPoint) {
uasserted(ErrorCodes::BadValue,
@@ -341,8 +368,11 @@ void ReplicationRecoveryImpl::recoverFromOplogUpTo(OperationContext* opCtx, Time
Timestamp appliedUpTo = _applyOplogOperations(opCtx, startPoint, endPoint);
if (appliedUpTo.isNull()) {
- log() << "No stored oplog entries to apply for recovery between " << startPoint.toString()
- << " (inclusive) and " << endPoint.toString() << " (inclusive).";
+ LOGV2(21541,
+ "No stored oplog entries to apply for recovery between {startPoint} (inclusive) and "
+ "{endPoint} (inclusive).",
+ "startPoint"_attr = startPoint.toString(),
+ "endPoint"_attr = endPoint.toString());
} else {
invariant(appliedUpTo <= endPoint);
}
@@ -353,7 +383,7 @@ void ReplicationRecoveryImpl::recoverFromOplogUpTo(OperationContext* opCtx, Time
void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
boost::optional<Timestamp> stableTimestamp) try {
if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
- log() << "No recovery needed. Initial sync flag set.";
+ LOGV2(21542, "No recovery needed. Initial sync flag set.");
return; // Initial Sync will take over so no cleanup is needed.
}
@@ -393,7 +423,7 @@ void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
topOfOplogSW.getStatus() == ErrorCodes::NamespaceNotFound) {
// Oplog is empty. There are no oplog entries to apply, so we exit recovery and go into
// initial sync.
- log() << "No oplog entries to apply for recovery. Oplog is empty.";
+ LOGV2(21543, "No oplog entries to apply for recovery. Oplog is empty.");
return;
}
fassert(40290, topOfOplogSW);
@@ -406,7 +436,9 @@ void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
_recoverFromUnstableCheckpoint(opCtx, appliedThrough, topOfOplog);
}
} catch (...) {
- severe() << "Caught exception during replication recovery: " << exceptionToStatus();
+ LOGV2_FATAL(21570,
+ "Caught exception during replication recovery: {exceptionToStatus}",
+ "exceptionToStatus"_attr = exceptionToStatus());
std::terminate();
}
@@ -419,11 +451,17 @@ void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCt
const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
- log() << "Recovering from stable timestamp: " << stableTimestamp
- << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough
- << ", TruncateAfter: " << truncateAfterPoint << ")";
-
- log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp;
+ LOGV2(21544,
+ "Recovering from stable timestamp: {stableTimestamp} (top of oplog: {topOfOplog}, "
+ "appliedThrough: {appliedThrough}, TruncateAfter: {truncateAfterPoint})",
+ "stableTimestamp"_attr = stableTimestamp,
+ "topOfOplog"_attr = topOfOplog,
+ "appliedThrough"_attr = appliedThrough,
+ "truncateAfterPoint"_attr = truncateAfterPoint);
+
+ LOGV2(21545,
+ "Starting recovery oplog application at the stable timestamp: {stableTimestamp}",
+ "stableTimestamp"_attr = stableTimestamp);
_applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp());
}
@@ -431,18 +469,24 @@ void ReplicationRecoveryImpl::_recoverFromUnstableCheckpoint(OperationContext* o
OpTime appliedThrough,
OpTime topOfOplog) {
invariant(!topOfOplog.isNull());
- log() << "Recovering from an unstable checkpoint (top of oplog: " << topOfOplog
- << ", appliedThrough: " << appliedThrough << ")";
+ LOGV2(21546,
+ "Recovering from an unstable checkpoint (top of oplog: {topOfOplog}, appliedThrough: "
+ "{appliedThrough})",
+ "topOfOplog"_attr = topOfOplog,
+ "appliedThrough"_attr = appliedThrough);
if (appliedThrough.isNull()) {
// The appliedThrough would be null if we shut down cleanly or crashed as a primary. Either
// way we are consistent at the top of the oplog.
- log() << "No oplog entries to apply for recovery. appliedThrough is null.";
+ LOGV2(21547, "No oplog entries to apply for recovery. appliedThrough is null.");
} else {
// If the appliedThrough is not null, then we shut down uncleanly during secondary oplog
// application and must apply from the appliedThrough to the top of the oplog.
- log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough
- << ", through the top of the oplog: " << topOfOplog;
+ LOGV2(21548,
+ "Starting recovery oplog application at the appliedThrough: {appliedThrough}, "
+ "through the top of the oplog: {topOfOplog}",
+ "appliedThrough"_attr = appliedThrough,
+ "topOfOplog"_attr = topOfOplog);
// When `recoverFromOplog` truncates the oplog, that also happens to set the "oldest
// timestamp" to the truncation point[1]. `_applyToEndOfOplog` will then perform writes
@@ -492,11 +536,15 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
// Check if we have any unapplied ops in our oplog. It is important that this is done after
// deleting the ragged end of the oplog.
if (oplogApplicationStartPoint == topOfOplog) {
- log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog.";
+ LOGV2(21549,
+ "No oplog entries to apply for recovery. Start point is at the top of the oplog.");
return; // We've applied all the valid oplog we have.
} else if (oplogApplicationStartPoint > topOfOplog) {
- severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
- << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
+ LOGV2_FATAL(
+ 21571,
+ "Applied op {oplogApplicationStartPoint} not found. Top of oplog is {topOfOplog}.",
+ "oplogApplicationStartPoint"_attr = oplogApplicationStartPoint.toBSON(),
+ "topOfOplog"_attr = topOfOplog.toBSON());
fassertFailedNoTrace(40313);
}
@@ -511,8 +559,10 @@ void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
Timestamp ReplicationRecoveryImpl::_applyOplogOperations(OperationContext* opCtx,
const Timestamp& startPoint,
const Timestamp& endPoint) {
- log() << "Replaying stored operations from " << startPoint << " (inclusive) to " << endPoint
- << " (inclusive).";
+ LOGV2(21550,
+ "Replaying stored operations from {startPoint} (inclusive) to {endPoint} (inclusive).",
+ "startPoint"_attr = startPoint,
+ "endPoint"_attr = endPoint);
OplogBufferLocalOplog oplogBuffer(startPoint, endPoint);
oplogBuffer.startup(opCtx);
@@ -616,9 +666,10 @@ void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
const auto tsElem = entry["ts"];
if (count == 1) {
if (tsElem.eoo()) {
- LOG(2) << "Oplog tail entry: " << redact(entry);
+ LOGV2_DEBUG(21551, 2, "Oplog tail entry: {entry}", "entry"_attr = redact(entry));
} else {
- LOG(2) << "Oplog tail entry ts field: " << tsElem;
+ LOGV2_DEBUG(
+ 21552, 2, "Oplog tail entry ts field: {tsElem}", "tsElem"_attr = tsElem);
topOfOplog = tsElem.timestamp();
}
}
@@ -627,24 +678,35 @@ void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
// If count == 1, that means that we have nothing to delete because everything in the
// oplog is <= truncateTimestamp.
if (count != 1) {
- log() << "Truncating oplog from [" << Timestamp(previousRecordId.repr()) << " to "
- << topOfOplog << "]. Truncate after point is " << truncateTimestamp;
+ LOGV2(21553,
+ "Truncating oplog from [{Timestamp_previousRecordId_repr} to {topOfOplog}]. "
+ "Truncate after point is {truncateTimestamp}",
+ "Timestamp_previousRecordId_repr"_attr = Timestamp(previousRecordId.repr()),
+ "topOfOplog"_attr = topOfOplog,
+ "truncateTimestamp"_attr = truncateTimestamp);
invariant(!previousRecordId.isNull());
oplogCollection->cappedTruncateAfter(opCtx, previousRecordId, /*inclusive*/ true);
} else {
- log() << "There is no oplog after " << truncateTimestamp
- << " to truncate. The top of the oplog is " << topOfOplog;
+ LOGV2(21554,
+ "There is no oplog after {truncateTimestamp} to truncate. The top of the "
+ "oplog is {topOfOplog}",
+ "truncateTimestamp"_attr = truncateTimestamp,
+ "topOfOplog"_attr = topOfOplog);
}
- log() << "Replication recovery oplog truncation finished in: " << timer.millis()
- << "ms";
+ LOGV2(21555,
+ "Replication recovery oplog truncation finished in: {timer_millis}ms",
+ "timer_millis"_attr = timer.millis());
return;
}
previousRecordId = id;
}
- severe() << "Reached end of oplog looking for oplog entry before " << truncateTimestamp.toBSON()
- << " but couldn't find any after looking through " << count << " entries.";
+ LOGV2_FATAL(21572,
+ "Reached end of oplog looking for oplog entry before {truncateTimestamp} but "
+ "couldn't find any after looking through {count} entries.",
+ "truncateTimestamp"_attr = truncateTimestamp.toBSON(),
+ "count"_attr = count);
fassertFailedNoTrace(40296);
}
@@ -661,14 +723,18 @@ void ReplicationRecoveryImpl::_truncateOplogIfNeededAndThenClearOplogTruncateAft
AutoGetCollectionForRead oplog(opCtx, NamespaceString::kRsOplogNamespace);
invariant(oplog.getCollection());
- log() << "The oplog truncation point (" << truncatePoint
- << ") is equal to or earlier than the stable timestamp (" << stableTimestamp.get()
- << "), so truncating after the stable timestamp instead";
+ LOGV2(21556,
+ "The oplog truncation point ({truncatePoint}) is equal to or earlier than the stable "
+ "timestamp ({stableTimestamp_get}), so truncating after the stable timestamp instead",
+ "truncatePoint"_attr = truncatePoint,
+ "stableTimestamp_get"_attr = stableTimestamp.get());
truncatePoint = stableTimestamp.get();
}
- log() << "Removing unapplied oplog entries starting after: " << truncatePoint.toBSON();
+ LOGV2(21557,
+ "Removing unapplied oplog entries starting after: {truncatePoint}",
+ "truncatePoint"_attr = truncatePoint.toBSON());
_truncateOplogTo(opCtx, truncatePoint);
// Clear the oplogTruncateAfterPoint now that we have removed any holes that might exist in the oplog.
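
For reference, every change above follows the same conversion pattern: stream-style log() / warning() / severe() calls become LOGV2 / LOGV2_WARNING / LOGV2_FATAL / LOGV2_DEBUG calls carrying a unique numeric id (21536-21572 in this patch), a message with named {placeholders}, and typed "name"_attr arguments in place of operator<< concatenation. Below is a minimal sketch of that pattern, assuming the MongoDB source tree and the logv2 headers used above; the component define, the id 99999, and the helper function are illustrative, not part of this patch:

#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kReplication

#include "mongo/logv2/log.h"

namespace mongo {
namespace repl {

void reportRecoveryBatch(long long numOpsApplied, long long numBatches) {
    // Old style: untyped text assembled with operator<<:
    //     log() << "Applied " << numOpsApplied << " operations in "
    //           << numBatches << " batches.";

    // New style: a stable log id, a format string with named placeholders, and
    // typed attributes that logv2 can emit as structured fields. Debug-level
    // messages take an extra verbosity argument, e.g. LOGV2_DEBUG(id, 2, ...).
    LOGV2(99999,  // hypothetical id; real ids must be unique across the code base
          "Applied {numOpsApplied} operations in {numBatches} batches",
          "numOpsApplied"_attr = numOpsApplied,
          "numBatches"_attr = numBatches);
}

}  // namespace repl
}  // namespace mongo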