summaryrefslogtreecommitdiff
path: root/src/mongo/db/repl
diff options
context:
space:
mode:
authorXuerui Fa <xuerui.fa@mongodb.com>2020-05-18 15:20:41 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-05-21 18:46:44 +0000
commit7812887269d1623159c541f01b61e99f359cec0b (patch)
tree14ef89a45a1eb158d12df5646da224b2c1ec1825 /src/mongo/db/repl
parentef8b64b9e2f46955942240e57292d25740e11807 (diff)
downloadmongo-7812887269d1623159c541f01b61e99f359cec0b.tar.gz
SERVER-28068: Prevent nodes from going into rollback due to falling off the sync source's oplog
Diffstat (limited to 'src/mongo/db/repl')
-rw-r--r--src/mongo/db/repl/bgsync.cpp31
-rw-r--r--src/mongo/db/repl/oplog_fetcher.cpp249
-rw-r--r--src/mongo/db/repl/oplog_fetcher.h22
-rw-r--r--src/mongo/db/repl/oplog_fetcher_test.cpp99
-rw-r--r--src/mongo/db/repl/sync_source_resolver.cpp4
-rw-r--r--src/mongo/db/repl/sync_source_resolver_test.cpp4
6 files changed, 286 insertions, 123 deletions
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index a81b0ebbb11..52d8ef2e718 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -338,7 +338,7 @@ void BackgroundSync::_produce() {
numSyncSourceSelections.increment(1);
- if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
+ if (syncSourceResp.syncSourceStatus == ErrorCodes::TooStaleToSyncFromSource) {
// All (accessible) sync sources are too far ahead of us.
if (_replCoord->getMemberState().primary()) {
LOGV2_WARNING(21115,
@@ -560,12 +560,13 @@ void BackgroundSync::_produce() {
return;
}
+ Seconds blacklistDuration(60);
if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
// This is bad because it means that our source
// has not returned oplog entries in ascending ts order, and they need to be.
LOGV2_WARNING(
- 21120, "{error}", "Fetcher returned error", "error"_attr = redact(fetcherReturnStatus));
+ 21120, "Oplog fetcher returned error", "error"_attr = redact(fetcherReturnStatus));
// Do not blacklist the server here, it will be blacklisted when we try to reuse it,
// if it can't return a matching oplog start from the last fetch oplog ts field.
return;
@@ -581,19 +582,27 @@ void BackgroundSync::_produce() {
mongo::sleepmillis(100);
}
}
+ } else if (fetcherReturnStatus.code() == ErrorCodes::TooStaleToSyncFromSource) {
+ LOGV2_WARNING(
+ 2806800,
+ "Oplog fetcher discovered we are too stale to sync from sync source. Blacklisting "
+ "sync source",
+ "syncSource"_attr = source,
+ "blacklistDuration"_attr = blacklistDuration);
+ _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
} else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
- Seconds blacklistDuration(60);
- LOGV2_WARNING(21121,
- "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
- "{syncSource} for {blacklistDuration}.",
- "Fetcher got invalid BSON while querying oplog. Blacklisting sync source",
- "syncSource"_attr = source,
- "blacklistDuration"_attr = blacklistDuration);
+ LOGV2_WARNING(
+ 21121,
+ "Oplog fetcher got invalid BSON while querying oplog. Blacklisting sync source "
+ "{syncSource} for {blacklistDuration}.",
+ "Oplog fetcher got invalid BSON while querying oplog. Blacklisting sync source",
+ "syncSource"_attr = source,
+ "blacklistDuration"_attr = blacklistDuration);
_replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
} else if (!fetcherReturnStatus.isOK()) {
LOGV2_WARNING(21122,
- "Fetcher stopped querying remote oplog with error: {error}",
- "Fetcher stopped querying remote oplog with error",
+ "Oplog fetcher stopped querying remote oplog with error: {error}",
+ "Oplog fetcher stopped querying remote oplog with error",
"error"_attr = redact(fetcherReturnStatus));
}
}
diff --git a/src/mongo/db/repl/oplog_fetcher.cpp b/src/mongo/db/repl/oplog_fetcher.cpp
index 7426a544de1..37cc0d9d4cd 100644
--- a/src/mongo/db/repl/oplog_fetcher.cpp
+++ b/src/mongo/db/repl/oplog_fetcher.cpp
@@ -113,105 +113,6 @@ Milliseconds calculateAwaitDataTimeout(const ReplSetConfig& config) {
// We never wait longer than 30 seconds.
return std::min((config.getElectionTimeoutPeriod() / 2), maximumAwaitDataTimeoutMS);
}
-
-/**
- * Checks the first batch of results from query.
- * 'documents' are the first batch of results returned from tailing the remote oplog.
- * 'lastFetched' optime should be consistent with the predicate in the query.
- * 'remoteLastOpApplied' is the last OpTime applied on the sync source. This is optional for
- * compatibility with 3.4 servers that do not send OplogQueryMetadata.
- * 'requiredRBID' is a RollbackID received when we chose the sync source that we use here to
- * guarantee we have not rolled back since we confirmed the sync source had our minValid.
- * 'remoteRBID' is a RollbackId for the sync source returned in this oplog query. This is optional
- * for compatibility with 3.4 servers that do not send OplogQueryMetadata.
- * 'requireFresherSyncSource' is a boolean indicating whether we should require the sync source's
- * oplog to be ahead of ours. If false, the sync source's oplog is allowed to be at the same point
- * as ours, but still cannot be behind ours.
- *
- * Returns OplogStartMissing if we cannot find the optime of the last fetched operation in
- * the remote oplog.
- */
-Status checkRemoteOplogStart(const OplogFetcher::Documents& documents,
- OpTime lastFetched,
- OpTime remoteLastOpApplied,
- int requiredRBID,
- int remoteRBID,
- bool requireFresherSyncSource) {
- // Once we establish our cursor, we need to ensure that our upstream node hasn't rolled back
- // since that could cause it to not have our required minValid point. The cursor will be
- // killed if the upstream node rolls back so we don't need to keep checking once the cursor
- // is established.
- if (remoteRBID != requiredRBID) {
- return Status(ErrorCodes::InvalidSyncSource,
- "Upstream node rolled back after choosing it as a sync source. Choosing "
- "new sync source.");
- }
-
- // Sometimes our remoteLastOpApplied may be stale; if we received a document with an
- // opTime later than remoteLastApplied, we can assume the remote is at least up to that
- // opTime.
- if (!documents.empty()) {
- const auto docOpTime = OpTime::parseFromOplogEntry(documents.back());
- if (docOpTime.isOK()) {
- remoteLastOpApplied = std::max(remoteLastOpApplied, docOpTime.getValue());
- }
- }
-
- // The sync source could be behind us if it rolled back after we selected it. We could have
- // failed to detect the rollback if it occurred between sync source selection (when we check the
- // candidate is ahead of us) and sync source resolution (when we got 'requiredRBID'). If the
- // sync source is now behind us, choose a new sync source to prevent going into rollback.
- if (remoteLastOpApplied < lastFetched) {
- return Status(ErrorCodes::InvalidSyncSource,
- str::stream()
- << "Sync source's last applied OpTime " << remoteLastOpApplied.toString()
- << " is older than our last fetched OpTime " << lastFetched.toString()
- << ". Choosing new sync source.");
- }
-
- // If 'requireFresherSyncSource' is true, we must check that the sync source's
- // lastApplied is ahead of us to prevent forming a cycle. Although we check for
- // this condition in sync source selection, if an undetected rollback occurred between sync
- // source selection and sync source resolution, this condition may no longer hold.
- // 'requireFresherSyncSource' is false for initial sync, since no other node can sync off an
- // initial syncing node, so we do not need to check for cycles. In addition, it would be
- // problematic to check this condition for initial sync, since the 'lastFetched' OpTime will
- // almost always equal the 'remoteLastApplied', since we fetch the sync source's last applied
- // OpTime to determine where to start our OplogFetcher.
- if (requireFresherSyncSource && remoteLastOpApplied <= lastFetched) {
- return Status(ErrorCodes::InvalidSyncSource,
- str::stream()
- << "Sync source must be ahead of me. My last fetched oplog optime: "
- << lastFetched.toString() << ", latest oplog optime of sync source: "
- << remoteLastOpApplied.toString());
- }
-
- // At this point we know that our sync source has our minValid and is not behind us, so if our
- // history diverges from our sync source's we should prefer its history and roll back ours.
-
- // Since we checked for rollback and our sync source is ahead of us, an empty batch means that
- // we have a higher timestamp on our last fetched OpTime than our sync source's last applied
- // OpTime, but a lower term. When this occurs, we must roll back our inconsistent oplog entry.
- if (documents.empty()) {
- return Status(ErrorCodes::OplogStartMissing, "Received an empty batch from sync source.");
- }
-
- const auto& o = documents.front();
- auto opTimeResult = OpTime::parseFromOplogEntry(o);
- if (!opTimeResult.isOK()) {
- return Status(ErrorCodes::InvalidBSON,
- str::stream() << "our last optime fetched: " << lastFetched.toString()
- << ". failed to parse optime from first oplog on source: "
- << o.toString() << ": " << opTimeResult.getStatus().toString());
- }
- auto opTime = opTimeResult.getValue();
- if (opTime != lastFetched) {
- std::string message = str::stream() << "Our last optime fetched: " << lastFetched.toString()
- << ". source's GTE: " << opTime.toString();
- return Status(ErrorCodes::OplogStartMissing, message);
- }
- return Status::OK();
-}
} // namespace
@@ -749,16 +650,9 @@ Status OplogFetcher::_onSuccessfulBatch(const Documents& documents) {
}
auto oqMetadata = oqMetadataResult.getValue();
- // This lastFetched value is the last OpTime from the previous batch.
- auto lastFetched = _getLastOpTimeFetched();
-
if (_firstBatch) {
- auto status = checkRemoteOplogStart(documents,
- lastFetched,
- oqMetadata.getLastOpApplied(),
- _requiredRBID,
- oqMetadata.getRBID(),
- _requireFresherSyncSource);
+ auto status =
+ _checkRemoteOplogStart(documents, oqMetadata.getLastOpApplied(), oqMetadata.getRBID());
if (!status.isOK()) {
// Stop oplog fetcher and execute rollback if necessary.
return status;
@@ -785,6 +679,9 @@ Status OplogFetcher::_onSuccessfulBatch(const Documents& documents) {
}
}
+ // This lastFetched value is the last OpTime from the previous batch.
+ auto lastFetched = _getLastOpTimeFetched();
+
auto validateResult = OplogFetcher::validateDocuments(
documents, _firstBatch, lastFetched.getTimestamp(), _startingPoint);
if (!validateResult.isOK()) {
@@ -862,6 +759,142 @@ Status OplogFetcher::_onSuccessfulBatch(const Documents& documents) {
return Status::OK();
}
+Status OplogFetcher::_checkRemoteOplogStart(const OplogFetcher::Documents& documents,
+ OpTime remoteLastOpApplied,
+ int remoteRBID) {
+ using namespace fmt::literals;
+
+ // Once we establish our cursor, we need to ensure that our upstream node hasn't rolled back
+ // since that could cause it to not have our required minValid point. The cursor will be
+ // killed if the upstream node rolls back so we don't need to keep checking once the cursor
+ // is established.
+ if (remoteRBID != _requiredRBID) {
+ return Status(ErrorCodes::InvalidSyncSource,
+ "Upstream node rolled back after choosing it as a sync source. Choosing "
+ "new sync source.");
+ }
+
+ // Sometimes our remoteLastOpApplied may be stale; if we received a document with an
+ // opTime later than remoteLastApplied, we can assume the remote is at least up to that
+ // opTime.
+ if (!documents.empty()) {
+ const auto docOpTime = OpTime::parseFromOplogEntry(documents.back());
+ if (docOpTime.isOK()) {
+ remoteLastOpApplied = std::max(remoteLastOpApplied, docOpTime.getValue());
+ }
+ }
+
+ auto lastFetched = _getLastOpTimeFetched();
+
+ // The sync source could be behind us if it rolled back after we selected it. We could have
+ // failed to detect the rollback if it occurred between sync source selection (when we check the
+ // candidate is ahead of us) and sync source resolution (when we got '_requiredRBID'). If the
+ // sync source is now behind us, choose a new sync source to prevent going into rollback.
+ if (remoteLastOpApplied < lastFetched) {
+ return Status(ErrorCodes::InvalidSyncSource,
+ "Sync source's last applied OpTime {} is older than our last fetched OpTime "
+ "{}. Choosing new sync source."_format(remoteLastOpApplied.toString(),
+ lastFetched.toString()));
+ }
+
+ // If '_requireFresherSyncSource' is true, we must check that the sync source's
+ // lastApplied is ahead of us to prevent forming a cycle. Although we check for
+ // this condition in sync source selection, if an undetected rollback occurred between sync
+ // source selection and sync source resolution, this condition may no longer hold.
+ // '_requireFresherSyncSource' is false for initial sync, since no other node can sync off an
+ // initial syncing node, so we do not need to check for cycles. In addition, it would be
+ // problematic to check this condition for initial sync, since the 'lastFetched' OpTime will
+ // almost always equal the 'remoteLastApplied', since we fetch the sync source's last applied
+ // OpTime to determine where to start our OplogFetcher.
+ if (_requireFresherSyncSource && remoteLastOpApplied <= lastFetched) {
+ return Status(ErrorCodes::InvalidSyncSource,
+ "Sync source must be ahead of me. My last fetched oplog optime: {}, latest "
+ "oplog optime of sync source: {}"_format(lastFetched.toString(),
+ remoteLastOpApplied.toString()));
+ }
+
+ // At this point we know that our sync source has our minValid and is not behind us, so if our
+ // history diverges from our sync source's we should prefer its history and roll back ours.
+
+ // Since we checked for rollback and our sync source is ahead of us, an empty batch means that
+ // we have a higher timestamp on our last fetched OpTime than our sync source's last applied
+ // OpTime, but a lower term. When this occurs, we must roll back our inconsistent oplog entry.
+ if (documents.empty()) {
+ return Status(ErrorCodes::OplogStartMissing, "Received an empty batch from sync source.");
+ }
+
+ const auto& o = documents.front();
+ auto opTimeResult = OpTime::parseFromOplogEntry(o);
+
+ if (!opTimeResult.isOK()) {
+ return Status(ErrorCodes::InvalidBSON,
+ "our last optime fetched: {}. failed to parse optime from first oplog in "
+ "batch on source: {}: {}"_format(lastFetched.toString(),
+ o.toString(),
+ opTimeResult.getStatus().toString()));
+ }
+ auto opTime = opTimeResult.getValue();
+ if (opTime != lastFetched) {
+ Status status = _checkTooStaleToSyncFromSource(lastFetched, opTime);
+
+ // We should never return an OK status here.
+ invariant(!status.isOK());
+ return status;
+ }
+ return Status::OK();
+}
+
+Status OplogFetcher::_checkTooStaleToSyncFromSource(const OpTime lastFetched,
+ const OpTime firstOpTimeInDocument) {
+ // Check to see if the sync source's first oplog entry is later than 'lastFetched'. If it is, we
+ // are too stale to sync from this node. If it isn't, we should go into rollback instead.
+ BSONObj remoteFirstOplogEntry;
+ try {
+ // Query for the first oplog entry in the sync source's oplog.
+ auto query = Query().sort(BSON("$natural" << 1));
+ // Since this function is called after the first batch, the exhaust stream has not been
+ // started yet. As a result, using the same connection is safe.
+ remoteFirstOplogEntry = _conn->findOne(_nss.ns(), query);
+ } catch (DBException& e) {
+ // If an error occurs with the query, throw an error.
+ return Status(ErrorCodes::TooStaleToSyncFromSource, e.reason());
+ }
+
+ using namespace fmt::literals;
+
+ StatusWith<OpTime> remoteFirstOpTimeResult = OpTime::parseFromOplogEntry(remoteFirstOplogEntry);
+ if (!remoteFirstOpTimeResult.isOK()) {
+ return Status(
+ ErrorCodes::InvalidBSON,
+ "failed to parse optime from first entry in source's oplog: {}: {}"_format(
+ remoteFirstOplogEntry.toString(), remoteFirstOpTimeResult.getStatus().toString()));
+ }
+
+ auto remoteFirstOpTime = remoteFirstOpTimeResult.getValue();
+ if (remoteFirstOpTime.isNull()) {
+ return Status(ErrorCodes::InvalidBSON,
+ "optime of first entry in source's oplog cannot be null: {}"_format(
+ remoteFirstOplogEntry.toString()));
+ }
+
+ // remoteFirstOpTime may come from a very old config, so we cannot compare their terms.
+ if (lastFetched.getTimestamp() < remoteFirstOpTime.getTimestamp()) {
+ // We are too stale to sync from our current sync source.
+ return Status(ErrorCodes::TooStaleToSyncFromSource,
+ "we are too stale to sync from the sync source's oplog. our last fetched "
+ "timestamp is earlier than the sync source's first timestamp. our last "
+ "optime fetched: {}. sync source's first optime: {}"_format(
+ lastFetched.toString(), remoteFirstOpTime.toString()));
+ }
+
+ // If we are not too stale to sync from the source, we should go into rollback.
+ std::string message =
+ "the sync source's oplog and our oplog have diverged, going into rollback. our last optime "
+ "fetched: {}. source's GTE: {}"_format(lastFetched.toString(),
+ firstOpTimeInDocument.toString());
+ return Status(ErrorCodes::OplogStartMissing, message);
+}
+
bool OplogFetcher::OplogFetcherRestartDecisionDefault::shouldContinue(OplogFetcher* fetcher,
Status status) {
if (_numRestarts == _maxRestarts) {
diff --git a/src/mongo/db/repl/oplog_fetcher.h b/src/mongo/db/repl/oplog_fetcher.h
index 7d66b38c109..0292dc5e7b0 100644
--- a/src/mongo/db/repl/oplog_fetcher.h
+++ b/src/mongo/db/repl/oplog_fetcher.h
@@ -346,6 +346,28 @@ private:
*/
Milliseconds _getRetriedFindMaxTime() const;
+ /**
+ * Checks the first batch of results from query.
+ * 'documents' are the first batch of results returned from tailing the remote oplog.
+ * 'remoteLastOpApplied' is the last OpTime applied on the sync source.
+ * 'remoteRBID' is a RollbackId for the sync source returned in this oplog query. This is
+ * optional for compatibility with 3.4 servers that do not send OplogQueryMetadata.
+ *
+ * Returns TooStaleToSyncFromSource if we are too stale to sync from our source.
+ * Returns OplogStartMissing if we should go into rollback.
+ */
+ Status _checkRemoteOplogStart(const OplogFetcher::Documents& documents,
+ OpTime remoteLastOpApplied,
+ int remoteRBID);
+
+ /**
+ * Distinguishes between needing to rollback and being too stale to sync from our sync source.
+ * This will be called when we check the first batch of results and our last fetched optime does
+ * not equal the first document in that batch. This function should never return Status::OK().
+ */
+ Status _checkTooStaleToSyncFromSource(const OpTime lastFetched,
+ const OpTime firstOpTimeInDocument);
+
// Protects member data of this OplogFetcher.
mutable Mutex _mutex = MONGO_MAKE_LATCH("OplogFetcher::_mutex");
diff --git a/src/mongo/db/repl/oplog_fetcher_test.cpp b/src/mongo/db/repl/oplog_fetcher_test.cpp
index 170e506a5d6..bb22b7537fe 100644
--- a/src/mongo/db/repl/oplog_fetcher_test.cpp
+++ b/src/mongo/db/repl/oplog_fetcher_test.cpp
@@ -951,12 +951,106 @@ TEST_F(OplogFetcherTest, MetadataIsNotProcessedOnBatchThatTriggersRollback) {
auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
auto entry = makeNoopOplogEntry(Seconds(456));
+ // Set the remote node's first oplog entry to equal to lastFetched.
+ auto remoteFirstOplogEntry = makeNoopOplogEntry(lastFetched);
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
ASSERT_EQUALS(
ErrorCodes::OplogStartMissing,
processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+
ASSERT_FALSE(dataReplicatorExternalState->metadataWasProcessed);
}
+TEST_F(OplogFetcherTest, TooStaleToSyncFromSyncSource) {
+ CursorId cursorId = 22LL;
+ auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+ auto entry = makeNoopOplogEntry(Seconds(456));
+
+ // Set the remote node's first oplog entry to be later than lastFetched, so we have fallen off
+ // the sync source's oplog.
+ auto remoteFirstOplogEntry = makeNoopOplogEntry(Seconds(200));
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
+ ASSERT_EQUALS(
+ ErrorCodes::TooStaleToSyncFromSource,
+ processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+}
+
+TEST_F(OplogFetcherTest, NotTooStaleShouldReturnOplogStartMissing) {
+ CursorId cursorId = 22LL;
+ auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+ auto entry = makeNoopOplogEntry(Seconds(456));
+
+ // Set the remote node's first oplog entry to be earlier than lastFetched, so we have not fallen
+ // off the sync source's oplog.
+ auto remoteFirstOplogEntry = makeNoopOplogEntry(Seconds(1));
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
+ ASSERT_EQUALS(
+ ErrorCodes::OplogStartMissing,
+ processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+}
+
+TEST_F(OplogFetcherTest, BadRemoteFirstOplogEntryReturnsInvalidBSON) {
+ CursorId cursorId = 22LL;
+ auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+ auto entry = makeNoopOplogEntry(Seconds(456));
+
+ // Set the remote node's first oplog entry to be an invalid BSON.
+ auto remoteFirstOplogEntry = BSON("ok" << false);
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
+ ASSERT_EQUALS(
+ ErrorCodes::InvalidBSON,
+ processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+}
+
+TEST_F(OplogFetcherTest, EmptyRemoteFirstOplogEntryReturnsInvalidBSON) {
+ CursorId cursorId = 22LL;
+ auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+ auto entry = makeNoopOplogEntry(Seconds(456));
+
+ // Set the remote node's first oplog entry to be an empty BSON.
+ auto remoteFirstOplogEntry = BSONObj();
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
+ ASSERT_EQUALS(
+ ErrorCodes::InvalidBSON,
+ processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+}
+
+TEST_F(OplogFetcherTest, RemoteFirstOplogEntryWithNullTimestampReturnsInvalidBSON) {
+ CursorId cursorId = 22LL;
+ auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+ auto entry = makeNoopOplogEntry(Seconds(456));
+
+ // Set the remote node's first oplog entry to have a null timestamp.
+ auto remoteFirstOplogEntry = makeNoopOplogEntry(Seconds(0));
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
+ ASSERT_EQUALS(
+ ErrorCodes::InvalidBSON,
+ processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+}
+
+TEST_F(OplogFetcherTest, RemoteFirstOplogEntryWithExtraFieldsReturnsOplogStartMissing) {
+ CursorId cursorId = 22LL;
+ auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+ auto entry = makeNoopOplogEntry(Seconds(456));
+
+ // Set the remote node's first oplog entry to include extra fields.
+ auto remoteFirstOplogEntry = BSON("ts" << Timestamp(1, 0) << "t" << 1 << "extra"
+ << "field");
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
+ // We should have parsed the OpTime correctly and realized that we have not fallen off the sync
+ // source's oplog, so we should go into rollback.
+ ASSERT_EQUALS(
+ ErrorCodes::OplogStartMissing,
+ processSingleBatch(makeFirstBatch(cursorId, {entry}, {metadataObj}))->getStatus());
+}
+
TEST_F(OplogFetcherTest, FailingInitialCreateNewCursorNoRetriesShutsDownOplogFetcher) {
ASSERT_EQUALS(ErrorCodes::InvalidSyncSource, processSingleBatch(Message())->getStatus());
}
@@ -1457,6 +1551,11 @@ TEST_F(
CursorId cursorId = 22LL;
auto entry = makeNoopOplogEntry({{Seconds(456), 0}, lastFetched.getTerm()});
auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oqMetadata);
+
+ // Set the remote node's first oplog entry to equal to lastFetched.
+ auto remoteFirstOplogEntry = makeNoopOplogEntry(lastFetched);
+ _mockServer->insert(nss.ns(), remoteFirstOplogEntry);
+
ASSERT_EQUALS(ErrorCodes::OplogStartMissing,
processSingleBatch(makeFirstBatch(cursorId, {entry}, metadataObj))->getStatus());
}
diff --git a/src/mongo/db/repl/sync_source_resolver.cpp b/src/mongo/db/repl/sync_source_resolver.cpp
index 84cb3183933..4b4a9a178a2 100644
--- a/src/mongo/db/repl/sync_source_resolver.cpp
+++ b/src/mongo/db/repl/sync_source_resolver.cpp
@@ -342,7 +342,7 @@ void SyncSourceResolver::_firstOplogEntryFetcherCallback(
if (_lastOpTimeFetched.getTimestamp() < remoteEarliestOpTime.getTimestamp()) {
// We're too stale to use this sync source.
const auto blacklistDuration = kTooStaleBlacklistDuration;
- const auto until = _taskExecutor->now() + Minutes(1);
+ const auto until = _taskExecutor->now() + blacklistDuration;
LOGV2(21771,
"We are too stale to use {candidate} as a sync source. Blacklisting this sync source "
@@ -553,7 +553,7 @@ Status SyncSourceResolver::_chooseAndProbeNextSyncSource(OpTime earliestOpTimeSe
}
SyncSourceResolverResponse response;
- response.syncSourceStatus = {ErrorCodes::OplogStartMissing, "too stale to catch up"};
+ response.syncSourceStatus = {ErrorCodes::TooStaleToSyncFromSource, "too stale to catch up"};
response.earliestOpTimeSeen = earliestOpTimeSeen;
return _finishCallback(response);
}
diff --git a/src/mongo/db/repl/sync_source_resolver_test.cpp b/src/mongo/db/repl/sync_source_resolver_test.cpp
index 6366392bd3a..1e4380d2fca 100644
--- a/src/mongo/db/repl/sync_source_resolver_test.cpp
+++ b/src/mongo/db/repl/sync_source_resolver_test.cpp
@@ -428,7 +428,7 @@ TEST_F(SyncSourceResolverTest,
}
TEST_F(SyncSourceResolverTest,
- SyncSourceResolverReturnsOplogStartMissingAndEarliestOpTimeAvailableWhenAllSourcesTooFresh) {
+ SyncSourceResolverReturnsTooStaleAndEarliestOpTimeAvailableWhenAllSourcesTooFresh) {
HostAndPort candidate1("node1", 12345);
HostAndPort candidate2("node2", 12345);
HostAndPort candidate3("node3", 12345);
@@ -448,7 +448,7 @@ TEST_F(SyncSourceResolverTest,
_resolver->join();
ASSERT_FALSE(_resolver->isActive());
- ASSERT_EQUALS(ErrorCodes::OplogStartMissing, _response.syncSourceStatus);
+ ASSERT_EQUALS(ErrorCodes::TooStaleToSyncFromSource, _response.syncSourceStatus);
ASSERT_EQUALS(Timestamp(200, 2), _response.earliestOpTimeSeen.getTimestamp());
}