summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/mongo/db/repl/bgsync.cpp18
-rw-r--r--src/mongo/db/repl/data_replicator_external_state_impl.cpp7
-rw-r--r--src/mongo/db/repl/oplog_fetcher.cpp4
-rw-r--r--src/mongo/db/repl/sync_source_feedback.cpp13
-rw-r--r--src/mongo/db/repl/sync_source_resolver.cpp8
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.cpp33
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl_test.cpp10
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp10
8 files changed, 69 insertions, 34 deletions
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index b79b9734321..d5fdd001921 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -417,8 +417,9 @@ void BackgroundSync::_produce(OperationContext* txn) {
fassertFailedWithStatus(34440, exceptionToStatus());
}
- LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
- << oplogFetcher->getCommandObject_forTest()["filter"];
+ const auto logLevel = Command::testCommandsEnabled ? 0 : 1;
+ LOG(logLevel) << "scheduling fetcher to read remote oplog on " << _syncSourceHost
+ << " starting at " << oplogFetcher->getCommandObject_forTest()["filter"];
auto scheduleStatus = oplogFetcher->startup();
if (!scheduleStatus.isOK()) {
warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
@@ -731,31 +732,32 @@ long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) {
bool BackgroundSync::shouldStopFetching() const {
if (inShutdown()) {
- LOG(2) << "Interrupted by shutdown while checking sync source.";
+ LOG(2) << "Stopping oplog fetcher due to shutdown.";
return true;
}
// If we are transitioning to primary state, we need to stop fetching in order to go into
// bgsync-stop mode.
if (_replCoord->isWaitingForApplierToDrain()) {
- LOG(2) << "Interrupted by waiting for applier to drain while checking sync source.";
+ LOG(2) << "Stopping oplog fetcher because we are waiting for the applier to drain.";
return true;
}
if (_replCoord->getMemberState().primary() && !_replCoord->isCatchingUp()) {
- LOG(2) << "Interrupted by becoming primary while checking sync source.";
+ LOG(2) << "Stopping oplog fetcher because we are primary.";
return true;
}
// Check if we have been stopped.
if (isStopped()) {
- LOG(2) << "Interrupted by a stop request while checking sync source.";
+ LOG(2) << "Stopping oplog fetcher due to stop request.";
return true;
}
- // Check current sync target.
+ // Check current sync source.
if (getSyncTarget().empty()) {
- LOG(1) << "Canceling oplog query because we have no valid sync source.";
+ LOG(1) << "Stopping oplog fetcher; canceling oplog query because we have no valid sync "
+ "source.";
return true;
}
diff --git a/src/mongo/db/repl/data_replicator_external_state_impl.cpp b/src/mongo/db/repl/data_replicator_external_state_impl.cpp
index e17a650d89e..418d1de7455 100644
--- a/src/mongo/db/repl/data_replicator_external_state_impl.cpp
+++ b/src/mongo/db/repl/data_replicator_external_state_impl.cpp
@@ -71,9 +71,10 @@ bool DataReplicatorExternalStateImpl::shouldStopFetching(const HostAndPort& sour
const rpc::ReplSetMetadata& metadata) {
// Re-evaluate quality of sync target.
if (_replicationCoordinator->shouldChangeSyncSource(source, metadata)) {
- LOG(1) << "Canceling oplog query because we have to choose a sync source. Current source: "
- << source << ", OpTime " << metadata.getLastOpVisible()
- << ", its sync source index:" << metadata.getSyncSourceIndex();
+ log()
+ << "Canceling oplog query because we have to choose a new sync source. Current source: "
+ << source << ", OpTime " << metadata.getLastOpVisible()
+ << ", its sync source index:" << metadata.getSyncSourceIndex();
return true;
}
return false;
diff --git a/src/mongo/db/repl/oplog_fetcher.cpp b/src/mongo/db/repl/oplog_fetcher.cpp
index ac683e6d616..2893a9099d6 100644
--- a/src/mongo/db/repl/oplog_fetcher.cpp
+++ b/src/mongo/db/repl/oplog_fetcher.cpp
@@ -510,8 +510,10 @@ void OplogFetcher::_callback(const Fetcher::QueryResponseStatus& result,
if (_dataReplicatorExternalState->shouldStopFetching(_fetcher->getSource(), metadata)) {
_finishCallback(Status(ErrorCodes::InvalidSyncSource,
str::stream() << "sync source " << _fetcher->getSource().toString()
- << " (last optime: "
+ << " (last visible optime: "
<< metadata.getLastOpVisible().toString()
+ << "; config version: "
+ << metadata.getConfigVersion()
<< "; sync source index: "
<< metadata.getSyncSourceIndex()
<< "; primary index: "
diff --git a/src/mongo/db/repl/sync_source_feedback.cpp b/src/mongo/db/repl/sync_source_feedback.cpp
index 89fff3c387f..96ac055f792 100644
--- a/src/mongo/db/repl/sync_source_feedback.cpp
+++ b/src/mongo/db/repl/sync_source_feedback.cpp
@@ -72,8 +72,17 @@ Reporter::PrepareReplSetUpdatePositionCommandFn makePrepareReplSetUpdatePosition
commandStyle) -> StatusWith<BSONObj> {
auto currentSyncTarget = bgsync->getSyncTarget();
if (currentSyncTarget != syncTarget) {
- // Change in sync target
- return Status(ErrorCodes::InvalidSyncSource, "Sync target is no longer valid");
+ if (currentSyncTarget.empty()) {
+ // Sync source was cleared.
+ return Status(ErrorCodes::InvalidSyncSource,
+ str::stream() << "Sync source was cleared. Was " << syncTarget);
+
+ } else {
+ // Sync source changed.
+ return Status(ErrorCodes::InvalidSyncSource,
+ str::stream() << "Sync source changed from " << syncTarget << " to "
+ << currentSyncTarget);
+ }
}
stdx::lock_guard<stdx::mutex> lock(mtx);
diff --git a/src/mongo/db/repl/sync_source_resolver.cpp b/src/mongo/db/repl/sync_source_resolver.cpp
index 39ec824ebac..881206a7d5e 100644
--- a/src/mongo/db/repl/sync_source_resolver.cpp
+++ b/src/mongo/db/repl/sync_source_resolver.cpp
@@ -214,7 +214,7 @@ OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candida
if (queryResponse.documents.empty()) {
// Remote oplog is empty.
const auto until = _taskExecutor->now() + kOplogEmptyBlacklistDuration;
- log() << "Blacklisting due to empty oplog on host " << candidate << " for "
+ log() << "Blacklisting " << candidate << " due to empty oplog for "
<< kOplogEmptyBlacklistDuration << " until: " << until;
_syncSourceSelector->blacklistSyncSource(candidate, until);
return OpTime();
@@ -224,7 +224,7 @@ OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candida
if (firstObjFound.isEmpty()) {
// First document in remote oplog is empty.
const auto until = _taskExecutor->now() + kFirstOplogEntryEmptyBlacklistDuration;
- log() << "Blacklisting due to empty first document from host " << candidate << " for "
+ log() << "Blacklisting " << candidate << " due to empty first document for "
<< kFirstOplogEntryEmptyBlacklistDuration << " until: " << until;
_syncSourceSelector->blacklistSyncSource(candidate, until);
return OpTime();
@@ -235,8 +235,8 @@ OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candida
if (remoteEarliestOpTime.isNull()) {
// First document in remote oplog is empty.
const auto until = _taskExecutor->now() + kFirstOplogEntryNullTimestampBlacklistDuration;
- log() << "Blacklisting due to null timestamp in first document from host " << candidate
- << " for " << kFirstOplogEntryNullTimestampBlacklistDuration << " until: " << until;
+ log() << "Blacklisting " << candidate << " due to null timestamp in first document for "
+ << kFirstOplogEntryNullTimestampBlacklistDuration << " until: " << until;
_syncSourceSelector->blacklistSyncSource(candidate, until);
return OpTime();
}
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index bc7029a7a4c..f3f00e72488 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -193,7 +193,8 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now,
_syncSource = HostAndPort();
return _syncSource;
} else if (_memberIsBlacklisted(*_currentPrimaryMember(), now)) {
- LOG(1) << "Cannot select primary member as sync source because they are blacklisted:"
+ LOG(1) << "Cannot select a sync source because chaining is not allowed and primary "
+ "member is blacklisted: "
<< _currentPrimaryMember()->getHostAndPort();
_syncSource = HostAndPort();
return _syncSource;
@@ -267,11 +268,11 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now,
continue;
}
- // Things to skip on the first attempt.
+ // On the first attempt, we skip candidates that do not match these criteria.
if (attempts == 0) {
// Candidate must be a voter if we are a voter.
if (_selfConfig().isVoter() && !itMemberConfig.isVoter()) {
- LOG(2) << "Cannot select sync source because of voting differences: "
+ LOG(2) << "Cannot select sync source because we are a voter and it is not: "
<< itMemberConfig.getHostAndPort();
continue;
}
@@ -2419,17 +2420,25 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS
// If the user requested a sync source change, return true.
if (_forceSyncSourceIndex != -1) {
+ log() << "Choosing new sync source because the user has requested to use "
+ << _rsConfig.getMemberAt(_forceSyncSourceIndex).getHostAndPort()
+ << " as a sync source";
return true;
}
if (_rsConfig.getProtocolVersion() == 1 &&
metadata.getConfigVersion() != _rsConfig.getConfigVersion()) {
+ log() << "Choosing new sync source because the config version supplied by " << currentSource
+ << ", " << metadata.getConfigVersion() << ", does not match ours, "
+ << _rsConfig.getConfigVersion();
return true;
}
const int currentSourceIndex = _rsConfig.findMemberIndexByHostAndPort(currentSource);
// PV0 doesn't use metadata, we have to consult _rsConfig.
if (currentSourceIndex == -1) {
+ log() << "Choosing new sync source because " << currentSource.toString()
+ << " is not in our config";
return true;
}
@@ -2448,6 +2457,18 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS
// unless they are primary.
if (_rsConfig.getProtocolVersion() == 1 && metadata.getSyncSourceIndex() == -1 &&
currentSourceOpTime <= myLastOpTime && metadata.getPrimaryIndex() != currentSourceIndex) {
+ std::stringstream logMessage;
+ logMessage << "Choosing new sync source because our current sync source, "
+ << currentSource.toString() << ", has an OpTime (" << currentSourceOpTime
+ << ") which is not ahead of ours (" << myLastOpTime
+ << "), it does not have a sync source, and it's not the primary";
+ if (metadata.getPrimaryIndex() >= 0) {
+ logMessage << " (" << _rsConfig.getMemberAt(metadata.getPrimaryIndex()).getHostAndPort()
+ << " is)";
+ } else {
+ logMessage << " (sync source does not know the primary)";
+ }
+ log() << logMessage.str();
return true;
}
@@ -2462,9 +2483,9 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS
(candidateConfig.shouldBuildIndexes() || !_selfConfig().shouldBuildIndexes()) &&
it->getState().readable() && !_memberIsBlacklisted(candidateConfig, now) &&
goalSecs < it->getAppliedOpTime().getSecs()) {
- log() << "re-evaluating sync source because our current sync source's most recent "
- << "OpTime is " << currentSourceOpTime.toString() << " which is more than "
- << _options.maxSyncSourceLagSecs << " behind member "
+ log() << "Choosing new sync source because the most recent OpTime of our sync source, "
+ << currentSource << ", is " << currentSourceOpTime.toString()
+ << " which is more than " << _options.maxSyncSourceLagSecs << " behind member "
<< candidateConfig.getHostAndPort().toString() << " whose most recent OpTime is "
<< it->getAppliedOpTime().toString();
invariant(itIndex != _selfIndex);
diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp
index d0281db8217..791167d2f3a 100644
--- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp
@@ -5392,7 +5392,7 @@ TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenNodeIsFreshByHeartbea
ASSERT_FALSE(
getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), OpTime(), metadata, now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenNodeIsStaleByHeartbeatButNotMetadata) {
@@ -5426,7 +5426,7 @@ TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenNodeIsStaleByHeartbea
ASSERT_FALSE(
getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), OpTime(), metadata, now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceWhenFresherMemberExists) {
@@ -5458,7 +5458,7 @@ TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceWhenFresherMemberExists) {
ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhileFresherMemberIsBlackListed) {
@@ -5503,7 +5503,7 @@ TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhileFresherMemberIsBlack
ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenFresherMemberIsDown) {
@@ -5663,7 +5663,7 @@ TEST_F(HeartbeatResponseTest,
ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(TopoCoordTest, ShouldNotStandForElectionWhileAwareOfPrimary) {
diff --git a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp
index 3cb7f7ec038..1a627398e82 100644
--- a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp
@@ -3232,7 +3232,7 @@ TEST_F(HeartbeatResponseTestV1,
ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTestV1, NodeReturnsBadValueWhenProcessingPV0ElectionCommandsInPV1) {
@@ -3391,7 +3391,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceWhileFresherMemberIsBla
ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsFreshByHeartbeatButNotMetadata) {
@@ -3424,7 +3424,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsFreshByHeartbea
ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsStaleByHeartbeatButNotMetadata) {
@@ -3457,7 +3457,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsStaleByHeartbea
ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(fresherLastOpTimeApplied), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTestV1, ShouldChangeSyncSourceWhenFresherMemberExists) {
@@ -3489,7 +3489,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldChangeSyncSourceWhenFresherMemberExists) {
ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(
HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now()));
stopCapturingLogMessages();
- ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source"));
+ ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source"));
}
TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceWhenMemberHasYetToHeartbeatUs) {