diff options
-rw-r--r-- | src/mongo/db/repl/bgsync.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/repl/data_replicator_external_state_impl.cpp | 7 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog_fetcher.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/sync_source_feedback.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/repl/sync_source_resolver.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.cpp | 33 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl_test.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp | 10 |
8 files changed, 69 insertions, 34 deletions
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index b79b9734321..d5fdd001921 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -417,8 +417,9 @@ void BackgroundSync::_produce(OperationContext* txn) { fassertFailedWithStatus(34440, exceptionToStatus()); } - LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " - << oplogFetcher->getCommandObject_forTest()["filter"]; + const auto logLevel = Command::testCommandsEnabled ? 0 : 1; + LOG(logLevel) << "scheduling fetcher to read remote oplog on " << _syncSourceHost + << " starting at " << oplogFetcher->getCommandObject_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " @@ -731,31 +732,32 @@ long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) { bool BackgroundSync::shouldStopFetching() const { if (inShutdown()) { - LOG(2) << "Interrupted by shutdown while checking sync source."; + LOG(2) << "Stopping oplog fetcher due to shutdown."; return true; } // If we are transitioning to primary state, we need to stop fetching in order to go into // bgsync-stop mode. if (_replCoord->isWaitingForApplierToDrain()) { - LOG(2) << "Interrupted by waiting for applier to drain while checking sync source."; + LOG(2) << "Stopping oplog fetcher because we are waiting for the applier to drain."; return true; } if (_replCoord->getMemberState().primary() && !_replCoord->isCatchingUp()) { - LOG(2) << "Interrupted by becoming primary while checking sync source."; + LOG(2) << "Stopping oplog fetcher because we are primary."; return true; } // Check if we have been stopped. if (isStopped()) { - LOG(2) << "Interrupted by a stop request while checking sync source."; + LOG(2) << "Stopping oplog fetcher due to stop request."; return true; } - // Check current sync target. + // Check current sync source. if (getSyncTarget().empty()) { - LOG(1) << "Canceling oplog query because we have no valid sync source."; + LOG(1) << "Stopping oplog fetcher; canceling oplog query because we have no valid sync " + "source."; return true; } diff --git a/src/mongo/db/repl/data_replicator_external_state_impl.cpp b/src/mongo/db/repl/data_replicator_external_state_impl.cpp index e17a650d89e..418d1de7455 100644 --- a/src/mongo/db/repl/data_replicator_external_state_impl.cpp +++ b/src/mongo/db/repl/data_replicator_external_state_impl.cpp @@ -71,9 +71,10 @@ bool DataReplicatorExternalStateImpl::shouldStopFetching(const HostAndPort& sour const rpc::ReplSetMetadata& metadata) { // Re-evaluate quality of sync target. if (_replicationCoordinator->shouldChangeSyncSource(source, metadata)) { - LOG(1) << "Canceling oplog query because we have to choose a sync source. Current source: " - << source << ", OpTime " << metadata.getLastOpVisible() - << ", its sync source index:" << metadata.getSyncSourceIndex(); + log() + << "Canceling oplog query because we have to choose a new sync source. Current source: " + << source << ", OpTime " << metadata.getLastOpVisible() + << ", its sync source index:" << metadata.getSyncSourceIndex(); return true; } return false; diff --git a/src/mongo/db/repl/oplog_fetcher.cpp b/src/mongo/db/repl/oplog_fetcher.cpp index ac683e6d616..2893a9099d6 100644 --- a/src/mongo/db/repl/oplog_fetcher.cpp +++ b/src/mongo/db/repl/oplog_fetcher.cpp @@ -510,8 +510,10 @@ void OplogFetcher::_callback(const Fetcher::QueryResponseStatus& result, if (_dataReplicatorExternalState->shouldStopFetching(_fetcher->getSource(), metadata)) { _finishCallback(Status(ErrorCodes::InvalidSyncSource, str::stream() << "sync source " << _fetcher->getSource().toString() - << " (last optime: " + << " (last visible optime: " << metadata.getLastOpVisible().toString() + << "; config version: " + << metadata.getConfigVersion() << "; sync source index: " << metadata.getSyncSourceIndex() << "; primary index: " diff --git a/src/mongo/db/repl/sync_source_feedback.cpp b/src/mongo/db/repl/sync_source_feedback.cpp index 89fff3c387f..96ac055f792 100644 --- a/src/mongo/db/repl/sync_source_feedback.cpp +++ b/src/mongo/db/repl/sync_source_feedback.cpp @@ -72,8 +72,17 @@ Reporter::PrepareReplSetUpdatePositionCommandFn makePrepareReplSetUpdatePosition commandStyle) -> StatusWith<BSONObj> { auto currentSyncTarget = bgsync->getSyncTarget(); if (currentSyncTarget != syncTarget) { - // Change in sync target - return Status(ErrorCodes::InvalidSyncSource, "Sync target is no longer valid"); + if (currentSyncTarget.empty()) { + // Sync source was cleared. + return Status(ErrorCodes::InvalidSyncSource, + str::stream() << "Sync source was cleared. Was " << syncTarget); + + } else { + // Sync source changed. + return Status(ErrorCodes::InvalidSyncSource, + str::stream() << "Sync source changed from " << syncTarget << " to " + << currentSyncTarget); + } } stdx::lock_guard<stdx::mutex> lock(mtx); diff --git a/src/mongo/db/repl/sync_source_resolver.cpp b/src/mongo/db/repl/sync_source_resolver.cpp index 39ec824ebac..881206a7d5e 100644 --- a/src/mongo/db/repl/sync_source_resolver.cpp +++ b/src/mongo/db/repl/sync_source_resolver.cpp @@ -214,7 +214,7 @@ OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candida if (queryResponse.documents.empty()) { // Remote oplog is empty. const auto until = _taskExecutor->now() + kOplogEmptyBlacklistDuration; - log() << "Blacklisting due to empty oplog on host " << candidate << " for " + log() << "Blacklisting " << candidate << " due to empty oplog for " << kOplogEmptyBlacklistDuration << " until: " << until; _syncSourceSelector->blacklistSyncSource(candidate, until); return OpTime(); @@ -224,7 +224,7 @@ OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candida if (firstObjFound.isEmpty()) { // First document in remote oplog is empty. const auto until = _taskExecutor->now() + kFirstOplogEntryEmptyBlacklistDuration; - log() << "Blacklisting due to empty first document from host " << candidate << " for " + log() << "Blacklisting " << candidate << " due to empty first document for " << kFirstOplogEntryEmptyBlacklistDuration << " until: " << until; _syncSourceSelector->blacklistSyncSource(candidate, until); return OpTime(); @@ -235,8 +235,8 @@ OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candida if (remoteEarliestOpTime.isNull()) { // First document in remote oplog is empty. const auto until = _taskExecutor->now() + kFirstOplogEntryNullTimestampBlacklistDuration; - log() << "Blacklisting due to null timestamp in first document from host " << candidate - << " for " << kFirstOplogEntryNullTimestampBlacklistDuration << " until: " << until; + log() << "Blacklisting " << candidate << " due to null timestamp in first document for " + << kFirstOplogEntryNullTimestampBlacklistDuration << " until: " << until; _syncSourceSelector->blacklistSyncSource(candidate, until); return OpTime(); } diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index bc7029a7a4c..f3f00e72488 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -193,7 +193,8 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, _syncSource = HostAndPort(); return _syncSource; } else if (_memberIsBlacklisted(*_currentPrimaryMember(), now)) { - LOG(1) << "Cannot select primary member as sync source because they are blacklisted:" + LOG(1) << "Cannot select a sync source because chaining is not allowed and primary " + "member is blacklisted: " << _currentPrimaryMember()->getHostAndPort(); _syncSource = HostAndPort(); return _syncSource; @@ -267,11 +268,11 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, continue; } - // Things to skip on the first attempt. + // On the first attempt, we skip candidates that do not match these criteria. if (attempts == 0) { // Candidate must be a voter if we are a voter. if (_selfConfig().isVoter() && !itMemberConfig.isVoter()) { - LOG(2) << "Cannot select sync source because of voting differences: " + LOG(2) << "Cannot select sync source because we are a voter and it is not: " << itMemberConfig.getHostAndPort(); continue; } @@ -2419,17 +2420,25 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS // If the user requested a sync source change, return true. if (_forceSyncSourceIndex != -1) { + log() << "Choosing new sync source because the user has requested to use " + << _rsConfig.getMemberAt(_forceSyncSourceIndex).getHostAndPort() + << " as a sync source"; return true; } if (_rsConfig.getProtocolVersion() == 1 && metadata.getConfigVersion() != _rsConfig.getConfigVersion()) { + log() << "Choosing new sync source because the config version supplied by " << currentSource + << ", " << metadata.getConfigVersion() << ", does not match ours, " + << _rsConfig.getConfigVersion(); return true; } const int currentSourceIndex = _rsConfig.findMemberIndexByHostAndPort(currentSource); // PV0 doesn't use metadata, we have to consult _rsConfig. if (currentSourceIndex == -1) { + log() << "Choosing new sync source because " << currentSource.toString() + << " is not in our config"; return true; } @@ -2448,6 +2457,18 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS // unless they are primary. if (_rsConfig.getProtocolVersion() == 1 && metadata.getSyncSourceIndex() == -1 && currentSourceOpTime <= myLastOpTime && metadata.getPrimaryIndex() != currentSourceIndex) { + std::stringstream logMessage; + logMessage << "Choosing new sync source because our current sync source, " + << currentSource.toString() << ", has an OpTime (" << currentSourceOpTime + << ") which is not ahead of ours (" << myLastOpTime + << "), it does not have a sync source, and it's not the primary"; + if (metadata.getPrimaryIndex() >= 0) { + logMessage << " (" << _rsConfig.getMemberAt(metadata.getPrimaryIndex()).getHostAndPort() + << " is)"; + } else { + logMessage << " (sync source does not know the primary)"; + } + log() << logMessage.str(); return true; } @@ -2462,9 +2483,9 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS (candidateConfig.shouldBuildIndexes() || !_selfConfig().shouldBuildIndexes()) && it->getState().readable() && !_memberIsBlacklisted(candidateConfig, now) && goalSecs < it->getAppliedOpTime().getSecs()) { - log() << "re-evaluating sync source because our current sync source's most recent " - << "OpTime is " << currentSourceOpTime.toString() << " which is more than " - << _options.maxSyncSourceLagSecs << " behind member " + log() << "Choosing new sync source because the most recent OpTime of our sync source, " + << currentSource << ", is " << currentSourceOpTime.toString() + << " which is more than " << _options.maxSyncSourceLagSecs << " behind member " << candidateConfig.getHostAndPort().toString() << " whose most recent OpTime is " << it->getAppliedOpTime().toString(); invariant(itIndex != _selfIndex); diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp index d0281db8217..791167d2f3a 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp @@ -5392,7 +5392,7 @@ TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenNodeIsFreshByHeartbea ASSERT_FALSE( getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), OpTime(), metadata, now())); stopCapturingLogMessages(); - ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenNodeIsStaleByHeartbeatButNotMetadata) { @@ -5426,7 +5426,7 @@ TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenNodeIsStaleByHeartbea ASSERT_FALSE( getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), OpTime(), metadata, now())); stopCapturingLogMessages(); - ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceWhenFresherMemberExists) { @@ -5458,7 +5458,7 @@ TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceWhenFresherMemberExists) { ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhileFresherMemberIsBlackListed) { @@ -5503,7 +5503,7 @@ TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhileFresherMemberIsBlack ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTest, ShouldNotChangeSyncSourceWhenFresherMemberIsDown) { @@ -5663,7 +5663,7 @@ TEST_F(HeartbeatResponseTest, ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source")); } TEST_F(TopoCoordTest, ShouldNotStandForElectionWhileAwareOfPrimary) { diff --git a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp index 3cb7f7ec038..1a627398e82 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp @@ -3232,7 +3232,7 @@ TEST_F(HeartbeatResponseTestV1, ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTestV1, NodeReturnsBadValueWhenProcessingPV0ElectionCommandsInPV1) { @@ -3391,7 +3391,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceWhileFresherMemberIsBla ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsFreshByHeartbeatButNotMetadata) { @@ -3424,7 +3424,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsFreshByHeartbea ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsStaleByHeartbeatButNotMetadata) { @@ -3457,7 +3457,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceIfNodeIsStaleByHeartbea ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(fresherLastOpTimeApplied), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(0, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(0, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTestV1, ShouldChangeSyncSourceWhenFresherMemberExists) { @@ -3489,7 +3489,7 @@ TEST_F(HeartbeatResponseTestV1, ShouldChangeSyncSourceWhenFresherMemberExists) { ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( HostAndPort("host2"), OpTime(), makeMetadata(lastOpTimeApplied), now())); stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("re-evaluating sync source")); + ASSERT_EQUALS(1, countLogLinesContaining("Choosing new sync source")); } TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceWhenMemberHasYetToHeartbeatUs) { |