From a7bf9b5c81e43595d3548b000f5910852eefe6b3 Mon Sep 17 00:00:00 2001 From: Gabriel Marks Date: Thu, 6 Jan 2022 19:26:05 +0000 Subject: SERVER-59290 Improve criteria for shouldChangeSyncSource --- .../initial_sync_chooses_correct_sync_source.js | 6 +- jstests/replsets/libs/sync_source.js | 11 ++ jstests/replsets/sync_source_changes.js | 73 +++++++++++++ src/mongo/db/repl/topology_coordinator.cpp | 114 ++++++++++++++------- src/mongo/db/repl/topology_coordinator.h | 3 +- 5 files changed, 165 insertions(+), 42 deletions(-) create mode 100644 jstests/replsets/sync_source_changes.js diff --git a/jstests/replsets/initial_sync_chooses_correct_sync_source.js b/jstests/replsets/initial_sync_chooses_correct_sync_source.js index 6e61d6dc123..bc44457e84e 100644 --- a/jstests/replsets/initial_sync_chooses_correct_sync_source.js +++ b/jstests/replsets/initial_sync_chooses_correct_sync_source.js @@ -9,6 +9,7 @@ "use strict"; load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/libs/sync_source.js"); // assertSyncSourceMatchesSoon const waitForHeartbeats = initialSyncNode => { // Hang the node before it undergoes sync source selection. @@ -173,10 +174,7 @@ initialSyncNode.adminCommand( // Once we become secondary, the secondary read preference no longer matters and we choose the // primary because chaining is disallowed. -assert.soon(function() { - let res = assert.commandWorked(initialSyncNode.adminCommand({replSetGetStatus: 1})); - return res.syncSourceHost == primary.host; -}); +assertSyncSourceMatchesSoon(initialSyncNode, primary.host); primary.delayMessagesFrom(initialSyncNode, 0); TestData.skipCollectionAndIndexValidation = false; diff --git a/jstests/replsets/libs/sync_source.js b/jstests/replsets/libs/sync_source.js index 37483912cfe..41be60ec132 100644 --- a/jstests/replsets/libs/sync_source.js +++ b/jstests/replsets/libs/sync_source.js @@ -64,6 +64,17 @@ const forceSyncSource = (rst, node, syncSource) => { return forceSyncSource; }; +/** + * Asserts that the sync source of the given node will match syncSourceName soon. Additional + * arguments are passed to the assert.soon. + */ +const assertSyncSourceMatchesSoon = (node, syncSourceName, ...assertSoonArgs) => { + return assert.soon(() => { + const res = assert.commandWorked(node.adminCommand({replSetGetStatus: 1})); + return res.syncSourceHost === syncSourceName; + }, ...assertSoonArgs); +}; + const DataCenter = class { constructor(name, nodes) { this.name = name; diff --git a/jstests/replsets/sync_source_changes.js b/jstests/replsets/sync_source_changes.js new file mode 100644 index 00000000000..80655c06667 --- /dev/null +++ b/jstests/replsets/sync_source_changes.js @@ -0,0 +1,73 @@ +/** + * Tests that when the current sync source no longer meets the strict criteria for being a sync + * source, and there is another node which does meet those criteria, we will change sync source + * (eventually). + */ + +(function() { +"use strict"; + +load("jstests/replsets/rslib.js"); // reconfig +load("jstests/replsets/libs/sync_source.js"); // assertSyncSourceMatchesSoon + +// Start RST with only one voting node, node 0 -- this will be the only valid voting node and sync +// source +const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]}); +rst.startSet(); +rst.initiate(); + +// Make sure that node 0 is primary as expected +const primary = rst.getPrimary(); +assert.eq(primary, rst.nodes[0]); + +// Add a new voting node, node 2 -- voting nodes will choose voting nodes as sync sources. +const newNode = rst.add({}); +rst.reInitiate(); +rst.waitForState(newNode, ReplSetTest.State.SECONDARY); +rst.awaitReplication(); +rst.awaitSecondaryNodes(); + +// Assure that node 2 will set node 0 as its sync source, since it is the best option. +assertSyncSourceMatchesSoon(newNode, rst.nodes[0].host); + +// Make node 1 a voter so that it will be a valid option for sync source +let cfg = rst.getReplSetConfigFromNode(); +cfg.members[1].priority = 1; +cfg.members[1].votes = 1; +reconfig(rst, cfg); + +// Force a stepup of node 1 -- we need to step node 0 down so that we can set it as a non-voter +// without causing errors. +rst.stepUp(rst.nodes[1]); + +// Make node 0 a nonvoter so that it will be an invalid option for sync source +cfg = rst.getReplSetConfigFromNode(); +cfg.members[0].priority = 0; +cfg.members[0].votes = 0; +reconfig(rst, cfg); + +// Run this repeatedly, as sometimes the stop, insert, restart won't cause the sync source to be +// switched correctly due to transient issues with the sync source we want to switch to. +assert.soon(() => { + // Insert a document while newNode is not replicating to force it to run shouldChangeSyncSource + stopServerReplication(newNode); + assert.commandWorked( + rst.getPrimary().getDB("testSyncSourceChangesDb").getCollection("coll").insert({a: 1}, { + writeConcern: {w: 1} + })); + restartServerReplication(newNode); + try { + assertSyncSourceMatchesSoon(newNode, + cfg.members[1].host, + undefined /* msg */, + 5 * 1000 /* timeout */, + undefined /* interval */, + {runHangAnalyzer: false}); + return true; + } catch (e) { + return false; + } +}); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index ddc79002521..8c9430b9ed8 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -329,7 +329,8 @@ HostAndPort TopologyCoordinator::_chooseNearbySyncSource(Date_t now, now, lastOpTimeFetched, readPreference, - attempts == 0 /* firstAttempt */)) { + attempts == 0 /* firstAttempt */, + true /* shouldCheckStaleness */)) { // Node is not a viable sync source candidate. continue; } @@ -406,7 +407,8 @@ bool TopologyCoordinator::_isEligibleSyncSource(int candidateIndex, Date_t now, const OpTime& lastOpTimeFetched, ReadPreference readPreference, - const bool firstAttempt) const { + const bool firstAttempt, + const bool shouldCheckStaleness) const { // Don't consider ourselves. if (candidateIndex == _selfIndex) { return false; @@ -464,16 +466,19 @@ bool TopologyCoordinator::_isEligibleSyncSource(int candidateIndex, "syncSourceCandidate"_attr = syncSourceCandidate); return false; } - // Candidates cannot be excessively behind. - const auto oldestSyncOpTime = _getOldestSyncOpTime(); - if (memberData.getHeartbeatAppliedOpTime() < oldestSyncOpTime) { - LOGV2_DEBUG(3873110, - 2, - "Cannot select sync source because it is too far behind", - "syncSourceCandidate"_attr = syncSourceCandidate, - "syncSourceCandidateOpTime"_attr = memberData.getHeartbeatAppliedOpTime(), - "oldestAcceptableOpTime"_attr = oldestSyncOpTime); - return false; + // Candidates cannot be excessively behind, if we are checking for staleness. + if (shouldCheckStaleness) { + const auto oldestSyncOpTime = _getOldestSyncOpTime(); + if (memberData.getHeartbeatAppliedOpTime() < oldestSyncOpTime) { + LOGV2_DEBUG(3873110, + 2, + "Cannot select sync source because it is too far behind", + "syncSourceCandidate"_attr = syncSourceCandidate, + "syncSourceCandidateOpTime"_attr = + memberData.getHeartbeatAppliedOpTime(), + "oldestAcceptableOpTime"_attr = oldestSyncOpTime); + return false; + } } // Candidate must not have a configured delay larger than ours. if (_selfConfig().getSecondaryDelay() < memberConfig.getSecondaryDelay()) { @@ -497,8 +502,8 @@ bool TopologyCoordinator::_isEligibleSyncSource(int candidateIndex, return false; } } - // Only select a candidate that is ahead of me. - if (memberData.getHeartbeatAppliedOpTime() <= lastOpTimeFetched) { + // Only select a candidate that is ahead of me, if we are checking for staleness. + if (shouldCheckStaleness && memberData.getHeartbeatAppliedOpTime() <= lastOpTimeFetched) { LOGV2_DEBUG(3873113, 1, "Cannot select sync source which is not ahead of me", @@ -3110,40 +3115,74 @@ bool TopologyCoordinator::shouldChangeSyncSource(const HostAndPort& currentSourc if (MONGO_unlikely(disableMaxSyncSourceLagSecs.shouldFail())) { LOGV2( 21833, - "disableMaxSyncSourceLagSecs fail point enabled - not checking the most recent " - "OpTime, {currentSyncSourceOpTime}, of our current sync source, {syncSource}, against " - "the OpTimes of the other nodes in this replica set.", "disableMaxSyncSourceLagSecs fail point enabled - not checking the most recent OpTime " "of our current sync source against the OpTimes of the other nodes in this replica set", "currentSyncSourceOpTime"_attr = currentSourceOpTime.toString(), "syncSource"_attr = currentSource); } else { - unsigned int currentSecs = currentSourceOpTime.getSecs(); - unsigned int goalSecs = currentSecs + durationCount(_options.maxSyncSourceLagSecs); - - for (std::vector::const_iterator it = _memberData.begin(); - it != _memberData.end(); - ++it) { - const int itIndex = indexOfIterator(_memberData, it); - const MemberConfig& candidateConfig = _rsConfig.getMemberAt(itIndex); - if (it->up() && (candidateConfig.isVoter() || !_selfConfig().isVoter()) && - (candidateConfig.shouldBuildIndexes() || !_selfConfig().shouldBuildIndexes()) && - it->getState().readable() && !_memberIsDenylisted(candidateConfig, now) && - goalSecs < it->getHeartbeatAppliedOpTime().getSecs()) { + unsigned int currentSourceOpTimeSecs = currentSourceOpTime.getSecs(); + unsigned int currentSourceLagThresholdSecs = + currentSourceOpTimeSecs + durationCount(_options.maxSyncSourceLagSecs); + + for (size_t i = 0; i < _memberData.size(); i++) { + const auto& member = _memberData[i]; + if (currentSourceLagThresholdSecs < member.getHeartbeatAppliedOpTime().getSecs() && + _isEligibleSyncSource(i, + now, + lastOpTimeFetched, + ReadPreference::Nearest, + true /* firstAttempt */, + true /* shouldCheckStaleness */)) { + invariant(i != (size_t)_selfIndex, + str::stream() + << "Node " << i << " was eligible as a sync source for itself"); LOGV2(21834, - "Choosing new sync source because the most recent OpTime of our sync " - "source, {syncSource}, is {syncSourceOpTime} which is more than " - "{maxSyncSourceLagSecs} behind member {otherMember} " - "whose most recent OpTime is {otherMemberHearbeatAppliedOpTime}", "Choosing new sync source because the most recent OpTime of our sync source " "is more than maxSyncSourceLagSecs behind another member", "syncSource"_attr = currentSource, "syncSourceOpTime"_attr = currentSourceOpTime.toString(), "maxSyncSourceLagSecs"_attr = _options.maxSyncSourceLagSecs, - "otherMember"_attr = candidateConfig.getHostAndPort().toString(), + "otherMember"_attr = member.getHostAndPort().toString(), "otherMemberHearbeatAppliedOpTime"_attr = - it->getHeartbeatAppliedOpTime().toString()); - invariant(itIndex != _selfIndex); + member.getHeartbeatAppliedOpTime().toString()); + return true; + } + } + } + + // Change sync source if our current sync source is not a preferred sync source node choice due + // to non-staleness issues, such as being a non-voter when we are a voter, or being hidden, or + // any of the other conditions checked in _isEligibleSyncSource with firstAttempt=true, and + // another eligible node exists which does meet these criteria. Note that while we bypass + // staleness checks for our current node, we should not do this for a potential new node, + // because we could end up with a situation where shouldChangeSyncSource returns true, causing + // the sync source to be cleared, but then being reset to our previous sync source repeatedly + // because the new source is not actually valid. Note that _isEligibleSyncSource only checks for + // ReadPreference::Secondary*, so any choice besides those for the read preference is fine. + if (!_isEligibleSyncSource(currentSourceIndex, + now, + lastOpTimeFetched, + ReadPreference::Nearest, + true /* firstAttempt */, + false /* shouldCheckStaleness */)) { + + for (size_t i = 0; i < _memberData.size(); i++) { + if (_isEligibleSyncSource(i, + now, + lastOpTimeFetched, + ReadPreference::Nearest, + true /* firstAttempt */, + true /* shouldCheckStaleness */)) { + invariant(i != (size_t)_selfIndex, + str::stream() + << "Node " << i << " was eligible as a sync source for itself"); + LOGV2(5929000, + "Choosing new sync source because our current sync source does not satisfy " + "our strict criteria for candidates, but there is another member which does " + "satisfy these criteria", + "currentSyncSource"_attr = currentSource, + "eligibleCandidateSyncSource"_attr = + _rsConfig.getMemberAt(i).getHostAndPort().toString()); return true; } } @@ -3235,7 +3274,8 @@ bool TopologyCoordinator::shouldChangeSyncSourceDueToPingTime(const HostAndPort& now, previousOpTimeFetched, readPreference, - true /* firstAttempt */)) { + true /* firstAttempt */, + true /* shouldCheckStaleness */)) { LOGV2(4744901, "Choosing new sync source because we have found another potential sync " "source that is significantly closer than our current sync source", diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index 2da14343ef9..3e8466ffd21 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -937,7 +937,8 @@ private: Date_t now, const OpTime& lastOpTimeFetched, ReadPreference readPreference, - bool firstAttempt) const; + bool firstAttempt, + bool shouldCheckStaleness) const; // Returns the current "ping" value for the given member by their address. Milliseconds _getPing(const HostAndPort& host); -- cgit v1.2.1