diff options
author | A. Jesse Jiryu Davis <jesse@mongodb.com> | 2020-09-21 08:48:50 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-09-21 13:17:03 +0000 |
commit | 418c61279986a5eaddc66a16b5e288556ad1f6d3 (patch) | |
tree | b56c929fe926d0888c1b1b4d44e12a0a3e18e9d1 | |
parent | 2cdeee7b845751e7e50b2a5a78641ba4b4680aa2 (diff) | |
download | mongo-418c61279986a5eaddc66a16b5e288556ad1f6d3.tar.gz |
SERVER-35649 Retry DNS failures during reconfig
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 2 | ||||
-rw-r--r-- | jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js | 2 | ||||
-rw-r--r-- | jstests/replsets/disallow_adding_initialized_node1.js | 25 | ||||
-rw-r--r-- | jstests/replsets/isself_failure_initiate.js | 28 | ||||
-rw-r--r-- | jstests/replsets/isself_failure_reconfig.js | 46 | ||||
-rw-r--r-- | jstests/replsets/isself_failure_restart.js | 44 | ||||
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.h | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/isself.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 21 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp | 15 |
13 files changed, 194 insertions, 34 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 25c16def801..22f5a7053e3 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -90,6 +90,8 @@ suites: concurrency_sharded_replication_multiversion: replica_sets_multiversion: + - ticket: SERVER-35649 + test_file: jstests/replsets/disallow_adding_initialized_node1.js replica_sets_jscore_multiversion_passthrough: diff --git a/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js b/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js index 2c078261513..ae75d4a948e 100644 --- a/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js +++ b/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js @@ -175,4 +175,4 @@ assert.eq(0, numAwaitingTopologyChangeOnPrimary); assert.eq(0, numAwaitingTopologyChangeOnSecondary); replTest.stopSet(); -})();
\ No newline at end of file +})(); diff --git a/jstests/replsets/disallow_adding_initialized_node1.js b/jstests/replsets/disallow_adding_initialized_node1.js index bc6d3ed764a..2a9775e4e38 100644 --- a/jstests/replsets/disallow_adding_initialized_node1.js +++ b/jstests/replsets/disallow_adding_initialized_node1.js @@ -12,7 +12,8 @@ var replSetA = new ReplSetTest({ name: name, nodes: [ {rsConfig: {_id: 10}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetA.startSet({dbpath: "$set-A-$node"}); replSetA.initiate(); @@ -21,7 +22,8 @@ var replSetB = new ReplSetTest({ name: name, nodes: [ {rsConfig: {_id: 20}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetB.startSet({dbpath: "$set-B-$node"}); replSetB.initiate(); @@ -54,14 +56,19 @@ configA.version++; configA.members.push({_id: 11, host: primaryB.host, votes: 0, priority: 0}); assert.commandWorked(primaryA.adminCommand({replSetReconfig: configA})); -// Wait for primary A to report primary B down. B should reject all heartbeats from A due to a -// replset name mismatch, leading A to consider it down. +// Wait for primary A to report primary B UNKNOWN. assert.soon(function() { const statusA = assert.commandWorked(primaryA.adminCommand({replSetGetStatus: 1})); if (statusA.members.length !== 2) { return false; } - return statusA.members[1].state === ReplSetTest.State.DOWN; + return statusA.members[1].state === ReplSetTest.State.UNKNOWN; +}); + +checkLog.containsJson(primaryA, 4615621, { + error: (error) => { + return error.code === 93 && error.errmsg.indexOf("replica set IDs do not match") !== -1; + } }); // Confirm that each set still has the correct primary. @@ -72,7 +79,7 @@ jsTestLog('After merging with 0 votes: primary A = ' + newPrimaryA.host + assert.eq(primaryA, newPrimaryA); assert.eq(primaryB, newPrimaryB); -// Replica set A's config should include primary B and consider it DOWN. +// Replica set A's config should include primary B and consider it UNKNOWN. let statusA = assert.commandWorked(primaryA.adminCommand({replSetGetStatus: 1})); jsTestLog('After merging with 0 votes: replica set status A = ' + tojson(statusA)); assert.eq(2, statusA.members.length); @@ -81,7 +88,7 @@ assert.eq(primaryA.host, statusA.members[0].name); assert.eq(ReplSetTest.State.PRIMARY, statusA.members[0].state); assert.eq(11, statusA.members[1]._id); assert.eq(primaryB.host, statusA.members[1].name); -assert.eq(ReplSetTest.State.DOWN, statusA.members[1].state); +assert.eq(ReplSetTest.State.UNKNOWN, statusA.members[1].state); // Replica set B's config should remain unchanged. let statusB = assert.commandWorked(primaryB.adminCommand({replSetGetStatus: 1})); @@ -115,7 +122,7 @@ var msgB = "replica set IDs do not match, ours: " + configB.settings.replicaSetI "; remote node's: " + configA.settings.replicaSetId; checkLog.contains(primaryB, msgB); -// Confirm primary B is still DOWN. +// Confirm primary B is still UNKNOWN. statusA = assert.commandWorked(primaryA.adminCommand({replSetGetStatus: 1})); jsTestLog('After merging: replica set status A = ' + tojson(statusA)); assert.eq(2, statusA.members.length); @@ -124,7 +131,7 @@ assert.eq(primaryA.host, statusA.members[0].name); assert.eq(ReplSetTest.State.PRIMARY, statusA.members[0].state); assert.eq(11, statusA.members[1]._id); assert.eq(primaryB.host, statusA.members[1].name); -assert.eq(ReplSetTest.State.DOWN, statusA.members[1].state); +assert.eq(ReplSetTest.State.UNKNOWN, statusA.members[1].state); // Replica set B's config should remain unchanged. statusB = assert.commandWorked(primaryB.adminCommand({replSetGetStatus: 1})); diff --git a/jstests/replsets/isself_failure_initiate.js b/jstests/replsets/isself_failure_initiate.js new file mode 100644 index 00000000000..dbeb0a499c9 --- /dev/null +++ b/jstests/replsets/isself_failure_initiate.js @@ -0,0 +1,28 @@ +/** + * Tests that replSetInitiate eventually succeeds despite temporary DNS outage. + * + * @tags: [ + * requires_fcv_47, + * ] + */ + +(function() { +'use strict'; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]}); +const nodes = rst.startSet(); +// Node 1 will fail to find itself in the config. +const failPoint = configureFailPoint(nodes[1], "failIsSelfCheck"); +const config = rst.getReplSetConfig(); +assert.commandWorked(nodes[0].adminCommand({replSetInitiate: config})); +failPoint.wait(); +assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}), + ErrorCodes.NotYetInitialized); +failPoint.off(); +// Node 1 re-checks isSelf on next heartbeat and succeeds. +waitForState(nodes[1], ReplSetTest.State.SECONDARY); +rst.stopSet(); +})(); diff --git a/jstests/replsets/isself_failure_reconfig.js b/jstests/replsets/isself_failure_reconfig.js new file mode 100644 index 00000000000..61051309b32 --- /dev/null +++ b/jstests/replsets/isself_failure_reconfig.js @@ -0,0 +1,46 @@ +/** + * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during + * reconfig. + * + * @tags: [ + * requires_fcv_47, + * ] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +const rst = new ReplSetTest( + {nodes: 1, nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}}); +const nodes = rst.startSet(); +rst.initiate(); + +jsTestLog("Add second node to set, with isSelf() disabled by failpoint"); + +const newNode = rst.add({ + rsConfig: {priority: 0, votes: 0}, + setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"}) +}); + +rst.reInitiate(); + +jsTestLog("Await failpoint on second node"); + +assert.commandWorked(newNode.adminCommand({ + waitForFailPoint: "failIsSelfCheck", + timesEntered: 1, + maxTimeMS: kDefaultWaitForFailPointTimeout +})); + +assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}), + ErrorCodes.NotYetInitialized); + +assert.commandWorked(newNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"})); + +jsTestLog("New node re-checks isSelf and becomes secondary"); +waitForState(newNode, ReplSetTest.State.SECONDARY); +rst.stopSet(); +})(); diff --git a/jstests/replsets/isself_failure_restart.js b/jstests/replsets/isself_failure_restart.js new file mode 100644 index 00000000000..d9f35d02f8a --- /dev/null +++ b/jstests/replsets/isself_failure_restart.js @@ -0,0 +1,44 @@ +/** + * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during + * startup. + * + * @tags: [ + * requires_fcv_47, + * requires_persistence + * ] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +const rst = new ReplSetTest({ + nodes: [{}, {rsConfig: {priority: 0, votes: 0}}], + nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}} +}); +rst.startSet(); +rst.initiate(); + +const restartNode = + rst.restart(1, {setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})}); + +// "Locally stored replica set configuration does not have a valid entry for the current node". +checkLog.containsJson(restartNode, 21405); +waitForState(restartNode, ReplSetTest.State.REMOVED); + +// "Received response to heartbeat". +checkLog.containsJson(rst.getPrimary(), 4615620, { + response: (response) => { + return response.codeName === "InvalidReplicaSetConfig"; + } +}); + +assert.commandWorked( + restartNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"})); + +// Node 1 re-checks isSelf on next heartbeat and succeeds. +waitForState(restartNode, ReplSetTest.State.SECONDARY); +rst.stopSet(); +})(); diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp index 2f2a9072001..8b84c23bac9 100644 --- a/src/mongo/db/repl/heartbeat_response_action.cpp +++ b/src/mongo/db/repl/heartbeat_response_action.cpp @@ -56,6 +56,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeCatchupTakeoverAction() { return result; } +HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() { + HeartbeatResponseAction result; + result._action = RetryReconfig; + return result; +} + HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) { HeartbeatResponseAction result; result._action = StepDownSelf; diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h index b193239a943..a030761ecaf 100644 --- a/src/mongo/db/repl/heartbeat_response_action.h +++ b/src/mongo/db/repl/heartbeat_response_action.h @@ -45,7 +45,14 @@ public: /** * Actions taken based on heartbeat responses */ - enum Action { NoAction, Reconfig, StepDownSelf, PriorityTakeover, CatchupTakeover }; + enum Action { + NoAction, + Reconfig, + StepDownSelf, + PriorityTakeover, + CatchupTakeover, + RetryReconfig + }; /** * Makes a new action representing doing nothing. @@ -70,6 +77,12 @@ public: static HeartbeatResponseAction makeCatchupTakeoverAction(); /** + * Makes a new action telling the current node to attempt to find itself in its current replica + * set config again, in case the previous attempt's failure was due to a temporary DNS outage. + */ + static HeartbeatResponseAction makeRetryReconfigAction(); + + /** * Makes a new action telling the current node to step down as primary. * * It is an error to call this with primaryIndex != the index of the current node. diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp index 4e6a544dd5b..1c561ff9f5d 100644 --- a/src/mongo/db/repl/isself.cpp +++ b/src/mongo/db/repl/isself.cpp @@ -78,6 +78,8 @@ namespace mongo { namespace repl { +MONGO_FAIL_POINT_DEFINE(failIsSelfCheck); + OID instanceId; MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) { @@ -163,6 +165,13 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost, } // namespace bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) { + if (MONGO_unlikely(failIsSelfCheck.shouldFail())) { + LOGV2(356490, + "failIsSelfCheck failpoint activated, returning false from isSelf", + "hostAndPort"_attr = hostAndPort); + return false; + } + // Fastpath: check if the host&port in question is bound to one // of the interfaces on this machine. // No need for ip match if the ports do not match diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 9ac897a22a8..1f0684dc66f 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -371,6 +371,9 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct invariant(responseStatus.isOK()); _scheduleHeartbeatReconfig(lock, responseStatus.getValue().getConfig()); break; + case HeartbeatResponseAction::RetryReconfig: + _scheduleHeartbeatReconfig(lock, _rsConfig); + break; case HeartbeatResponseAction::StepDownSelf: invariant(action.getPrimaryConfigIndex() == _selfIndex); if (_topCoord->prepareForUnconditionalStepDown()) { @@ -602,7 +605,8 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig(WithLock lk, _setConfigState_inlock(kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm()); + _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm() || + _selfIndex < 0); _replExecutor ->scheduleWork([=](const executor::TaskExecutor::CallbackArgs& cbData) { _heartbeatReconfigStore(cbData, newConfig); @@ -809,7 +813,8 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( invariant(_rsConfigState == kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm()); + _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm() || + _selfIndex < 0); if (!myIndex.isOK()) { switch (myIndex.getStatus().code()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp index 736e76d2368..ebf5162a8c6 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp @@ -1035,12 +1035,8 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM ASSERT_TRUE(members[1].isABSONObj()); auto member = members[1].Obj(); ASSERT_EQ(host2, HostAndPort(member["name"].String())); - ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(), + ASSERT_EQ(MemberState(MemberState::RS_UNKNOWN).toString(), MemberState(member["state"].numberInt()).toString()); - ASSERT_EQ(member["lastHeartbeatMessage"].String(), - std::string(str::stream() - << "replica set IDs do not match, ours: " << rsConfig.getReplicaSetId() - << "; remote node's: " << unexpectedId)); } TEST_F(ReplCoordHBV1Test, diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index c74d23b5ff9..c755830380a 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -936,7 +936,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( invariant(hbStats.getLastHeartbeatStartDate() != Date_t()); const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) || (hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed); - if (hbResponse.isOK() || isUnauthorized) { + const bool isInvalid = hbResponse.getStatus().code() == ErrorCodes::InvalidReplicaSetConfig; + + // Replication of auth changes can cause temporary auth failures, and a temporary DNS outage can + // make a node return InvalidReplicaSetConfig if it can't find itself in the config. + if (hbResponse.isOK() || isUnauthorized || isInvalid) { hbStats.hit(networkRoundTripTime); } else { hbStats.miss(); @@ -1030,15 +1034,14 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( if (!_rsConfig.isInitialized()) { return nextAction; } - // If we're not in the config, we don't need to respond to heartbeats. + // This server is not in the config, either because it was removed or a DNS error finding self. if (_selfIndex == -1) { - LOGV2_DEBUG(21805, - 1, - "Could not find ourself in current config so ignoring heartbeat from {target} " - "-- current config: {currentConfig}", - "Could not find ourself in current config so ignoring heartbeat", - "target"_attr = target, - "currentConfig"_attr = _rsConfig.toBSON()); + LOGV2(3564900, + "Could not find self in current config, retrying DNS resolution of members", + "target"_attr = target, + "currentConfig"_attr = _rsConfig.toBSON()); + nextAction = HeartbeatResponseAction::makeRetryReconfigAction(); + nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } const int memberIndex = _rsConfig.findMemberIndexByHostAndPort(target); diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index d0625259077..1f9b5d84591 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -1311,7 +1311,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) { ASSERT_NO_ACTION(action.getAction()); } -TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { +TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) { updateConfig(BSON("_id" << "rs0" << "version" << 1 << "members" @@ -1322,12 +1322,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { << BSON("_id" << 30 << "host" << "h3"))), -1); - ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"), - "rs0", - MemberState::RS_SECONDARY, - OpTime(Timestamp(1, 0), 0), - Milliseconds(300)) - .getAction()); + ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig, + heartbeatFromMember(HostAndPort("h2"), + "rs0", + MemberState::RS_SECONDARY, + OpTime(Timestamp(1, 0), 0), + Milliseconds(300)) + .getAction()); } TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) { |