From d1871f0fb5f61159aecc76865a4b8a3f5d0159fd Mon Sep 17 00:00:00 2001 From: "A. Jesse Jiryu Davis" Date: Mon, 21 Sep 2020 08:48:50 -0400 Subject: SERVER-35649 Retry DNS failures during reconfig (cherry picked from commit 418c61279986a5eaddc66a16b5e288556ad1f6d3) --- etc/backports_required_for_multiversion_tests.yml | 2 + .../replsets/disallow_adding_initialized_node1.js | 6 ++- jstests/replsets/isself_failure_initiate.js | 28 +++++++++++++ jstests/replsets/isself_failure_reconfig.js | 46 ++++++++++++++++++++++ jstests/replsets/isself_failure_restart.js | 45 +++++++++++++++++++++ src/mongo/db/repl/heartbeat_response_action.cpp | 6 +++ src/mongo/db/repl/heartbeat_response_action.h | 15 ++++++- src/mongo/db/repl/isself.cpp | 8 ++++ .../replication_coordinator_impl_heartbeat.cpp | 7 +++- ...lication_coordinator_impl_heartbeat_v1_test.cpp | 6 +-- src/mongo/db/repl/topology_coordinator.cpp | 10 +++-- src/mongo/db/repl/topology_coordinator_v1_test.cpp | 15 +++---- 12 files changed, 174 insertions(+), 20 deletions(-) create mode 100644 jstests/replsets/isself_failure_initiate.js create mode 100644 jstests/replsets/isself_failure_reconfig.js create mode 100644 jstests/replsets/isself_failure_restart.js diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 90709ff7ea9..be5c138cf31 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -113,6 +113,8 @@ suites: test_file: jstests/replsets/rollback_via_refetch_anomaly.js - ticket: SERVER-47645 test_file: jstests/replsets/invalidate_sessions_on_stepdown.js + - ticket: SERVER-35649 + test_file: jstests/replsets/disallow_adding_initialized_node1.js sharding_multiversion: diff --git a/jstests/replsets/disallow_adding_initialized_node1.js b/jstests/replsets/disallow_adding_initialized_node1.js index d9b953d874a..c414b0ccc9b 100644 --- a/jstests/replsets/disallow_adding_initialized_node1.js +++ b/jstests/replsets/disallow_adding_initialized_node1.js @@ -12,7 +12,8 @@ var replSetA = new ReplSetTest({ name: name, nodes: [ {rsConfig: {_id: 10}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetA.startSet({dbpath: "$set-A-$node"}); replSetA.initiate(); @@ -21,7 +22,8 @@ var replSetB = new ReplSetTest({ name: name, nodes: [ {rsConfig: {_id: 20}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetB.startSet({dbpath: "$set-B-$node"}); replSetB.initiate(); diff --git a/jstests/replsets/isself_failure_initiate.js b/jstests/replsets/isself_failure_initiate.js new file mode 100644 index 00000000000..dbeb0a499c9 --- /dev/null +++ b/jstests/replsets/isself_failure_initiate.js @@ -0,0 +1,28 @@ +/** + * Tests that replSetInitiate eventually succeeds despite temporary DNS outage. + * + * @tags: [ + * requires_fcv_47, + * ] + */ + +(function() { +'use strict'; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]}); +const nodes = rst.startSet(); +// Node 1 will fail to find itself in the config. +const failPoint = configureFailPoint(nodes[1], "failIsSelfCheck"); +const config = rst.getReplSetConfig(); +assert.commandWorked(nodes[0].adminCommand({replSetInitiate: config})); +failPoint.wait(); +assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}), + ErrorCodes.NotYetInitialized); +failPoint.off(); +// Node 1 re-checks isSelf on next heartbeat and succeeds. +waitForState(nodes[1], ReplSetTest.State.SECONDARY); +rst.stopSet(); +})(); diff --git a/jstests/replsets/isself_failure_reconfig.js b/jstests/replsets/isself_failure_reconfig.js new file mode 100644 index 00000000000..61051309b32 --- /dev/null +++ b/jstests/replsets/isself_failure_reconfig.js @@ -0,0 +1,46 @@ +/** + * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during + * reconfig. + * + * @tags: [ + * requires_fcv_47, + * ] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +const rst = new ReplSetTest( + {nodes: 1, nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}}); +const nodes = rst.startSet(); +rst.initiate(); + +jsTestLog("Add second node to set, with isSelf() disabled by failpoint"); + +const newNode = rst.add({ + rsConfig: {priority: 0, votes: 0}, + setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"}) +}); + +rst.reInitiate(); + +jsTestLog("Await failpoint on second node"); + +assert.commandWorked(newNode.adminCommand({ + waitForFailPoint: "failIsSelfCheck", + timesEntered: 1, + maxTimeMS: kDefaultWaitForFailPointTimeout +})); + +assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}), + ErrorCodes.NotYetInitialized); + +assert.commandWorked(newNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"})); + +jsTestLog("New node re-checks isSelf and becomes secondary"); +waitForState(newNode, ReplSetTest.State.SECONDARY); +rst.stopSet(); +})(); diff --git a/jstests/replsets/isself_failure_restart.js b/jstests/replsets/isself_failure_restart.js new file mode 100644 index 00000000000..6b0f880d2a1 --- /dev/null +++ b/jstests/replsets/isself_failure_restart.js @@ -0,0 +1,45 @@ +/** + * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during + * startup. + * + * @tags: [ + * requires_fcv_47, + * requires_persistence + * ] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +const rst = new ReplSetTest({ + nodes: [{}, {rsConfig: {priority: 0, votes: 0}}], + nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}} +}); +rst.startSet(); +rst.initiate(); + +const restartNode = + rst.restart(1, {setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})}); + +checkLog.contains( + restartNode, + "Locally stored replica set configuration does not have a valid entry for the current node; waiting for reconfig or remote heartbeat"); +waitForState(restartNode, ReplSetTest.State.REMOVED); + +// Clear the log. +const primary = rst.getPrimary(); +assert.commandWorked(primary.adminCommand({clearLog: 'global'})); + +// Secondary should send heartbeat responses with 'InvalidReplicaSetConfig'. +checkLog.contains(primary, /Received response to heartbeat(.*)InvalidReplicaSetConfig/); + +assert.commandWorked( + restartNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"})); + +// Node 1 re-checks isSelf on next heartbeat and succeeds. +waitForState(restartNode, ReplSetTest.State.SECONDARY); +rst.stopSet(); +})(); diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp index 44feca0a32e..c21dbd0cf53 100644 --- a/src/mongo/db/repl/heartbeat_response_action.cpp +++ b/src/mongo/db/repl/heartbeat_response_action.cpp @@ -56,6 +56,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeCatchupTakeoverAction() { return result; } +HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() { + HeartbeatResponseAction result; + result._action = RetryReconfig; + return result; +} + HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) { HeartbeatResponseAction result; result._action = StepDownSelf; diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h index aa52c21eee9..9f0b3e4da8c 100644 --- a/src/mongo/db/repl/heartbeat_response_action.h +++ b/src/mongo/db/repl/heartbeat_response_action.h @@ -45,7 +45,14 @@ public: /** * Actions taken based on heartbeat responses */ - enum Action { NoAction, Reconfig, StepDownSelf, PriorityTakeover, CatchupTakeover }; + enum Action { + NoAction, + Reconfig, + StepDownSelf, + PriorityTakeover, + CatchupTakeover, + RetryReconfig + }; /** * Makes a new action representing doing nothing. @@ -69,6 +76,12 @@ public: */ static HeartbeatResponseAction makeCatchupTakeoverAction(); + /** + * Makes a new action telling the current node to attempt to find itself in its current replica + * set config again, in case the previous attempt's failure was due to a temporary DNS outage. + */ + static HeartbeatResponseAction makeRetryReconfigAction(); + /** * Makes a new action telling the current node to step down as primary. * diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp index 3d62a2b60b4..427892bb6c5 100644 --- a/src/mongo/db/repl/isself.cpp +++ b/src/mongo/db/repl/isself.cpp @@ -77,6 +77,8 @@ namespace mongo { namespace repl { +MONGO_FAIL_POINT_DEFINE(failIsSelfCheck); + OID instanceId; MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) { @@ -156,6 +158,12 @@ std::vector getAddrsForHost(const std::string& iporhost, } // namespace bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) { + if (MONGO_unlikely(failIsSelfCheck.shouldFail())) { + log() << "failIsSelfCheck failpoint activated, returning false from isSelf; hostAndPort: " + << hostAndPort; + return false; + } + // Fastpath: check if the host&port in question is bound to one // of the interfaces on this machine. // No need for ip match if the ports do not match diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index f10b3b4b52c..c993d696a0d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -283,6 +283,9 @@ stdx::unique_lock ReplicationCoordinatorImpl::_handleHeartbeatResponseAct invariant(responseStatus.isOK()); _scheduleHeartbeatReconfig_inlock(responseStatus.getValue().getConfig()); break; + case HeartbeatResponseAction::RetryReconfig: + _scheduleHeartbeatReconfig_inlock(_rsConfig); + break; case HeartbeatResponseAction::StepDownSelf: invariant(action.getPrimaryConfigIndex() == _selfIndex); if (_topCoord->prepareForUnconditionalStepDown()) { @@ -482,7 +485,7 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet } _setConfigState_inlock(kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0); if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) { LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to version " << newConfig.getConfigVersion() @@ -674,7 +677,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( invariant(_rsConfigState == kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0); if (!myIndex.isOK()) { switch (myIndex.getStatus().code()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp index 3099894bcfb..0ef88f6001c 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp @@ -426,12 +426,8 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM ASSERT_TRUE(members[1].isABSONObj()); auto member = members[1].Obj(); ASSERT_EQ(host2, HostAndPort(member["name"].String())); - ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(), + ASSERT_EQ(MemberState(MemberState::RS_UNKNOWN).toString(), MemberState(member["state"].numberInt()).toString()); - ASSERT_EQ(member["lastHeartbeatMessage"].String(), - std::string(str::stream() - << "replica set IDs do not match, ours: " << rsConfig.getReplicaSetId() - << "; remote node's: " << unexpectedId)); } TEST_F(ReplCoordHBV1Test, diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index f8ea5428661..1df593d8330 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -755,7 +755,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( invariant(hbStats.getLastHeartbeatStartDate() != Date_t()); const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) || (hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed); - if (hbResponse.isOK() || isUnauthorized) { + const bool isInvalid = hbResponse.getStatus().code() == ErrorCodes::InvalidReplicaSetConfig; + + // Replication of auth changes can cause temporary auth failures, and a temporary DNS outage can + // make a node return InvalidReplicaSetConfig if it can't find itself in the config. + if (hbResponse.isOK() || isUnauthorized || isInvalid) { hbStats.hit(networkRoundTripTime); } else { hbStats.miss(); @@ -826,11 +830,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } - // If we're not in the config, we don't need to respond to heartbeats. + // This server is not in the config, either because it was removed or a DNS error finding self. if (_selfIndex == -1) { LOG(1) << "Could not find ourself in current config so ignoring heartbeat from " << target << " -- current config: " << _rsConfig.toBSON(); - HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction(); + HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeRetryReconfigAction(); nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index db2a4fbc433..e2ce2fd9314 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -1596,7 +1596,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) { ASSERT_NO_ACTION(action.getAction()); } -TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { +TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) { updateConfig(BSON("_id" << "rs0" << "version" << 1 << "members" @@ -1607,12 +1607,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { << BSON("_id" << 30 << "host" << "h3"))), -1); - ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"), - "rs0", - MemberState::RS_SECONDARY, - OpTime(Timestamp(1, 0), 0), - Milliseconds(300)) - .getAction()); + ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig, + heartbeatFromMember(HostAndPort("h2"), + "rs0", + MemberState::RS_SECONDARY, + OpTime(Timestamp(1, 0), 0), + Milliseconds(300)) + .getAction()); } TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) { -- cgit v1.2.1