diff options
author | Xuerui Fa <xuerui.fa@mongodb.com> | 2021-03-10 11:41:52 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-03-11 15:36:06 +0000 |
commit | 6156b718630344212bea269a28ed5bacf96a9b86 (patch) | |
tree | 2608210f07a60f9172217ffd7e010c12992f90e5 | |
parent | 59dcc00af2ed0091088081449fa47fac73943cac (diff) | |
download | mongo-6156b718630344212bea269a28ed5bacf96a9b86.tar.gz |
SERVER-35649 Retry DNS failures during reconfig
(cherry picked from commit 418c61279986a5eaddc66a16b5e288556ad1f6d3)
SERVER-51163 Mark nodes returning InvalidReplicaSetConfig in heartbeats as down
(cherry picked from commit 28c6948a2a02760a69aaee3875c4b2a427528a5a)
-rw-r--r-- | jstests/replsets/disallow_adding_initialized_node1.js | 6 | ||||
-rw-r--r-- | jstests/replsets/disallow_adding_initialized_node2.js | 3 | ||||
-rw-r--r-- | jstests/replsets/isself_failure_initiate.js | 29 | ||||
-rw-r--r-- | jstests/replsets/isself_failure_reconfig.js | 42 | ||||
-rw-r--r-- | jstests/replsets/isself_failure_restart.js | 44 | ||||
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.h | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/isself.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 7 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp | 15 |
12 files changed, 161 insertions, 22 deletions
diff --git a/jstests/replsets/disallow_adding_initialized_node1.js b/jstests/replsets/disallow_adding_initialized_node1.js index 910e71c7d8c..4a9e80e6d09 100644 --- a/jstests/replsets/disallow_adding_initialized_node1.js +++ b/jstests/replsets/disallow_adding_initialized_node1.js @@ -13,7 +13,8 @@ name: name, nodes: [ {rsConfig: {_id: 10}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetA.startSet({dbpath: "$set-A-$node"}); replSetA.initiate(); @@ -22,7 +23,8 @@ name: name, nodes: [ {rsConfig: {_id: 20}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetB.startSet({dbpath: "$set-B-$node"}); replSetB.initiate(); diff --git a/jstests/replsets/disallow_adding_initialized_node2.js b/jstests/replsets/disallow_adding_initialized_node2.js index e92fc77880b..58b76b520e3 100644 --- a/jstests/replsets/disallow_adding_initialized_node2.js +++ b/jstests/replsets/disallow_adding_initialized_node2.js @@ -19,7 +19,8 @@ nodes: [ {rsConfig: {_id: 10}}, {rsConfig: {_id: 11, arbiterOnly: true}}, - ] + ], + nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}} }); replSetA.startSet({dbpath: "$set-A-$node"}); replSetA.initiate(); diff --git a/jstests/replsets/isself_failure_initiate.js b/jstests/replsets/isself_failure_initiate.js new file mode 100644 index 00000000000..9def81e4dab --- /dev/null +++ b/jstests/replsets/isself_failure_initiate.js @@ -0,0 +1,29 @@ +/** + * Tests that replSetInitiate eventually succeeds despite temporary DNS outage. + * + * @tags: [ + * requires_fcv_47, + * ] + */ + +(function() { + 'use strict'; + + load("jstests/replsets/rslib.js"); + + const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]}); + const nodes = rst.startSet(); + // Node 1 will fail to find itself in the config. + assert.commandWorked( + nodes[1].adminCommand({configureFailPoint: 'failIsSelfCheck', mode: 'alwaysOn'})); + const config = rst.getReplSetConfig(); + assert.commandWorked(nodes[0].adminCommand({replSetInitiate: config})); + checkLog.contains(nodes[1], "failIsSelfCheck failpoint activated, returning false from isSelf"); + assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}), + ErrorCodes.NotYetInitialized); + assert.commandWorked( + nodes[1].adminCommand({configureFailPoint: 'failIsSelfCheck', mode: 'off'})); + // Node 1 re-checks isSelf on next heartbeat and succeeds. + waitForState(nodes[1], ReplSetTest.State.SECONDARY); + rst.stopSet(); +})(); diff --git a/jstests/replsets/isself_failure_reconfig.js b/jstests/replsets/isself_failure_reconfig.js new file mode 100644 index 00000000000..f94c81b0014 --- /dev/null +++ b/jstests/replsets/isself_failure_reconfig.js @@ -0,0 +1,42 @@ +/** + * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during + * reconfig. + * + * @tags: [ + * requires_fcv_47, + * ] + */ + +(function() { + "use strict"; + + load("jstests/replsets/rslib.js"); + + const rst = new ReplSetTest( + {nodes: 1, nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}}); + const nodes = rst.startSet(); + rst.initiate(); + + jsTestLog("Add second node to set, with isSelf() disabled by failpoint"); + + const newNode = rst.add({ + rsConfig: {priority: 0, votes: 0}, + setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"}) + }); + + rst.reInitiate(); + + jsTestLog("Await failpoint on second node"); + + checkLog.contains(newNode, "failIsSelfCheck failpoint activated, returning false from isSelf"); + + assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}), + ErrorCodes.NotYetInitialized); + + assert.commandWorked( + newNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"})); + + jsTestLog("New node re-checks isSelf and becomes secondary"); + waitForState(newNode, ReplSetTest.State.SECONDARY); + rst.stopSet(); +})(); diff --git a/jstests/replsets/isself_failure_restart.js b/jstests/replsets/isself_failure_restart.js new file mode 100644 index 00000000000..e575e324ba0 --- /dev/null +++ b/jstests/replsets/isself_failure_restart.js @@ -0,0 +1,44 @@ +/** + * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during + * startup. + * + * @tags: [ + * requires_fcv_47, + * requires_persistence + * ] + */ + +(function() { + "use strict"; + + load("jstests/replsets/rslib.js"); + + const rst = new ReplSetTest({ + nodes: [{}, {rsConfig: {priority: 0, votes: 0}}], + nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}} + }); + rst.startSet(); + rst.initiate(); + + const restartNode = + rst.restart(1, {setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})}); + + checkLog.contains( + restartNode, + "Locally stored replica set configuration does not have a valid entry for the current node; waiting for reconfig or remote heartbeat"); + waitForState(restartNode, ReplSetTest.State.REMOVED); + + // Clear the log. + const primary = rst.getPrimary(); + assert.commandWorked(primary.adminCommand({clearLog: 'global'})); + + // Secondary should send heartbeat responses with 'InvalidReplicaSetConfig'. + checkLog.contains(primary, /Received response to heartbeat(.*)InvalidReplicaSetConfig/); + + assert.commandWorked( + restartNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"})); + + // Node 1 re-checks isSelf on next heartbeat and succeeds. + waitForState(restartNode, ReplSetTest.State.SECONDARY); + rst.stopSet(); +})(); diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp index d6edbc7809e..43f79e970a4 100644 --- a/src/mongo/db/repl/heartbeat_response_action.cpp +++ b/src/mongo/db/repl/heartbeat_response_action.cpp @@ -63,6 +63,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeElectAction() { return result; } +HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() { + HeartbeatResponseAction result; + result._action = RetryReconfig; + return result; +} + HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) { HeartbeatResponseAction result; result._action = StepDownSelf; diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h index ee659deef64..c451a2c33e9 100644 --- a/src/mongo/db/repl/heartbeat_response_action.h +++ b/src/mongo/db/repl/heartbeat_response_action.h @@ -53,7 +53,8 @@ public: StepDownSelf, StepDownRemotePrimary, PriorityTakeover, - CatchupTakeover + CatchupTakeover, + RetryReconfig }; /** @@ -84,6 +85,12 @@ public: static HeartbeatResponseAction makeCatchupTakeoverAction(); /** + * Makes a new action telling the current node to attempt to find itself in its current replica + * set config again, in case the previous attempt's failure was due to a temporary DNS outage. + */ + static HeartbeatResponseAction makeRetryReconfigAction(); + + /** * Makes a new action telling the current node to step down as primary. * * It is an error to call this with primaryIndex != the index of the current node. diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp index 5f9ea8a7715..df2939dba63 100644 --- a/src/mongo/db/repl/isself.cpp +++ b/src/mongo/db/repl/isself.cpp @@ -47,6 +47,7 @@ #include "mongo/db/auth/privilege.h" #include "mongo/db/commands.h" #include "mongo/db/service_context.h" +#include "mongo/util/fail_point_service.h" #include "mongo/util/log.h" #include "mongo/util/net/socket_utils.h" #include "mongo/util/scopeguard.h" @@ -80,6 +81,8 @@ namespace mongo { namespace repl { +MONGO_FAIL_POINT_DEFINE(failIsSelfCheck); + OID instanceId; MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) { @@ -159,6 +162,12 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost, } // namespace bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) { + if (MONGO_FAIL_POINT(failIsSelfCheck)) { + log() << "failIsSelfCheck failpoint activated, returning false from isSelf; hostAndPort: " + << hostAndPort; + return false; + } + // Fastpath: check if the host&port in question is bound to one // of the interfaces on this machine. // No need for ip match if the ports do not match diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 10480a11cc3..de13a4eaa2d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -275,6 +275,9 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo case HeartbeatResponseAction::StartElection: _startElectSelf_inlock(); break; + case HeartbeatResponseAction::RetryReconfig: + _scheduleHeartbeatReconfig_inlock(_rsConfig); + break; case HeartbeatResponseAction::StepDownSelf: invariant(action.getPrimaryConfigIndex() == _selfIndex); if (_topCoord->prepareForUnconditionalStepDown()) { @@ -458,7 +461,7 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet } _setConfigState_inlock(kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0); if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) { LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to version " << newConfig.getConfigVersion() @@ -598,7 +601,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( invariant(_rsConfigState == kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0); // Do not conduct an election during a reconfig, as the node may not be electable post-reconfig. if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp index fc87738121c..269f7dbac5a 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp @@ -425,11 +425,6 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM ASSERT_EQ(host2, HostAndPort(member["name"].String())); ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(), MemberState(member["state"].numberInt()).toString()); - ASSERT_EQ(member["lastHeartbeatMessage"].String(), - std::string(str::stream() << "replica set IDs do not match, ours: " - << rsConfig.getReplicaSetId() - << "; remote node's: " - << unexpectedId)); } } // namespace diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index ede25322ced..5a9cd0c332c 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -1123,11 +1123,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } - // If we're not in the config, we don't need to respond to heartbeats. + // This server is not in the config, either because it was removed or a DNS error finding self. if (_selfIndex == -1) { - LOG(1) << "Could not find ourself in current config so ignoring heartbeat from " << target - << " -- current config: " << _rsConfig.toBSON(); - HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction(); + LOG(1) << "Could not find self in current config, retrying DNS resolution of members " + << target << " -- current config: " << _rsConfig.toBSON(); + HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeRetryReconfigAction(); nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index 84c94741ef2..b971aec30d5 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -1019,7 +1019,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) { ASSERT_NO_ACTION(action.getAction()); } -TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { +TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) { updateConfig(BSON("_id" << "rs0" << "version" @@ -1032,12 +1032,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { << BSON("_id" << 30 << "host" << "h3"))), -1); - ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"), - "rs0", - MemberState::RS_SECONDARY, - OpTime(Timestamp(1, 0), 0), - Milliseconds(300)) - .getAction()); + ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig, + heartbeatFromMember(HostAndPort("h2"), + "rs0", + MemberState::RS_SECONDARY, + OpTime(Timestamp(1, 0), 0), + Milliseconds(300)) + .getAction()); } TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) { |