author     A. Jesse Jiryu Davis <jesse@mongodb.com>          2020-09-21 08:48:50 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-02-04 19:33:48 +0000
commit     d1871f0fb5f61159aecc76865a4b8a3f5d0159fd (patch)
tree       3f184e6d5d163792a06b8baa99e03d8a64705f10
parent     0431757da259bc5586d69477775319cb49346a0f (diff)
download   mongo-d1871f0fb5f61159aecc76865a4b8a3f5d0159fd.tar.gz
SERVER-35649 Retry DNS failures during reconfig
(cherry picked from commit 418c61279986a5eaddc66a16b5e288556ad1f6d3)
-rw-r--r--  etc/backports_required_for_multiversion_tests.yml | 2
-rw-r--r--  jstests/replsets/disallow_adding_initialized_node1.js | 6
-rw-r--r--  jstests/replsets/isself_failure_initiate.js | 28
-rw-r--r--  jstests/replsets/isself_failure_reconfig.js | 46
-rw-r--r--  jstests/replsets/isself_failure_restart.js | 45
-rw-r--r--  src/mongo/db/repl/heartbeat_response_action.cpp | 6
-rw-r--r--  src/mongo/db/repl/heartbeat_response_action.h | 15
-rw-r--r--  src/mongo/db/repl/isself.cpp | 8
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 7
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp | 6
-rw-r--r--  src/mongo/db/repl/topology_coordinator.cpp | 10
-rw-r--r--  src/mongo/db/repl/topology_coordinator_v1_test.cpp | 15
12 files changed, 174 insertions, 20 deletions
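
In short: when a node cannot find itself in a replica set config (typically because a transient DNS failure keeps isSelf() from matching its own host name), the topology coordinator now returns a new RetryReconfig heartbeat action instead of ignoring further heartbeats, and the replication coordinator re-runs the heartbeat reconfig against its current config. The node therefore re-checks isSelf() on every heartbeat and joins the set once name resolution recovers, rather than staying stuck until a restart. A minimal mongo-shell sketch of exercising this with the new failIsSelfCheck failpoint (the host name is a placeholder, and the target mongod must have test commands enabled, e.g. --setParameter enableTestCommands=1):

    // Connect to the member whose "DNS" we want to break (placeholder address).
    const node = new Mongo("node2.example.net:27017");

    // Force isSelf() to return false, standing in for a DNS outage.
    assert.commandWorked(
        node.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "alwaysOn"}));

    // While the failpoint is active the node cannot recognize its own entry in
    // any config it receives, so a reconfig that adds it leaves it out of the set.
    printjson(node.adminCommand({replSetGetStatus: 1}));

    // "Restore DNS": on the next heartbeat the node re-checks isSelf() and, with
    // this change, retries the reconfig and becomes a secondary.
    assert.commandWorked(
        node.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
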
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 90709ff7ea9..be5c138cf31 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -113,6 +113,8 @@ suites:
test_file: jstests/replsets/rollback_via_refetch_anomaly.js
- ticket: SERVER-47645
test_file: jstests/replsets/invalidate_sessions_on_stepdown.js
+ - ticket: SERVER-35649
+ test_file: jstests/replsets/disallow_adding_initialized_node1.js
sharding_multiversion:
diff --git a/jstests/replsets/disallow_adding_initialized_node1.js b/jstests/replsets/disallow_adding_initialized_node1.js
index d9b953d874a..c414b0ccc9b 100644
--- a/jstests/replsets/disallow_adding_initialized_node1.js
+++ b/jstests/replsets/disallow_adding_initialized_node1.js
@@ -12,7 +12,8 @@ var replSetA = new ReplSetTest({
name: name,
nodes: [
{rsConfig: {_id: 10}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetA.startSet({dbpath: "$set-A-$node"});
replSetA.initiate();
@@ -21,7 +22,8 @@ var replSetB = new ReplSetTest({
name: name,
nodes: [
{rsConfig: {_id: 20}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetB.startSet({dbpath: "$set-B-$node"});
replSetB.initiate();
diff --git a/jstests/replsets/isself_failure_initiate.js b/jstests/replsets/isself_failure_initiate.js
new file mode 100644
index 00000000000..dbeb0a499c9
--- /dev/null
+++ b/jstests/replsets/isself_failure_initiate.js
@@ -0,0 +1,28 @@
+/**
+ * Tests that replSetInitiate eventually succeeds despite a temporary DNS outage.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * ]
+ */
+
+(function() {
+'use strict';
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]});
+const nodes = rst.startSet();
+// Node 1 will fail to find itself in the config.
+const failPoint = configureFailPoint(nodes[1], "failIsSelfCheck");
+const config = rst.getReplSetConfig();
+assert.commandWorked(nodes[0].adminCommand({replSetInitiate: config}));
+failPoint.wait();
+assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}),
+                             ErrorCodes.NotYetInitialized);
+failPoint.off();
+// Node 1 re-checks isSelf on next heartbeat and succeeds.
+waitForState(nodes[1], ReplSetTest.State.SECONDARY);
+rst.stopSet();
+})();
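
The test above drives the failpoint through the configureFailPoint() helper from jstests/libs/fail_point_util.js rather than issuing admin commands by hand. A condensed sketch of what the three helper calls correspond to (the comments paraphrase the helper; conn stands for the connection to nodes[1]):

    load("jstests/libs/fail_point_util.js");

    const fp = configureFailPoint(conn, "failIsSelfCheck");
    // ^ issues {configureFailPoint: "failIsSelfCheck", mode: "alwaysOn"} against conn.

    fp.wait();
    // ^ blocks, via the waitForFailPoint command, until the server has entered
    //   the failpoint since it was configured.

    fp.off();
    // ^ issues {configureFailPoint: "failIsSelfCheck", mode: "off"}.
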
diff --git a/jstests/replsets/isself_failure_reconfig.js b/jstests/replsets/isself_failure_reconfig.js
new file mode 100644
index 00000000000..61051309b32
--- /dev/null
+++ b/jstests/replsets/isself_failure_reconfig.js
@@ -0,0 +1,46 @@
+/**
+ * Tests that a replica retries on each heartbeat if isSelf fails due to a temporary DNS outage during
+ * reconfig.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+const rst = new ReplSetTest(
+ {nodes: 1, nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}});
+const nodes = rst.startSet();
+rst.initiate();
+
+jsTestLog("Add second node to set, with isSelf() disabled by failpoint");
+
+const newNode = rst.add({
+ rsConfig: {priority: 0, votes: 0},
+ setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})
+});
+
+rst.reInitiate();
+
+jsTestLog("Await failpoint on second node");
+
+assert.commandWorked(newNode.adminCommand({
+ waitForFailPoint: "failIsSelfCheck",
+ timesEntered: 1,
+ maxTimeMS: kDefaultWaitForFailPointTimeout
+}));
+
+assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}),
+                             ErrorCodes.NotYetInitialized);
+
+assert.commandWorked(newNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
+
+jsTestLog("New node re-checks isSelf and becomes secondary");
+waitForState(newNode, ReplSetTest.State.SECONDARY);
+rst.stopSet();
+})();
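
Unlike the previous test, this one needs the failpoint active before the new node processes its first config, so it is enabled at startup through the "failpoint.<name>" server parameter and then synchronized on with the raw waitForFailPoint command. A minimal sketch of that startup form outside ReplSetTest (illustrative; MongoRunner and kDefaultWaitForFailPointTimeout come from the jstest shell environment, and the failpoint only fires once the node evaluates a replica set config naming it):

    load("jstests/libs/fail_point_util.js");

    // Start a --replSet node with the failpoint already on.
    const conn = MongoRunner.runMongod({
        replSet: "rs0",  // illustrative set name
        setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})
    });

    // In a real test the node would now be added to a set, as above; only then
    // does it evaluate a config and enter the failpoint.
    assert.commandWorked(conn.adminCommand({
        waitForFailPoint: "failIsSelfCheck",
        timesEntered: 1,
        maxTimeMS: kDefaultWaitForFailPointTimeout
    }));

    // Disable it again at runtime.
    assert.commandWorked(
        conn.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));

    MongoRunner.stopMongod(conn);
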
diff --git a/jstests/replsets/isself_failure_restart.js b/jstests/replsets/isself_failure_restart.js
new file mode 100644
index 00000000000..6b0f880d2a1
--- /dev/null
+++ b/jstests/replsets/isself_failure_restart.js
@@ -0,0 +1,45 @@
+/**
+ * Tests that a replica retries on each heartbeat if isSelf fails due to a temporary DNS outage during
+ * startup.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * requires_persistence
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+const rst = new ReplSetTest({
+ nodes: [{}, {rsConfig: {priority: 0, votes: 0}}],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}
+});
+rst.startSet();
+rst.initiate();
+
+const restartNode =
+ rst.restart(1, {setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})});
+
+checkLog.contains(
+    restartNode,
+    "Locally stored replica set configuration does not have a valid entry for the current node; waiting for reconfig or remote heartbeat");
+waitForState(restartNode, ReplSetTest.State.REMOVED);
+
+// Clear the log.
+const primary = rst.getPrimary();
+assert.commandWorked(primary.adminCommand({clearLog: 'global'}));
+
+// Secondary should send heartbeat responses with 'InvalidReplicaSetConfig'.
+checkLog.contains(primary, /Received response to heartbeat(.*)InvalidReplicaSetConfig/);
+
+assert.commandWorked(
+    restartNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
+
+// Node 1 re-checks isSelf on next heartbeat and succeeds.
+waitForState(restartNode, ReplSetTest.State.SECONDARY);
+rst.stopSet();
+})();
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp
index 44feca0a32e..c21dbd0cf53 100644
--- a/src/mongo/db/repl/heartbeat_response_action.cpp
+++ b/src/mongo/db/repl/heartbeat_response_action.cpp
@@ -56,6 +56,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeCatchupTakeoverAction() {
return result;
}
+HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() {
+ HeartbeatResponseAction result;
+ result._action = RetryReconfig;
+ return result;
+}
+
HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) {
HeartbeatResponseAction result;
result._action = StepDownSelf;
diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h
index aa52c21eee9..9f0b3e4da8c 100644
--- a/src/mongo/db/repl/heartbeat_response_action.h
+++ b/src/mongo/db/repl/heartbeat_response_action.h
@@ -45,7 +45,14 @@ public:
/**
* Actions taken based on heartbeat responses
*/
- enum Action { NoAction, Reconfig, StepDownSelf, PriorityTakeover, CatchupTakeover };
+ enum Action {
+ NoAction,
+ Reconfig,
+ StepDownSelf,
+ PriorityTakeover,
+ CatchupTakeover,
+ RetryReconfig
+ };
/**
* Makes a new action representing doing nothing.
@@ -70,6 +77,12 @@ public:
static HeartbeatResponseAction makeCatchupTakeoverAction();
/**
+ * Makes a new action telling the current node to attempt to find itself in its current replica
+ * set config again, in case the previous attempt's failure was due to a temporary DNS outage.
+ */
+ static HeartbeatResponseAction makeRetryReconfigAction();
+
+ /**
* Makes a new action telling the current node to step down as primary.
*
* It is an error to call this with primaryIndex != the index of the current node.
diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp
index 3d62a2b60b4..427892bb6c5 100644
--- a/src/mongo/db/repl/isself.cpp
+++ b/src/mongo/db/repl/isself.cpp
@@ -77,6 +77,8 @@
namespace mongo {
namespace repl {
+MONGO_FAIL_POINT_DEFINE(failIsSelfCheck);
+
OID instanceId;
MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) {
@@ -156,6 +158,12 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost,
} // namespace
bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) {
+ if (MONGO_unlikely(failIsSelfCheck.shouldFail())) {
+ log() << "failIsSelfCheck failpoint activated, returning false from isSelf; hostAndPort: "
+ << hostAndPort;
+ return false;
+ }
+
// Fastpath: check if the host&port in question is bound to one
// of the interfaces on this machine.
// No need for ip match if the ports do not match
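
The failpoint added here is the only server-side hook the new tests need: when active, isSelf() returns false unconditionally and logs the line shown in this hunk, which makes a healthy node indistinguishable, from the replication coordinator's point of view, from one whose host name cannot currently be resolved. If a test needs to confirm the failpoint actually fired, that log line can also be awaited directly (illustrative; checkLog comes from the jstest shell environment and conn is a connection to the affected node):

    // Wait until the node has actually hit the failpoint at least once.
    checkLog.contains(conn, "failIsSelfCheck failpoint activated, returning false from isSelf");
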
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index f10b3b4b52c..c993d696a0d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -283,6 +283,9 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct
invariant(responseStatus.isOK());
_scheduleHeartbeatReconfig_inlock(responseStatus.getValue().getConfig());
break;
+ case HeartbeatResponseAction::RetryReconfig:
+ _scheduleHeartbeatReconfig_inlock(_rsConfig);
+ break;
case HeartbeatResponseAction::StepDownSelf:
invariant(action.getPrimaryConfigIndex() == _selfIndex);
if (_topCoord->prepareForUnconditionalStepDown()) {
@@ -482,7 +485,7 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet
}
_setConfigState_inlock(kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+ _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0);
if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to version "
<< newConfig.getConfigVersion()
@@ -674,7 +677,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
invariant(_rsConfigState == kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+ _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0);
if (!myIndex.isOK()) {
switch (myIndex.getStatus().code()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
index 3099894bcfb..0ef88f6001c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
@@ -426,12 +426,8 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM
ASSERT_TRUE(members[1].isABSONObj());
auto member = members[1].Obj();
ASSERT_EQ(host2, HostAndPort(member["name"].String()));
- ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(),
+ ASSERT_EQ(MemberState(MemberState::RS_UNKNOWN).toString(),
MemberState(member["state"].numberInt()).toString());
- ASSERT_EQ(member["lastHeartbeatMessage"].String(),
- std::string(str::stream()
- << "replica set IDs do not match, ours: " << rsConfig.getReplicaSetId()
- << "; remote node's: " << unexpectedId));
}
TEST_F(ReplCoordHBV1Test,
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index f8ea5428661..1df593d8330 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -755,7 +755,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
invariant(hbStats.getLastHeartbeatStartDate() != Date_t());
const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) ||
(hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed);
- if (hbResponse.isOK() || isUnauthorized) {
+ const bool isInvalid = hbResponse.getStatus().code() == ErrorCodes::InvalidReplicaSetConfig;
+
+ // Replication of auth changes can cause temporary auth failures, and a temporary DNS outage can
+ // make a node return InvalidReplicaSetConfig if it can't find itself in the config.
+ if (hbResponse.isOK() || isUnauthorized || isInvalid) {
hbStats.hit(networkRoundTripTime);
} else {
hbStats.miss();
@@ -826,11 +830,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
- // If we're not in the config, we don't need to respond to heartbeats.
+    // This server is not in the config, either because it was removed or because a DNS error kept it from finding itself.
if (_selfIndex == -1) {
LOG(1) << "Could not find ourself in current config so ignoring heartbeat from " << target
<< " -- current config: " << _rsConfig.toBSON();
- HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction();
+ HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeRetryReconfigAction();
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
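
Two behavioral changes land in this file: an InvalidReplicaSetConfig heartbeat error is now treated like the temporary auth failures above (a hit for heartbeat statistics rather than a miss), and a node that cannot find itself in the config (_selfIndex == -1) answers each incoming heartbeat with RetryReconfig instead of NoAction, which the replication coordinator turns into another _scheduleHeartbeatReconfig_inlock() against the current config. From a test, the retry loop on the stuck node can be observed through the existing LOG(1) line in this hunk (illustrative; node is a connection to the member that is absent from the config):

    // Raise replication verbosity at runtime, then watch the node keep noticing
    // that it is not in the config on each heartbeat it receives.
    assert.commandWorked(
        node.adminCommand({setParameter: 1, logComponentVerbosity: {replication: 2}}));
    checkLog.contains(node, "Could not find ourself in current config");
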
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index db2a4fbc433..e2ce2fd9314 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1596,7 +1596,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) {
ASSERT_NO_ACTION(action.getAction());
}
-TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
+TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) {
updateConfig(BSON("_id"
<< "rs0"
<< "version" << 1 << "members"
@@ -1607,12 +1607,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
<< BSON("_id" << 30 << "host"
<< "h3"))),
-1);
- ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"),
- "rs0",
- MemberState::RS_SECONDARY,
- OpTime(Timestamp(1, 0), 0),
- Milliseconds(300))
- .getAction());
+ ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig,
+ heartbeatFromMember(HostAndPort("h2"),
+ "rs0",
+ MemberState::RS_SECONDARY,
+ OpTime(Timestamp(1, 0), 0),
+ Milliseconds(300))
+ .getAction());
}
TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) {