summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA. Jesse Jiryu Davis <jesse@mongodb.com>2020-09-21 08:48:50 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-09-21 13:17:03 +0000
commit418c61279986a5eaddc66a16b5e288556ad1f6d3 (patch)
treeb56c929fe926d0888c1b1b4d44e12a0a3e18e9d1
parent2cdeee7b845751e7e50b2a5a78641ba4b4680aa2 (diff)
downloadmongo-418c61279986a5eaddc66a16b5e288556ad1f6d3.tar.gz
SERVER-35649 Retry DNS failures during reconfig
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js2
-rw-r--r--jstests/replsets/disallow_adding_initialized_node1.js25
-rw-r--r--jstests/replsets/isself_failure_initiate.js28
-rw-r--r--jstests/replsets/isself_failure_reconfig.js46
-rw-r--r--jstests/replsets/isself_failure_restart.js44
-rw-r--r--src/mongo/db/repl/heartbeat_response_action.cpp6
-rw-r--r--src/mongo/db/repl/heartbeat_response_action.h15
-rw-r--r--src/mongo/db/repl/isself.cpp9
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp9
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp6
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp21
-rw-r--r--src/mongo/db/repl/topology_coordinator_v1_test.cpp15
13 files changed, 194 insertions, 34 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 25c16def801..22f5a7053e3 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -90,6 +90,8 @@ suites:
concurrency_sharded_replication_multiversion:
replica_sets_multiversion:
+ - ticket: SERVER-35649
+ test_file: jstests/replsets/disallow_adding_initialized_node1.js
replica_sets_jscore_multiversion_passthrough:
diff --git a/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js b/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js
index 2c078261513..ae75d4a948e 100644
--- a/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js
+++ b/jstests/replsets/awaitable_ismaster_on_nodes_with_invalid_configs.js
@@ -175,4 +175,4 @@ assert.eq(0, numAwaitingTopologyChangeOnPrimary);
assert.eq(0, numAwaitingTopologyChangeOnSecondary);
replTest.stopSet();
-})(); \ No newline at end of file
+})();
diff --git a/jstests/replsets/disallow_adding_initialized_node1.js b/jstests/replsets/disallow_adding_initialized_node1.js
index bc6d3ed764a..2a9775e4e38 100644
--- a/jstests/replsets/disallow_adding_initialized_node1.js
+++ b/jstests/replsets/disallow_adding_initialized_node1.js
@@ -12,7 +12,8 @@ var replSetA = new ReplSetTest({
name: name,
nodes: [
{rsConfig: {_id: 10}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetA.startSet({dbpath: "$set-A-$node"});
replSetA.initiate();
@@ -21,7 +22,8 @@ var replSetB = new ReplSetTest({
name: name,
nodes: [
{rsConfig: {_id: 20}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetB.startSet({dbpath: "$set-B-$node"});
replSetB.initiate();
@@ -54,14 +56,19 @@ configA.version++;
configA.members.push({_id: 11, host: primaryB.host, votes: 0, priority: 0});
assert.commandWorked(primaryA.adminCommand({replSetReconfig: configA}));
-// Wait for primary A to report primary B down. B should reject all heartbeats from A due to a
-// replset name mismatch, leading A to consider it down.
+// Wait for primary A to report primary B UNKNOWN.
assert.soon(function() {
const statusA = assert.commandWorked(primaryA.adminCommand({replSetGetStatus: 1}));
if (statusA.members.length !== 2) {
return false;
}
- return statusA.members[1].state === ReplSetTest.State.DOWN;
+ return statusA.members[1].state === ReplSetTest.State.UNKNOWN;
+});
+
+checkLog.containsJson(primaryA, 4615621, {
+ error: (error) => {
+ return error.code === 93 && error.errmsg.indexOf("replica set IDs do not match") !== -1;
+ }
});
// Confirm that each set still has the correct primary.
@@ -72,7 +79,7 @@ jsTestLog('After merging with 0 votes: primary A = ' + newPrimaryA.host +
assert.eq(primaryA, newPrimaryA);
assert.eq(primaryB, newPrimaryB);
-// Replica set A's config should include primary B and consider it DOWN.
+// Replica set A's config should include primary B and consider it UNKNOWN.
let statusA = assert.commandWorked(primaryA.adminCommand({replSetGetStatus: 1}));
jsTestLog('After merging with 0 votes: replica set status A = ' + tojson(statusA));
assert.eq(2, statusA.members.length);
@@ -81,7 +88,7 @@ assert.eq(primaryA.host, statusA.members[0].name);
assert.eq(ReplSetTest.State.PRIMARY, statusA.members[0].state);
assert.eq(11, statusA.members[1]._id);
assert.eq(primaryB.host, statusA.members[1].name);
-assert.eq(ReplSetTest.State.DOWN, statusA.members[1].state);
+assert.eq(ReplSetTest.State.UNKNOWN, statusA.members[1].state);
// Replica set B's config should remain unchanged.
let statusB = assert.commandWorked(primaryB.adminCommand({replSetGetStatus: 1}));
@@ -115,7 +122,7 @@ var msgB = "replica set IDs do not match, ours: " + configB.settings.replicaSetI
"; remote node's: " + configA.settings.replicaSetId;
checkLog.contains(primaryB, msgB);
-// Confirm primary B is still DOWN.
+// Confirm primary B is still UNKNOWN.
statusA = assert.commandWorked(primaryA.adminCommand({replSetGetStatus: 1}));
jsTestLog('After merging: replica set status A = ' + tojson(statusA));
assert.eq(2, statusA.members.length);
@@ -124,7 +131,7 @@ assert.eq(primaryA.host, statusA.members[0].name);
assert.eq(ReplSetTest.State.PRIMARY, statusA.members[0].state);
assert.eq(11, statusA.members[1]._id);
assert.eq(primaryB.host, statusA.members[1].name);
-assert.eq(ReplSetTest.State.DOWN, statusA.members[1].state);
+assert.eq(ReplSetTest.State.UNKNOWN, statusA.members[1].state);
// Replica set B's config should remain unchanged.
statusB = assert.commandWorked(primaryB.adminCommand({replSetGetStatus: 1}));
diff --git a/jstests/replsets/isself_failure_initiate.js b/jstests/replsets/isself_failure_initiate.js
new file mode 100644
index 00000000000..dbeb0a499c9
--- /dev/null
+++ b/jstests/replsets/isself_failure_initiate.js
@@ -0,0 +1,28 @@
+/**
+ * Tests that replSetInitiate eventually succeeds despite temporary DNS outage.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * ]
+ */
+
+(function() {
+'use strict';
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]});
+const nodes = rst.startSet();
+// Node 1 will fail to find itself in the config.
+const failPoint = configureFailPoint(nodes[1], "failIsSelfCheck");
+const config = rst.getReplSetConfig();
+assert.commandWorked(nodes[0].adminCommand({replSetInitiate: config}));
+failPoint.wait();
+assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}),
+ ErrorCodes.NotYetInitialized);
+failPoint.off();
+// Node 1 re-checks isSelf on next heartbeat and succeeds.
+waitForState(nodes[1], ReplSetTest.State.SECONDARY);
+rst.stopSet();
+})();
diff --git a/jstests/replsets/isself_failure_reconfig.js b/jstests/replsets/isself_failure_reconfig.js
new file mode 100644
index 00000000000..61051309b32
--- /dev/null
+++ b/jstests/replsets/isself_failure_reconfig.js
@@ -0,0 +1,46 @@
+/**
+ * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during
+ * reconfig.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+const rst = new ReplSetTest(
+ {nodes: 1, nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}});
+const nodes = rst.startSet();
+rst.initiate();
+
+jsTestLog("Add second node to set, with isSelf() disabled by failpoint");
+
+const newNode = rst.add({
+ rsConfig: {priority: 0, votes: 0},
+ setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})
+});
+
+rst.reInitiate();
+
+jsTestLog("Await failpoint on second node");
+
+assert.commandWorked(newNode.adminCommand({
+ waitForFailPoint: "failIsSelfCheck",
+ timesEntered: 1,
+ maxTimeMS: kDefaultWaitForFailPointTimeout
+}));
+
+assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}),
+ ErrorCodes.NotYetInitialized);
+
+assert.commandWorked(newNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
+
+jsTestLog("New node re-checks isSelf and becomes secondary");
+waitForState(newNode, ReplSetTest.State.SECONDARY);
+rst.stopSet();
+})();
diff --git a/jstests/replsets/isself_failure_restart.js b/jstests/replsets/isself_failure_restart.js
new file mode 100644
index 00000000000..d9f35d02f8a
--- /dev/null
+++ b/jstests/replsets/isself_failure_restart.js
@@ -0,0 +1,44 @@
+/**
+ * Tests that a replica retries on each heartbeat if isSelf fails due to temporary DNS outage during
+ * startup.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * requires_persistence
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+const rst = new ReplSetTest({
+ nodes: [{}, {rsConfig: {priority: 0, votes: 0}}],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}
+});
+rst.startSet();
+rst.initiate();
+
+const restartNode =
+ rst.restart(1, {setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})});
+
+// "Locally stored replica set configuration does not have a valid entry for the current node".
+checkLog.containsJson(restartNode, 21405);
+waitForState(restartNode, ReplSetTest.State.REMOVED);
+
+// "Received response to heartbeat".
+checkLog.containsJson(rst.getPrimary(), 4615620, {
+ response: (response) => {
+ return response.codeName === "InvalidReplicaSetConfig";
+ }
+});
+
+assert.commandWorked(
+ restartNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
+
+// Node 1 re-checks isSelf on next heartbeat and succeeds.
+waitForState(restartNode, ReplSetTest.State.SECONDARY);
+rst.stopSet();
+})();
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp
index 2f2a9072001..8b84c23bac9 100644
--- a/src/mongo/db/repl/heartbeat_response_action.cpp
+++ b/src/mongo/db/repl/heartbeat_response_action.cpp
@@ -56,6 +56,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeCatchupTakeoverAction() {
return result;
}
+HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() {
+ HeartbeatResponseAction result;
+ result._action = RetryReconfig;
+ return result;
+}
+
HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) {
HeartbeatResponseAction result;
result._action = StepDownSelf;
diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h
index b193239a943..a030761ecaf 100644
--- a/src/mongo/db/repl/heartbeat_response_action.h
+++ b/src/mongo/db/repl/heartbeat_response_action.h
@@ -45,7 +45,14 @@ public:
/**
* Actions taken based on heartbeat responses
*/
- enum Action { NoAction, Reconfig, StepDownSelf, PriorityTakeover, CatchupTakeover };
+ enum Action {
+ NoAction,
+ Reconfig,
+ StepDownSelf,
+ PriorityTakeover,
+ CatchupTakeover,
+ RetryReconfig
+ };
/**
* Makes a new action representing doing nothing.
@@ -70,6 +77,12 @@ public:
static HeartbeatResponseAction makeCatchupTakeoverAction();
/**
+ * Makes a new action telling the current node to attempt to find itself in its current replica
+ * set config again, in case the previous attempt's failure was due to a temporary DNS outage.
+ */
+ static HeartbeatResponseAction makeRetryReconfigAction();
+
+ /**
* Makes a new action telling the current node to step down as primary.
*
* It is an error to call this with primaryIndex != the index of the current node.
diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp
index 4e6a544dd5b..1c561ff9f5d 100644
--- a/src/mongo/db/repl/isself.cpp
+++ b/src/mongo/db/repl/isself.cpp
@@ -78,6 +78,8 @@
namespace mongo {
namespace repl {
+MONGO_FAIL_POINT_DEFINE(failIsSelfCheck);
+
OID instanceId;
MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) {
@@ -163,6 +165,13 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost,
} // namespace
bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) {
+ if (MONGO_unlikely(failIsSelfCheck.shouldFail())) {
+ LOGV2(356490,
+ "failIsSelfCheck failpoint activated, returning false from isSelf",
+ "hostAndPort"_attr = hostAndPort);
+ return false;
+ }
+
// Fastpath: check if the host&port in question is bound to one
// of the interfaces on this machine.
// No need for ip match if the ports do not match
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 9ac897a22a8..1f0684dc66f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -371,6 +371,9 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct
invariant(responseStatus.isOK());
_scheduleHeartbeatReconfig(lock, responseStatus.getValue().getConfig());
break;
+ case HeartbeatResponseAction::RetryReconfig:
+ _scheduleHeartbeatReconfig(lock, _rsConfig);
+ break;
case HeartbeatResponseAction::StepDownSelf:
invariant(action.getPrimaryConfigIndex() == _selfIndex);
if (_topCoord->prepareForUnconditionalStepDown()) {
@@ -602,7 +605,8 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig(WithLock lk,
_setConfigState_inlock(kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm());
+ _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm() ||
+ _selfIndex < 0);
_replExecutor
->scheduleWork([=](const executor::TaskExecutor::CallbackArgs& cbData) {
_heartbeatReconfigStore(cbData, newConfig);
@@ -809,7 +813,8 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
invariant(_rsConfigState == kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm());
+ _rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm() ||
+ _selfIndex < 0);
if (!myIndex.isOK()) {
switch (myIndex.getStatus().code()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
index 736e76d2368..ebf5162a8c6 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
@@ -1035,12 +1035,8 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM
ASSERT_TRUE(members[1].isABSONObj());
auto member = members[1].Obj();
ASSERT_EQ(host2, HostAndPort(member["name"].String()));
- ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(),
+ ASSERT_EQ(MemberState(MemberState::RS_UNKNOWN).toString(),
MemberState(member["state"].numberInt()).toString());
- ASSERT_EQ(member["lastHeartbeatMessage"].String(),
- std::string(str::stream()
- << "replica set IDs do not match, ours: " << rsConfig.getReplicaSetId()
- << "; remote node's: " << unexpectedId));
}
TEST_F(ReplCoordHBV1Test,
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index c74d23b5ff9..c755830380a 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -936,7 +936,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
invariant(hbStats.getLastHeartbeatStartDate() != Date_t());
const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) ||
(hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed);
- if (hbResponse.isOK() || isUnauthorized) {
+ const bool isInvalid = hbResponse.getStatus().code() == ErrorCodes::InvalidReplicaSetConfig;
+
+ // Replication of auth changes can cause temporary auth failures, and a temporary DNS outage can
+ // make a node return InvalidReplicaSetConfig if it can't find itself in the config.
+ if (hbResponse.isOK() || isUnauthorized || isInvalid) {
hbStats.hit(networkRoundTripTime);
} else {
hbStats.miss();
@@ -1030,15 +1034,14 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
if (!_rsConfig.isInitialized()) {
return nextAction;
}
- // If we're not in the config, we don't need to respond to heartbeats.
+ // This server is not in the config, either because it was removed or a DNS error finding self.
if (_selfIndex == -1) {
- LOGV2_DEBUG(21805,
- 1,
- "Could not find ourself in current config so ignoring heartbeat from {target} "
- "-- current config: {currentConfig}",
- "Could not find ourself in current config so ignoring heartbeat",
- "target"_attr = target,
- "currentConfig"_attr = _rsConfig.toBSON());
+ LOGV2(3564900,
+ "Could not find self in current config, retrying DNS resolution of members",
+ "target"_attr = target,
+ "currentConfig"_attr = _rsConfig.toBSON());
+ nextAction = HeartbeatResponseAction::makeRetryReconfigAction();
+ nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
const int memberIndex = _rsConfig.findMemberIndexByHostAndPort(target);
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index d0625259077..1f9b5d84591 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1311,7 +1311,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) {
ASSERT_NO_ACTION(action.getAction());
}
-TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
+TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) {
updateConfig(BSON("_id"
<< "rs0"
<< "version" << 1 << "members"
@@ -1322,12 +1322,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
<< BSON("_id" << 30 << "host"
<< "h3"))),
-1);
- ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"),
- "rs0",
- MemberState::RS_SECONDARY,
- OpTime(Timestamp(1, 0), 0),
- Milliseconds(300))
- .getAction());
+ ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig,
+ heartbeatFromMember(HostAndPort("h2"),
+ "rs0",
+ MemberState::RS_SECONDARY,
+ OpTime(Timestamp(1, 0), 0),
+ Milliseconds(300))
+ .getAction());
}
TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) {