author     Xuerui Fa <xuerui.fa@mongodb.com>  2021-03-10 11:41:52 -0500
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-03-11 15:36:06 +0000
commit     6156b718630344212bea269a28ed5bacf96a9b86 (patch)
tree       2608210f07a60f9172217ffd7e010c12992f90e5
parent     59dcc00af2ed0091088081449fa47fac73943cac (diff)
download   mongo-6156b718630344212bea269a28ed5bacf96a9b86.tar.gz
SERVER-35649 Retry DNS failures during reconfig
(cherry picked from commit 418c61279986a5eaddc66a16b5e288556ad1f6d3)

SERVER-51163 Mark nodes returning InvalidReplicaSetConfig in heartbeats as down
(cherry picked from commit 28c6948a2a02760a69aaee3875c4b2a427528a5a)
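All three new tests below drive this path with the same failpoint, defined in src/mongo/db/repl/isself.cpp. A minimal sketch of the pattern, assuming a running ReplSetTest and a connection 'conn' to the node whose isSelf check should fail:

    // Force isSelf() to return false, as a temporary DNS outage would.
    assert.commandWorked(
        conn.adminCommand({configureFailPoint: 'failIsSelfCheck', mode: 'alwaysOn'}));
    // While the failpoint is active the node cannot find itself in any config it
    // receives, so it stays uninitialized (or REMOVED after a restart).
    assert.commandWorked(
        conn.adminCommand({configureFailPoint: 'failIsSelfCheck', mode: 'off'}));
    // With this patch the node retries the isSelf check on its next heartbeat
    // and joins the set as a SECONDARY instead of staying stuck.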
-rw-r--r--  jstests/replsets/disallow_adding_initialized_node1.js                  6
-rw-r--r--  jstests/replsets/disallow_adding_initialized_node2.js                  3
-rw-r--r--  jstests/replsets/isself_failure_initiate.js                           29
-rw-r--r--  jstests/replsets/isself_failure_reconfig.js                           42
-rw-r--r--  jstests/replsets/isself_failure_restart.js                            44
-rw-r--r--  src/mongo/db/repl/heartbeat_response_action.cpp                        6
-rw-r--r--  src/mongo/db/repl/heartbeat_response_action.h                          9
-rw-r--r--  src/mongo/db/repl/isself.cpp                                           9
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp           7
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp   5
-rw-r--r--  src/mongo/db/repl/topology_coordinator.cpp                             8
-rw-r--r--  src/mongo/db/repl/topology_coordinator_v1_test.cpp                    15
12 files changed, 161 insertions, 22 deletions
diff --git a/jstests/replsets/disallow_adding_initialized_node1.js b/jstests/replsets/disallow_adding_initialized_node1.js
index 910e71c7d8c..4a9e80e6d09 100644
--- a/jstests/replsets/disallow_adding_initialized_node1.js
+++ b/jstests/replsets/disallow_adding_initialized_node1.js
@@ -13,7 +13,8 @@
name: name,
nodes: [
{rsConfig: {_id: 10}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetA.startSet({dbpath: "$set-A-$node"});
replSetA.initiate();
@@ -22,7 +23,8 @@
name: name,
nodes: [
{rsConfig: {_id: 20}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetB.startSet({dbpath: "$set-B-$node"});
replSetB.initiate();
diff --git a/jstests/replsets/disallow_adding_initialized_node2.js b/jstests/replsets/disallow_adding_initialized_node2.js
index e92fc77880b..58b76b520e3 100644
--- a/jstests/replsets/disallow_adding_initialized_node2.js
+++ b/jstests/replsets/disallow_adding_initialized_node2.js
@@ -19,7 +19,8 @@
nodes: [
{rsConfig: {_id: 10}},
{rsConfig: {_id: 11, arbiterOnly: true}},
- ]
+ ],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}}
});
replSetA.startSet({dbpath: "$set-A-$node"});
replSetA.initiate();
diff --git a/jstests/replsets/isself_failure_initiate.js b/jstests/replsets/isself_failure_initiate.js
new file mode 100644
index 00000000000..9def81e4dab
--- /dev/null
+++ b/jstests/replsets/isself_failure_initiate.js
@@ -0,0 +1,29 @@
+/**
+ * Tests that replSetInitiate eventually succeeds despite a temporary DNS outage.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * ]
+ */
+
+(function() {
+ 'use strict';
+
+ load("jstests/replsets/rslib.js");
+
+ const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0, votes: 0}}]});
+ const nodes = rst.startSet();
+ // Node 1 will fail to find itself in the config.
+ assert.commandWorked(
+ nodes[1].adminCommand({configureFailPoint: 'failIsSelfCheck', mode: 'alwaysOn'}));
+ const config = rst.getReplSetConfig();
+ assert.commandWorked(nodes[0].adminCommand({replSetInitiate: config}));
+ checkLog.contains(nodes[1], "failIsSelfCheck failpoint activated, returning false from isSelf");
+ assert.commandFailedWithCode(nodes[1].adminCommand({replSetGetStatus: 1}),
+ ErrorCodes.NotYetInitialized);
+ assert.commandWorked(
+ nodes[1].adminCommand({configureFailPoint: 'failIsSelfCheck', mode: 'off'}));
+ // Node 1 re-checks isSelf on next heartbeat and succeeds.
+ waitForState(nodes[1], ReplSetTest.State.SECONDARY);
+ rst.stopSet();
+})();
diff --git a/jstests/replsets/isself_failure_reconfig.js b/jstests/replsets/isself_failure_reconfig.js
new file mode 100644
index 00000000000..f94c81b0014
--- /dev/null
+++ b/jstests/replsets/isself_failure_reconfig.js
@@ -0,0 +1,42 @@
+/**
+ * Tests that a node retries the isSelf check on each heartbeat if it fails due to a temporary
+ * DNS outage during a reconfig.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * ]
+ */
+
+(function() {
+ "use strict";
+
+ load("jstests/replsets/rslib.js");
+
+ const rst = new ReplSetTest(
+ {nodes: 1, nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}});
+ const nodes = rst.startSet();
+ rst.initiate();
+
+ jsTestLog("Add second node to set, with isSelf() disabled by failpoint");
+
+ const newNode = rst.add({
+ rsConfig: {priority: 0, votes: 0},
+ setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})
+ });
+
+ rst.reInitiate();
+
+ jsTestLog("Await failpoint on second node");
+
+ checkLog.contains(newNode, "failIsSelfCheck failpoint activated, returning false from isSelf");
+
+    assert.commandFailedWithCode(newNode.adminCommand({replSetGetStatus: 1}),
+ ErrorCodes.NotYetInitialized);
+
+ assert.commandWorked(
+ newNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
+
+ jsTestLog("New node re-checks isSelf and becomes secondary");
+ waitForState(newNode, ReplSetTest.State.SECONDARY);
+ rst.stopSet();
+})();
diff --git a/jstests/replsets/isself_failure_restart.js b/jstests/replsets/isself_failure_restart.js
new file mode 100644
index 00000000000..e575e324ba0
--- /dev/null
+++ b/jstests/replsets/isself_failure_restart.js
@@ -0,0 +1,44 @@
+/**
+ * Tests that a node retries the isSelf check on each heartbeat if it fails due to a temporary
+ * DNS outage during startup.
+ *
+ * @tags: [
+ * requires_fcv_47,
+ * requires_persistence
+ * ]
+ */
+
+(function() {
+ "use strict";
+
+ load("jstests/replsets/rslib.js");
+
+ const rst = new ReplSetTest({
+ nodes: [{}, {rsConfig: {priority: 0, votes: 0}}],
+ nodeOptions: {setParameter: {logComponentVerbosity: tojson({replication: 3})}}
+ });
+ rst.startSet();
+ rst.initiate();
+
+ const restartNode =
+ rst.restart(1, {setParameter: "failpoint.failIsSelfCheck=" + tojson({mode: "alwaysOn"})});
+
+ checkLog.contains(
+ restartNode,
+ "Locally stored replica set configuration does not have a valid entry for the current node; waiting for reconfig or remote heartbeat");
+ waitForState(restartNode, ReplSetTest.State.REMOVED);
+
+ // Clear the log.
+ const primary = rst.getPrimary();
+ assert.commandWorked(primary.adminCommand({clearLog: 'global'}));
+
+    // The restarted node should respond to heartbeats with 'InvalidReplicaSetConfig'.
+ checkLog.contains(primary, /Received response to heartbeat(.*)InvalidReplicaSetConfig/);
+
+ assert.commandWorked(
+ restartNode.adminCommand({configureFailPoint: "failIsSelfCheck", mode: "off"}));
+
+ // Node 1 re-checks isSelf on next heartbeat and succeeds.
+ waitForState(restartNode, ReplSetTest.State.SECONDARY);
+ rst.stopSet();
+})();
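With the SERVER-51163 half of this patch, the primary marks a member that responds to heartbeats with InvalidReplicaSetConfig as down instead of healthy. A hedged sketch of verifying that from the restart test above, assuming standard replSetGetStatus fields (the restarted node has _id 1; health 0 means down):

    assert.soon(() => {
        const status = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
        const member = status.members.find(m => m._id === 1);
        return member !== undefined && member.health === 0;
    });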
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp
index d6edbc7809e..43f79e970a4 100644
--- a/src/mongo/db/repl/heartbeat_response_action.cpp
+++ b/src/mongo/db/repl/heartbeat_response_action.cpp
@@ -63,6 +63,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeElectAction() {
return result;
}
+HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() {
+ HeartbeatResponseAction result;
+ result._action = RetryReconfig;
+ return result;
+}
+
HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) {
HeartbeatResponseAction result;
result._action = StepDownSelf;
diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h
index ee659deef64..c451a2c33e9 100644
--- a/src/mongo/db/repl/heartbeat_response_action.h
+++ b/src/mongo/db/repl/heartbeat_response_action.h
@@ -53,7 +53,8 @@ public:
StepDownSelf,
StepDownRemotePrimary,
PriorityTakeover,
- CatchupTakeover
+ CatchupTakeover,
+ RetryReconfig
};
/**
@@ -84,6 +85,12 @@ public:
static HeartbeatResponseAction makeCatchupTakeoverAction();
/**
+ * Makes a new action telling the current node to attempt to find itself in its current replica
+ * set config again, in case the previous attempt's failure was due to a temporary DNS outage.
+ */
+ static HeartbeatResponseAction makeRetryReconfigAction();
+
+ /**
* Makes a new action telling the current node to step down as primary.
*
* It is an error to call this with primaryIndex != the index of the current node.
diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp
index 5f9ea8a7715..df2939dba63 100644
--- a/src/mongo/db/repl/isself.cpp
+++ b/src/mongo/db/repl/isself.cpp
@@ -47,6 +47,7 @@
#include "mongo/db/auth/privilege.h"
#include "mongo/db/commands.h"
#include "mongo/db/service_context.h"
+#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
#include "mongo/util/net/socket_utils.h"
#include "mongo/util/scopeguard.h"
@@ -80,6 +81,8 @@
namespace mongo {
namespace repl {
+MONGO_FAIL_POINT_DEFINE(failIsSelfCheck);
+
OID instanceId;
MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) {
@@ -159,6 +162,12 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost,
} // namespace
bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) {
+ if (MONGO_FAIL_POINT(failIsSelfCheck)) {
+ log() << "failIsSelfCheck failpoint activated, returning false from isSelf; hostAndPort: "
+ << hostAndPort;
+ return false;
+ }
+
// Fastpath: check if the host&port in question is bound to one
// of the interfaces on this machine.
// No need for ip match if the ports do not match
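Beyond the alwaysOn/off toggling used in the tests, the generic failpoint 'times' mode can model a transient outage more directly; a sketch (not part of this patch) run from a shell connected to the affected node:

    // Let only the next few isSelf() calls fail, then behave normally again.
    db.adminCommand({configureFailPoint: 'failIsSelfCheck', mode: {times: 3}});
    // Heartbeats arriving in that window see isSelf() return false; once the
    // count is exhausted, the retry added by this patch finds the node in its
    // config without operator intervention.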
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 10480a11cc3..de13a4eaa2d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -275,6 +275,9 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo
case HeartbeatResponseAction::StartElection:
_startElectSelf_inlock();
break;
+ case HeartbeatResponseAction::RetryReconfig:
+ _scheduleHeartbeatReconfig_inlock(_rsConfig);
+ break;
case HeartbeatResponseAction::StepDownSelf:
invariant(action.getPrimaryConfigIndex() == _selfIndex);
if (_topCoord->prepareForUnconditionalStepDown()) {
@@ -458,7 +461,7 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet
}
_setConfigState_inlock(kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+ _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0);
if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to version "
<< newConfig.getConfigVersion()
@@ -598,7 +601,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
invariant(_rsConfigState == kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+ _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0);
// Do not conduct an election during a reconfig, as the node may not be electable post-reconfig.
if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
index fc87738121c..269f7dbac5a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
@@ -425,11 +425,6 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM
ASSERT_EQ(host2, HostAndPort(member["name"].String()));
ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(),
MemberState(member["state"].numberInt()).toString());
- ASSERT_EQ(member["lastHeartbeatMessage"].String(),
- std::string(str::stream() << "replica set IDs do not match, ours: "
- << rsConfig.getReplicaSetId()
- << "; remote node's: "
- << unexpectedId));
}
} // namespace
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index ede25322ced..5a9cd0c332c 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -1123,11 +1123,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
- // If we're not in the config, we don't need to respond to heartbeats.
+    // This server is not in the config, either because it was removed or because a DNS error prevented it from finding itself.
if (_selfIndex == -1) {
- LOG(1) << "Could not find ourself in current config so ignoring heartbeat from " << target
- << " -- current config: " << _rsConfig.toBSON();
- HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction();
+ LOG(1) << "Could not find self in current config, retrying DNS resolution of members "
+ << target << " -- current config: " << _rsConfig.toBSON();
+ HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeRetryReconfigAction();
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index 84c94741ef2..b971aec30d5 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1019,7 +1019,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) {
ASSERT_NO_ACTION(action.getAction());
}
-TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
+TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) {
updateConfig(BSON("_id"
<< "rs0"
<< "version"
@@ -1032,12 +1032,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
<< BSON("_id" << 30 << "host"
<< "h3"))),
-1);
- ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"),
- "rs0",
- MemberState::RS_SECONDARY,
- OpTime(Timestamp(1, 0), 0),
- Milliseconds(300))
- .getAction());
+ ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig,
+ heartbeatFromMember(HostAndPort("h2"),
+ "rs0",
+ MemberState::RS_SECONDARY,
+ OpTime(Timestamp(1, 0), 0),
+ Milliseconds(300))
+ .getAction());
}
TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) {