summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorA. Jesse Jiryu Davis <jesse@mongodb.com>2020-09-21 08:48:50 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-02-04 19:33:48 +0000
commitd1871f0fb5f61159aecc76865a4b8a3f5d0159fd (patch)
tree3f184e6d5d163792a06b8baa99e03d8a64705f10 /src/mongo
parent0431757da259bc5586d69477775319cb49346a0f (diff)
downloadmongo-d1871f0fb5f61159aecc76865a4b8a3f5d0159fd.tar.gz
SERVER-35649 Retry DNS failures during reconfig
(cherry picked from commit 418c61279986a5eaddc66a16b5e288556ad1f6d3)
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/repl/heartbeat_response_action.cpp6
-rw-r--r--src/mongo/db/repl/heartbeat_response_action.h15
-rw-r--r--src/mongo/db/repl/isself.cpp8
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp7
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp6
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp10
-rw-r--r--src/mongo/db/repl/topology_coordinator_v1_test.cpp15
7 files changed, 49 insertions, 18 deletions
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp
index 44feca0a32e..c21dbd0cf53 100644
--- a/src/mongo/db/repl/heartbeat_response_action.cpp
+++ b/src/mongo/db/repl/heartbeat_response_action.cpp
@@ -56,6 +56,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeCatchupTakeoverAction() {
return result;
}
+HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() {
+ HeartbeatResponseAction result;
+ result._action = RetryReconfig;
+ return result;
+}
+
HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) {
HeartbeatResponseAction result;
result._action = StepDownSelf;
diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h
index aa52c21eee9..9f0b3e4da8c 100644
--- a/src/mongo/db/repl/heartbeat_response_action.h
+++ b/src/mongo/db/repl/heartbeat_response_action.h
@@ -45,7 +45,14 @@ public:
/**
* Actions taken based on heartbeat responses
*/
- enum Action { NoAction, Reconfig, StepDownSelf, PriorityTakeover, CatchupTakeover };
+ enum Action {
+ NoAction,
+ Reconfig,
+ StepDownSelf,
+ PriorityTakeover,
+ CatchupTakeover,
+ RetryReconfig
+ };
/**
* Makes a new action representing doing nothing.
@@ -70,6 +77,12 @@ public:
static HeartbeatResponseAction makeCatchupTakeoverAction();
/**
+ * Makes a new action telling the current node to attempt to find itself in its current replica
+ * set config again, in case the previous attempt's failure was due to a temporary DNS outage.
+ */
+ static HeartbeatResponseAction makeRetryReconfigAction();
+
+ /**
* Makes a new action telling the current node to step down as primary.
*
* It is an error to call this with primaryIndex != the index of the current node.
diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp
index 3d62a2b60b4..427892bb6c5 100644
--- a/src/mongo/db/repl/isself.cpp
+++ b/src/mongo/db/repl/isself.cpp
@@ -77,6 +77,8 @@
namespace mongo {
namespace repl {
+MONGO_FAIL_POINT_DEFINE(failIsSelfCheck);
+
OID instanceId;
MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) {
@@ -156,6 +158,12 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost,
} // namespace
bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) {
+ if (MONGO_unlikely(failIsSelfCheck.shouldFail())) {
+ log() << "failIsSelfCheck failpoint activated, returning false from isSelf; hostAndPort: "
+ << hostAndPort;
+ return false;
+ }
+
// Fastpath: check if the host&port in question is bound to one
// of the interfaces on this machine.
// No need for ip match if the ports do not match
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index f10b3b4b52c..c993d696a0d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -283,6 +283,9 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct
invariant(responseStatus.isOK());
_scheduleHeartbeatReconfig_inlock(responseStatus.getValue().getConfig());
break;
+ case HeartbeatResponseAction::RetryReconfig:
+ _scheduleHeartbeatReconfig_inlock(_rsConfig);
+ break;
case HeartbeatResponseAction::StepDownSelf:
invariant(action.getPrimaryConfigIndex() == _selfIndex);
if (_topCoord->prepareForUnconditionalStepDown()) {
@@ -482,7 +485,7 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet
}
_setConfigState_inlock(kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+ _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0);
if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to version "
<< newConfig.getConfigVersion()
@@ -674,7 +677,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
invariant(_rsConfigState == kConfigHBReconfiguring);
invariant(!_rsConfig.isInitialized() ||
- _rsConfig.getConfigVersion() < newConfig.getConfigVersion());
+ _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0);
if (!myIndex.isOK()) {
switch (myIndex.getStatus().code()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
index 3099894bcfb..0ef88f6001c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
@@ -426,12 +426,8 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM
ASSERT_TRUE(members[1].isABSONObj());
auto member = members[1].Obj();
ASSERT_EQ(host2, HostAndPort(member["name"].String()));
- ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(),
+ ASSERT_EQ(MemberState(MemberState::RS_UNKNOWN).toString(),
MemberState(member["state"].numberInt()).toString());
- ASSERT_EQ(member["lastHeartbeatMessage"].String(),
- std::string(str::stream()
- << "replica set IDs do not match, ours: " << rsConfig.getReplicaSetId()
- << "; remote node's: " << unexpectedId));
}
TEST_F(ReplCoordHBV1Test,
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index f8ea5428661..1df593d8330 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -755,7 +755,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
invariant(hbStats.getLastHeartbeatStartDate() != Date_t());
const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) ||
(hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed);
- if (hbResponse.isOK() || isUnauthorized) {
+ const bool isInvalid = hbResponse.getStatus().code() == ErrorCodes::InvalidReplicaSetConfig;
+
+ // Replication of auth changes can cause temporary auth failures, and a temporary DNS outage can
+ // make a node return InvalidReplicaSetConfig if it can't find itself in the config.
+ if (hbResponse.isOK() || isUnauthorized || isInvalid) {
hbStats.hit(networkRoundTripTime);
} else {
hbStats.miss();
@@ -826,11 +830,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
- // If we're not in the config, we don't need to respond to heartbeats.
+ // This server is not in the config, either because it was removed or a DNS error finding self.
if (_selfIndex == -1) {
LOG(1) << "Could not find ourself in current config so ignoring heartbeat from " << target
<< " -- current config: " << _rsConfig.toBSON();
- HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction();
+ HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeRetryReconfigAction();
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
return nextAction;
}
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index db2a4fbc433..e2ce2fd9314 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1596,7 +1596,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) {
ASSERT_NO_ACTION(action.getAction());
}
-TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
+TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) {
updateConfig(BSON("_id"
<< "rs0"
<< "version" << 1 << "members"
@@ -1607,12 +1607,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) {
<< BSON("_id" << 30 << "host"
<< "h3"))),
-1);
- ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"),
- "rs0",
- MemberState::RS_SECONDARY,
- OpTime(Timestamp(1, 0), 0),
- Milliseconds(300))
- .getAction());
+ ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig,
+ heartbeatFromMember(HostAndPort("h2"),
+ "rs0",
+ MemberState::RS_SECONDARY,
+ OpTime(Timestamp(1, 0), 0),
+ Milliseconds(300))
+ .getAction());
}
TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) {