diff options
author | A. Jesse Jiryu Davis <jesse@mongodb.com> | 2020-09-21 08:48:50 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-02-04 19:33:48 +0000 |
commit | d1871f0fb5f61159aecc76865a4b8a3f5d0159fd (patch) | |
tree | 3f184e6d5d163792a06b8baa99e03d8a64705f10 /src/mongo | |
parent | 0431757da259bc5586d69477775319cb49346a0f (diff) | |
download | mongo-d1871f0fb5f61159aecc76865a4b8a3f5d0159fd.tar.gz |
SERVER-35649 Retry DNS failures during reconfig
(cherry picked from commit 418c61279986a5eaddc66a16b5e288556ad1f6d3)
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/heartbeat_response_action.h | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/isself.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 7 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp | 15 |
7 files changed, 49 insertions, 18 deletions
diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp index 44feca0a32e..c21dbd0cf53 100644 --- a/src/mongo/db/repl/heartbeat_response_action.cpp +++ b/src/mongo/db/repl/heartbeat_response_action.cpp @@ -56,6 +56,12 @@ HeartbeatResponseAction HeartbeatResponseAction::makeCatchupTakeoverAction() { return result; } +HeartbeatResponseAction HeartbeatResponseAction::makeRetryReconfigAction() { + HeartbeatResponseAction result; + result._action = RetryReconfig; + return result; +} + HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) { HeartbeatResponseAction result; result._action = StepDownSelf; diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h index aa52c21eee9..9f0b3e4da8c 100644 --- a/src/mongo/db/repl/heartbeat_response_action.h +++ b/src/mongo/db/repl/heartbeat_response_action.h @@ -45,7 +45,14 @@ public: /** * Actions taken based on heartbeat responses */ - enum Action { NoAction, Reconfig, StepDownSelf, PriorityTakeover, CatchupTakeover }; + enum Action { + NoAction, + Reconfig, + StepDownSelf, + PriorityTakeover, + CatchupTakeover, + RetryReconfig + }; /** * Makes a new action representing doing nothing. @@ -70,6 +77,12 @@ public: static HeartbeatResponseAction makeCatchupTakeoverAction(); /** + * Makes a new action telling the current node to attempt to find itself in its current replica + * set config again, in case the previous attempt's failure was due to a temporary DNS outage. + */ + static HeartbeatResponseAction makeRetryReconfigAction(); + + /** * Makes a new action telling the current node to step down as primary. * * It is an error to call this with primaryIndex != the index of the current node. diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp index 3d62a2b60b4..427892bb6c5 100644 --- a/src/mongo/db/repl/isself.cpp +++ b/src/mongo/db/repl/isself.cpp @@ -77,6 +77,8 @@ namespace mongo { namespace repl { +MONGO_FAIL_POINT_DEFINE(failIsSelfCheck); + OID instanceId; MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) { @@ -156,6 +158,12 @@ std::vector<std::string> getAddrsForHost(const std::string& iporhost, } // namespace bool isSelf(const HostAndPort& hostAndPort, ServiceContext* const ctx) { + if (MONGO_unlikely(failIsSelfCheck.shouldFail())) { + log() << "failIsSelfCheck failpoint activated, returning false from isSelf; hostAndPort: " + << hostAndPort; + return false; + } + // Fastpath: check if the host&port in question is bound to one // of the interfaces on this machine. // No need for ip match if the ports do not match diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index f10b3b4b52c..c993d696a0d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -283,6 +283,9 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct invariant(responseStatus.isOK()); _scheduleHeartbeatReconfig_inlock(responseStatus.getValue().getConfig()); break; + case HeartbeatResponseAction::RetryReconfig: + _scheduleHeartbeatReconfig_inlock(_rsConfig); + break; case HeartbeatResponseAction::StepDownSelf: invariant(action.getPrimaryConfigIndex() == _selfIndex); if (_topCoord->prepareForUnconditionalStepDown()) { @@ -482,7 +485,7 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet } _setConfigState_inlock(kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0); if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) { LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to version " << newConfig.getConfigVersion() @@ -674,7 +677,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( invariant(_rsConfigState == kConfigHBReconfiguring); invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + _rsConfig.getConfigVersion() < newConfig.getConfigVersion() || _selfIndex < 0); if (!myIndex.isOK()) { switch (myIndex.getStatus().code()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp index 3099894bcfb..0ef88f6001c 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp @@ -426,12 +426,8 @@ TEST_F(ReplCoordHBV1Test, IgnoreTheContentsOfMetadataWhenItsReplicaSetIdDoesNotM ASSERT_TRUE(members[1].isABSONObj()); auto member = members[1].Obj(); ASSERT_EQ(host2, HostAndPort(member["name"].String())); - ASSERT_EQ(MemberState(MemberState::RS_DOWN).toString(), + ASSERT_EQ(MemberState(MemberState::RS_UNKNOWN).toString(), MemberState(member["state"].numberInt()).toString()); - ASSERT_EQ(member["lastHeartbeatMessage"].String(), - std::string(str::stream() - << "replica set IDs do not match, ours: " << rsConfig.getReplicaSetId() - << "; remote node's: " << unexpectedId)); } TEST_F(ReplCoordHBV1Test, diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index f8ea5428661..1df593d8330 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -755,7 +755,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( invariant(hbStats.getLastHeartbeatStartDate() != Date_t()); const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) || (hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed); - if (hbResponse.isOK() || isUnauthorized) { + const bool isInvalid = hbResponse.getStatus().code() == ErrorCodes::InvalidReplicaSetConfig; + + // Replication of auth changes can cause temporary auth failures, and a temporary DNS outage can + // make a node return InvalidReplicaSetConfig if it can't find itself in the config. + if (hbResponse.isOK() || isUnauthorized || isInvalid) { hbStats.hit(networkRoundTripTime); } else { hbStats.miss(); @@ -826,11 +830,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } - // If we're not in the config, we don't need to respond to heartbeats. + // This server is not in the config, either because it was removed or a DNS error finding self. if (_selfIndex == -1) { LOG(1) << "Could not find ourself in current config so ignoring heartbeat from " << target << " -- current config: " << _rsConfig.toBSON(); - HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction(); + HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeRetryReconfigAction(); nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index db2a4fbc433..e2ce2fd9314 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -1596,7 +1596,7 @@ TEST_F(TopoCoordTest, NodeChangesToRecoveringWhenOnlyUnauthorizedNodesAreUp) { ASSERT_NO_ACTION(action.getAction()); } -TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { +TEST_F(TopoCoordTest, NodeRetriesReconfigWhenAbsentFromConfig) { updateConfig(BSON("_id" << "rs0" << "version" << 1 << "members" @@ -1607,12 +1607,13 @@ TEST_F(TopoCoordTest, NodeDoesNotActOnHeartbeatsWhenAbsentFromConfig) { << BSON("_id" << 30 << "host" << "h3"))), -1); - ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"), - "rs0", - MemberState::RS_SECONDARY, - OpTime(Timestamp(1, 0), 0), - Milliseconds(300)) - .getAction()); + ASSERT_EQUALS(HeartbeatResponseAction::RetryReconfig, + heartbeatFromMember(HostAndPort("h2"), + "rs0", + MemberState::RS_SECONDARY, + OpTime(Timestamp(1, 0), 0), + Milliseconds(300)) + .getAction()); } TEST_F(TopoCoordTest, NodeReturnsNotSecondaryWhenSyncFromIsRunPriorToHavingAConfig) { |