summaryrefslogtreecommitdiff
path: root/src/mongo/db/process_health
diff options
context:
space:
mode:
author: Kshitij Gupta <kshitij.gupta@mongodb.com> 2022-01-17 19:35:13 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-02-03 06:34:22 +0000
commit: 1d8aaed7f9286ce90317d03fc1815bb58b43e31d (patch)
tree: c0ef354434941aa446bad53f9b9f10741c1c2fe3 /src/mongo/db/process_health
parent: 3b73633480220630507435a4edc3fb141739e93c (diff)
download: mongo-1d8aaed7f9286ce90317d03fc1815bb58b43e31d.tar.gz
SERVER-59384: Should provide ability to perform periodic DNS health checks
Diffstat (limited to 'src/mongo/db/process_health')
-rw-r--r-- src/mongo/db/process_health/SConscript 2
-rw-r--r-- src/mongo/db/process_health/dns_health_observer.cpp 123
-rw-r--r-- src/mongo/db/process_health/dns_health_observer.h 65
-rw-r--r-- src/mongo/db/process_health/fault_manager_config.cpp 3
-rw-r--r-- src/mongo/db/process_health/fault_manager_config.h 5
-rw-r--r-- src/mongo/db/process_health/fault_manager_test_suite.h 3
-rw-r--r-- src/mongo/db/process_health/fault_state_machine_test.cpp 47
7 files changed, 248 insertions, 0 deletions
diff --git a/src/mongo/db/process_health/SConscript b/src/mongo/db/process_health/SConscript
index 9cd8847e168..4fc5b9e855b 100644
--- a/src/mongo/db/process_health/SConscript
+++ b/src/mongo/db/process_health/SConscript
@@ -7,6 +7,7 @@ env = env.Clone()
env.Library(
target='fault_manager',
source=[
+ 'dns_health_observer.cpp',
'fault.cpp',
'fault_facet_impl.cpp',
'fault_manager.cpp',
@@ -27,6 +28,7 @@ env.Library(
'$BUILD_DIR/mongo/executor/network_interface_factory',
'$BUILD_DIR/mongo/executor/network_interface_thread_pool',
'$BUILD_DIR/mongo/executor/thread_pool_task_executor',
+ '$BUILD_DIR/mongo/s/grid',
],
LIBDEPS_PRIVATE=[
'$BUILD_DIR/mongo/db/process_health_feature_flag',
diff --git a/src/mongo/db/process_health/dns_health_observer.cpp b/src/mongo/db/process_health/dns_health_observer.cpp
new file mode 100644
index 00000000000..ef414513aff
--- /dev/null
+++ b/src/mongo/db/process_health/dns_health_observer.cpp
@@ -0,0 +1,123 @@
+/**
+ * Copyright (C) 2021-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kProcessHealth
+
+#include "mongo/db/process_health/dns_health_observer.h"
+
+#include "mongo/db/process_health/health_observer_registration.h"
+#include "mongo/logv2/log.h"
+#include "mongo/s/grid.h"
+#include "mongo/util/dns_name.h"
+#include "mongo/util/net/hostandport.h"
+#include "mongo/util/net/hostname_canonicalization.h"
+#include <algorithm>
+#include <random>
+
+namespace mongo {
+namespace process_health {
+
+MONGO_FAIL_POINT_DEFINE(dnsHealthObserverFp);
+
+/**
+ * Runs one DNS health check.
+ *
+ * The host to resolve is chosen as follows:
+ *  - if the `dnsHealthObserverFp` fail point is active and carries non-empty data, the
+ *    "hostname" field of that data is used;
+ *  - otherwise the connection string of a randomly chosen shard is used, or the config server
+ *    connection string when the shard registry reports no shard ids.
+ *
+ * The servers of the chosen connection string are shuffled and the first one is resolved with a
+ * forward lookup on `periodicCheckContext.taskExecutor`.
+ *
+ * Returns a future holding a healthy status when the lookup yields at least one name, and a
+ * Severity::kFailure status when the shard lookup fails, no host is available, the lookup fails
+ * or returns no names, or the lookup could not be scheduled.
+ */
+Future<HealthCheckStatus> DnsHealthObserver::periodicCheckImpl(
+    PeriodicHealthCheckContext&& periodicCheckContext) noexcept {
+    LOGV2_DEBUG(5938401, 2, "DNS health observer executing");
+
+    auto makeFailedHealthCheckFuture = [this](const Status& status) {
+        return Future<HealthCheckStatus>::makeReady(
+            makeSimpleFailedStatus(Severity::kFailure, {status}));
+    };
+
+    ConnectionString connString;
+    auto isFailPointActive = false;
+    if (MONGO_unlikely(dnsHealthObserverFp.shouldFail())) {
+        isFailPointActive = true;
+        // With empty fail point data connString stays default-constructed and the check below
+        // reports "No hostnames for DNS health check".
+        dnsHealthObserverFp.executeIf(
+            [&connString](const BSONObj& data) {
+                auto fpHostname = data["hostname"].String();
+                connString = ConnectionString::forReplicaSet("serverWithBadHostName",
+                                                             {HostAndPort(fpHostname, 27017)});
+            },
+            [&](const BSONObj& data) { return !data.isEmpty(); });
+    }
+
+    if (!isFailPointActive) {
+        auto client = _svcCtx->makeClient("DNSHealthObserver");
+        auto opCtx = client->makeOperationContext();
+        auto const shardRegistry = Grid::get(_svcCtx)->shardRegistry();
+        auto shardIds = shardRegistry->getAllShardIds(opCtx.get());
+
+        if (shardIds.empty()) {
+            connString = shardRegistry->getConfigServerConnectionString();
+        } else {
+            // Pick the shard with the observer's own generator (also used for the shuffle
+            // below) instead of the process-global rand().
+            const auto shardIndex = static_cast<std::size_t>(
+                _random.nextInt64(static_cast<int64_t>(shardIds.size())));
+            auto shardSW = shardRegistry->getShard(opCtx.get(), shardIds.at(shardIndex));
+            auto shardSWStatus = shardSW.getStatus();
+            if (!shardSWStatus.isOK()) {
+                return makeFailedHealthCheckFuture(shardSWStatus);
+            }
+            connString = shardSW.getValue()->getConnString();
+        }
+    }
+
+    auto servers = connString.getServers();
+    if (servers.empty()) {
+        return makeFailedHealthCheckFuture(
+            Status(ErrorCodes::NetworkTimeout, "No hostnames for DNS health check"));
+    }
+
+    std::shuffle(servers.begin(), servers.end(), _random.urbg());
+
+    // Only the first host of the shuffled list is resolved, so the callback captures just its
+    // name rather than a copy of the whole server list.
+    auto hostname = servers.front().host();
+
+    auto completionPf = makePromiseFuture<HealthCheckStatus>();
+
+    auto scheduleStatus = periodicCheckContext.taskExecutor->scheduleWork(
+        [this, hostname = std::move(hostname), promise = std::move(completionPf.promise)](
+            const executor::TaskExecutor::CallbackArgs& cbArgs) mutable {
+            auto swFQDNs = getHostFQDNs(hostname, HostnameCanonicalizationMode::kForward);
+            if (!swFQDNs.isOK()) {
+                promise.emplaceValue(
+                    makeSimpleFailedStatus(Severity::kFailure, {swFQDNs.getStatus()}));
+            } else if (swFQDNs.getValue().empty()) {
+                // The lookup itself succeeded, so its status is OK; report an explicit error
+                // instead of attaching an OK status to a failed health check.
+                promise.emplaceValue(makeSimpleFailedStatus(
+                    Severity::kFailure,
+                    {Status(ErrorCodes::HostNotFound,
+                            "DNS lookup returned no names for host " + hostname)}));
+            } else {
+                promise.emplaceValue(makeHealthyStatus());
+            }
+        });
+
+    // If the work was rejected the callback never runs and the promise is dropped; surface the
+    // scheduling error as a failed check instead of returning the abandoned future.
+    if (!scheduleStatus.isOK()) {
+        return makeFailedHealthCheckFuture(scheduleStatus.getStatus());
+    }
+
+    return std::move(completionPf.future);
+}
+
+namespace {
+// Registers a factory that builds a DnsHealthObserver for a given ServiceContext.
+MONGO_INITIALIZER(DnsHealthObserver)(InitializerContext*) {
+    auto makeDnsObserver = [](ServiceContext* svcCtx) {
+        return std::make_unique<DnsHealthObserver>(svcCtx);
+    };
+    HealthObserverRegistration::registerObserverFactory(std::move(makeDnsObserver));
+}
+}  // namespace
+
+} // namespace process_health
+} // namespace mongo
diff --git a/src/mongo/db/process_health/dns_health_observer.h b/src/mongo/db/process_health/dns_health_observer.h
new file mode 100644
index 00000000000..2640c9024f7
--- /dev/null
+++ b/src/mongo/db/process_health/dns_health_observer.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright (C) 2021-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+#pragma once
+
+#include "mongo/db/process_health/health_observer_base.h"
+#include "mongo/platform/random.h"
+
+namespace mongo {
+namespace process_health {
+/**
+ * Health observer of type FaultFacetType::kDns.
+ *
+ * The periodic check itself is implemented in periodicCheckImpl() (defined out of line); this
+ * class fixes the observer's type, jitter and timeout and owns the random generator used by the
+ * check.
+ */
+class DnsHealthObserver final : public HealthObserverBase {
+public:
+    // `explicit` prevents a ServiceContext* from silently converting into an observer.
+    // The generator is seeded from SecureRandom.
+    explicit DnsHealthObserver(ServiceContext* svcCtx)
+        : HealthObserverBase(svcCtx), _random(PseudoRandom(SecureRandom().nextInt64())) {}
+
+protected:
+    // Identifies this observer as the DNS facet.
+    FaultFacetType getType() const override {
+        return FaultFacetType::kDns;
+    }
+
+    // Jitter for the health check schedule: 5 milliseconds.
+    Milliseconds healthCheckJitter() const override {
+        return Milliseconds(5);
+    }
+
+    // Timeout for a single check: 10 seconds.
+    Milliseconds getObserverTimeout() const override {
+        return Milliseconds(Seconds(10));
+    }
+
+    // This observer always reports itself as configured.
+    bool isConfigured() const override {
+        return true;
+    }
+
+    Future<HealthCheckStatus> periodicCheckImpl(
+        PeriodicHealthCheckContext&& periodicCheckContext) noexcept override;
+
+private:
+    // Random source for the check; mutable so it can be advanced from const contexts.
+    mutable PseudoRandom _random;
+};
+} // namespace process_health
+} // namespace mongo
diff --git a/src/mongo/db/process_health/fault_manager_config.cpp b/src/mongo/db/process_health/fault_manager_config.cpp
index 4b3b3695459..b5e3cc57d46 100644
--- a/src/mongo/db/process_health/fault_manager_config.cpp
+++ b/src/mongo/db/process_health/fault_manager_config.cpp
@@ -40,6 +40,7 @@ namespace {
constexpr auto inline kDefaultObserverInterval = Milliseconds{10000};
constexpr auto inline kDefaultLdapObserverInterval = Milliseconds{30000};
constexpr auto inline kDefaultConfigServerObserverInterval = Milliseconds{30000};
+constexpr auto inline kDefaultDNSObserverInterval = Milliseconds{30000};
constexpr auto inline kDefaultTestObserverInterval = Milliseconds{1000};
} // namespace
@@ -49,6 +50,8 @@ Milliseconds FaultManagerConfig::_getDefaultObserverInterval(FaultFacetType type
return kDefaultLdapObserverInterval;
case FaultFacetType::kConfigServer:
return kDefaultConfigServerObserverInterval;
+ case FaultFacetType::kDns:
+ return kDefaultDNSObserverInterval;
case FaultFacetType::kMock1:
case FaultFacetType::kMock2:
case FaultFacetType::kTestObserver:
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index ac4f4ba0a75..83d87b1d67d 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -110,6 +110,11 @@ public:
auto getIntensity = [this, intensities](FaultFacetType type) {
auto observerType = toObserverType(type);
if (observerType) {
+ stdx::lock_guard lock(_mutex);
+ if (_facetToIntensityMapForTest.contains(type)) {
+ return _facetToIntensityMapForTest.at(type);
+ }
+
auto x = intensities->_data->getValues();
if (x) {
for (auto setting : *x) {
diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h
index 38235658923..b9a5ab8aeea 100644
--- a/src/mongo/db/process_health/fault_manager_test_suite.h
+++ b/src/mongo/db/process_health/fault_manager_test_suite.h
@@ -32,6 +32,7 @@
#include "mongo/db/process_health/fault_manager.h"
+#include "mongo/db/concurrency/locker_noop_client_observer.h"
#include "mongo/db/process_health/health_observer_mock.h"
#include "mongo/db/process_health/health_observer_registration.h"
#include "mongo/executor/network_interface_factory.h"
@@ -136,6 +137,8 @@ public:
_svcCtx->setFastClockSource(std::make_unique<ClockSourceMock>());
_svcCtx->setPreciseClockSource(std::make_unique<ClockSourceMock>());
_svcCtx->setTickSource(std::make_unique<TickSourceMock<Milliseconds>>());
+ _svcCtx->registerClientObserver(
+ std::make_unique<LockerNoopClientObserverWithReplacementPolicy>());
advanceTime(Seconds(100));
}
}
diff --git a/src/mongo/db/process_health/fault_state_machine_test.cpp b/src/mongo/db/process_health/fault_state_machine_test.cpp
index 111d62cb5d6..0a08b20cb8d 100644
--- a/src/mongo/db/process_health/fault_state_machine_test.cpp
+++ b/src/mongo/db/process_health/fault_state_machine_test.cpp
@@ -29,6 +29,7 @@
#include "mongo/db/process_health/fault_manager.h"
+#include "mongo/db/process_health/dns_health_observer.h"
#include "mongo/db/process_health/fault_manager_test_suite.h"
#include "mongo/db/process_health/health_check_status.h"
#include "mongo/executor/thread_pool_task_executor_test_fixture.h"
@@ -359,6 +360,52 @@ TEST_F(FaultManagerTest, HealthCheckWithOffFacetCreatesNoFaultInOk) {
ASSERT_EQ(manager().getFaultState(), FaultState::kOk);
}
+// Drives the DNS observer through the dnsHealthObserverFp fail point: a resolvable host keeps
+// the manager in kOk, an unresolvable host moves it to kTransientFault, and switching back to a
+// resolvable host returns it to kOk.
+TEST_F(FaultManagerTest, DNSHealthCheckWithBadHostNameFailsAndGoodHostNameSuccess) {
+    RAIIServerParameterControllerForTest featureFlagGuard{"featureFlagHealthMonitoring", true};
+
+    // Points the observer at `hostname` by (re)arming the fail point with that host.
+    const auto resolveViaFailPoint = [](const char* hostname) {
+        globalFailPointRegistry()
+            .find("dnsHealthObserverFp")
+            ->setMode(FailPoint::alwaysOn, 0, BSON("hostname" << hostname));
+    };
+
+    auto dnsConfig = std::make_unique<FaultManagerConfig>();
+    dnsConfig->setIntensityForType(FaultFacetType::kDns, HealthObserverIntensityEnum::kCritical);
+    resetManager(std::move(dnsConfig));
+
+    // Run the DNS check every 1000 ms.
+    auto* intervalsParam =
+        ServerParameterSet::getNodeParameterSet()->get<PeriodicHealthCheckIntervalsServerParameter>(
+            "healthMonitoringIntervals");
+    const BSONObj dnsIntervals = BSON("values" << BSON_ARRAY(BSON("type"
+                                                                  << "dns"
+                                                                  << "interval" << 1000)));
+    const BSONObj wrappedIntervals = BSON("key" << dnsIntervals);
+    uassertStatusOK(intervalsParam->set(wrappedIntervals.getField("key")));
+
+    registerHealthObserver<DnsHealthObserver>();
+    resolveViaFailPoint("yahoo.com");
+
+    // The returned future is kept alive until the end of the test.
+    auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
+    assertSoon([this]() { return manager().getFaultState() == FaultState::kOk; });
+
+    resolveViaFailPoint("badhostname.invalid");
+    sleepFor(Seconds(1));
+    assertSoon([this]() { return manager().getFaultState() == FaultState::kTransientFault; });
+
+    resolveViaFailPoint("yahoo.com");
+    assertSoon([this]() { return manager().getFaultState() == FaultState::kOk; });
+}
+
} // namespace
} // namespace process_health
} // namespace mongo