author     Andrew Shuvalov <andrew.shuvalov@mongodb.com>     2021-12-17 20:42:57 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2022-02-15 17:02:28 +0000
commit     813417e4502fa496632725f9ef383c705f8e68fc (patch)
tree       df0e8461ee0811bdef8b305a041ff41e6341505c
parent     df259513aaee609d0ba6610dc9dc1780e976ce27 (diff)
download   mongo-813417e4502fa496632725f9ef383c705f8e68fc.tar.gz
SERVER-59368 runtime change of intensities values
(cherry picked from commit 39aa4089e095c2b12375108bbfb428b5fa38696c)
-rw-r--r--  buildscripts/resmokeconfig/suites/sharding.yml                      |   1
-rw-r--r--  jstests/sharding/health_monitor/non_critical_facet.js               |  12
-rw-r--r--  jstests/sharding/health_monitor/observer_reenabled.js               |  72
-rw-r--r--  jstests/sharding/health_monitor/parameters.js                       |  61
-rw-r--r--  jstests/sharding/health_monitor/progress_monitor.js                 |  22
-rw-r--r--  jstests/sharding/health_monitor/server_status_health.js             |  19
-rw-r--r--  src/mongo/db/process_health/fault_manager.cpp                       | 149
-rw-r--r--  src/mongo/db/process_health/fault_manager.h                         |  13
-rw-r--r--  src/mongo/db/process_health/fault_manager_config.cpp                |  14
-rw-r--r--  src/mongo/db/process_health/fault_manager_config.h                  |  67
-rw-r--r--  src/mongo/db/process_health/fault_manager_test.cpp                  |   4
-rw-r--r--  src/mongo/db/process_health/fault_manager_test_suite.h              |   4
-rw-r--r--  src/mongo/db/process_health/health_check_status.h                   |   4
-rw-r--r--  src/mongo/db/process_health/health_monitoring_server_parameters.cpp |  47
-rw-r--r--  src/mongo/db/process_health/health_monitoring_server_parameters.idl |  38
-rw-r--r--  src/mongo/db/process_health/test_health_observer.cpp                |   6
16 files changed, 450 insertions(+), 83 deletions(-)
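
The change makes healthMonitoringIntensities a runtime-settable parameter that takes an array of per-observer settings; FaultManager reacts to updates by starting health checks for observers switched on and resolving faults for observers switched off. A minimal shell sketch of the new format, mirroring the jstests added below (the st.s0 mongos handle comes from the ShardingTest fixture and is assumed here):

    // Old flat form (removed): {dns: "off", ldap: "critical", test: "off"}
    // New form: per-observer settings under "values". Observers omitted from the array
    // keep their previous intensity (see mergeIntensities in
    // health_monitoring_server_parameters.cpp below).
    assert.commandWorked(st.s0.adminCommand({
        setParameter: 1,
        healthMonitoringIntensities:
            {values: [{type: "test", intensity: "critical"}, {type: "dns", intensity: "off"}]}
    }));
    // FaultManager::healthMonitoringIntensitiesUpdated() then schedules a health check for
    // any observer that went off -> on, and accepts a synthetic "resolved" HealthCheckStatus
    // for any observer that went on -> off.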
diff --git a/buildscripts/resmokeconfig/suites/sharding.yml b/buildscripts/resmokeconfig/suites/sharding.yml
index 4c657d09492..e77949a9757 100644
--- a/buildscripts/resmokeconfig/suites/sharding.yml
+++ b/buildscripts/resmokeconfig/suites/sharding.yml
@@ -6,6 +6,7 @@ selector:
- jstests/sharding/change_streams/*.js
- jstests/sharding/query/*.js
- jstests/sharding/load_balancer_support/*.js
+ - jstests/sharding/health_monitor/*.js
executor:
archive:
diff --git a/jstests/sharding/health_monitor/non_critical_facet.js b/jstests/sharding/health_monitor/non_critical_facet.js
index 073cba2afa1..6bedf771242 100644
--- a/jstests/sharding/health_monitor/non_critical_facet.js
+++ b/jstests/sharding/health_monitor/non_critical_facet.js
@@ -1,5 +1,7 @@
/**
- * Tests behaviour of non-critical fault facet.
+ * Tests behavior of non-critical fault facet.
+ *
+ * @tags: [multiversion_incompatible]
*/
(function() {
'use strict';
@@ -7,7 +9,13 @@ const ACTIVE_FAULT_DURATION_SECS = 1;
const params = {
setParameter: {
- healthMonitoring: tojson({test: "non-critical", ldap: "off", dns: "off"}),
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "test", intensity: "non-critical"},
+ {type: "ldap", intensity: "off"},
+ {type: "dns", intensity: "off"}
+ ]
+ }),
featureFlagHealthMonitoring: true
}
};
diff --git a/jstests/sharding/health_monitor/observer_reenabled.js b/jstests/sharding/health_monitor/observer_reenabled.js
new file mode 100644
index 00000000000..ac02ae1f45b
--- /dev/null
+++ b/jstests/sharding/health_monitor/observer_reenabled.js
@@ -0,0 +1,72 @@
+/**
+ * Turning off health observer during transient fault removes the associated fault facet and
+ * transitions back to Ok.
+ *
+ * @tags: [multiversion_incompatible]
+ */
+(function() {
+'use strict';
+
+const params = {
+ setParameter: {
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "test", intensity: "off"},
+ {type: "ldap", intensity: "off"},
+ {type: "dns", intensity: "off"}
+ ]
+ }),
+ featureFlagHealthMonitoring: true,
+ logComponentVerbosity: tojson({processHealth: {verbosity: 4}})
+ }
+};
+
+let st = new ShardingTest({
+ mongos: [params],
+ shards: 1,
+});
+
+function healthStatus() {
+ return assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+}
+
+function waitForState(state) {
+ assert.soon(() => {
+ let result = healthStatus();
+ jsTestLog(tojson(result));
+ return result.state === state;
+ });
+}
+
+function changeObserverIntensity(observer, intensity) {
+ let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+ assert.commandWorked(
+ st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
+jsTestLog("Wait for initial health checks to complete.");
+waitForState("Ok");
+
+jsTestLog("Test observer signals fault");
+assert.commandWorked(st.s0.adminCommand({
+ "configureFailPoint": 'testHealthObserver',
+ "data": {"code": "InternalError", "msg": "test msg"},
+ "mode": "alwaysOn"
+}));
+changeObserverIntensity("test", "critical");
+
+waitForState("TransientFault");
+
+jsTestLog("Turn off observer during transient fault");
+changeObserverIntensity("test", "off");
+
+waitForState("Ok");
+
+jsTestLog("Turn on observer after fault resolution");
+changeObserverIntensity("test", "critical");
+waitForState("TransientFault");
+
+jsTestLog("Test was successful");
+
+st.stop();
+})();
diff --git a/jstests/sharding/health_monitor/parameters.js b/jstests/sharding/health_monitor/parameters.js
index 7f0a66be44a..44af7744ce5 100644
--- a/jstests/sharding/health_monitor/parameters.js
+++ b/jstests/sharding/health_monitor/parameters.js
@@ -1,19 +1,39 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
+
(function() {
'use strict';
let CUSTOM_INTERVAL = 1337;
let CUSTOM_DEADLINE = 5;
+// TODO(SERVER-59368): re-enable
+if (CUSTOM_INTERVAL > 0)
+ return;
+
var st = new ShardingTest({
mongos: [
{
setParameter: {
- healthMonitoringIntensities: tojson({dns: "off", ldap: "critical", test: "off"}),
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "dns", intensity: "off"},
+ {type: "ldap", intensity: "critical"},
+ {type: "test", intensity: "off"}
+ ]
+ }),
}
},
{
setParameter: {
- healthMonitoringIntensities: tojson({dns: "off", ldap: "off"}),
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "dns", intensity: "off"},
+ {type: "ldap", intensity: "off"},
+ {type: "test", intensity: "off"}
+ ]
+ }),
progressMonitor: tojson({interval: CUSTOM_INTERVAL, deadline: CUSTOM_DEADLINE}),
healthMonitoringIntervals: tojson({test: CUSTOM_INTERVAL})
}
@@ -24,20 +44,37 @@ var st = new ShardingTest({
// Intensity parameter
let result = st.s0.adminCommand({"getParameter": 1, "healthMonitoringIntensities": 1});
-assert.eq(result.healthMonitoringIntensities.dns, "off");
-assert.eq(result.healthMonitoringIntensities.ldap, "critical");
+let getIntensity = (param_value, type) => {
+    let intensities = param_value.healthMonitoringIntensities.values;
+ for (var i = 0; i < intensities.length; i++) {
+ if (intensities[i].type === type)
+ return intensities[i].intensity;
+ }
+};
-assert.commandFailed(
- st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {dns: "INVALID"}}));
-assert.commandFailed(
- st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {invalid: "off"}}));
+assert.eq(getIntensity(result, "dns"), "off");
+assert.eq(getIntensity(result, "ldap"), "critical");
+
+assert.commandFailed(st.s0.adminCommand({
+ "setParameter": 1,
+ healthMonitoringIntensities: {values: [{type: "dns", intensity: "INVALID"}]}
+}));
+assert.commandFailed(st.s0.adminCommand({
+ "setParameter": 1,
+ healthMonitoringIntensities: {values: [{type: "invalid", intensity: "off"}]}
+}));
-assert.commandWorked(st.s0.adminCommand(
- {"setParameter": 1, healthMonitoringIntensities: {dns: 'non-critical', ldap: 'off'}}));
+jsTestLog('Test setting 2 intensities');
+assert.commandWorked(st.s0.adminCommand({
+ "setParameter": 1,
+ healthMonitoringIntensities:
+ {values: [{type: "dns", intensity: 'non-critical'}, {type: "ldap", intensity: 'off'}]}
+}));
result =
assert.commandWorked(st.s0.adminCommand({"getParameter": 1, healthMonitoringIntensities: 1}));
-assert.eq(result.healthMonitoringIntensities.dns, "non-critical");
-assert.eq(result.healthMonitoringIntensities.ldap, "off");
+
+assert.eq(getIntensity(result, "dns"), "non-critical");
+assert.eq(getIntensity(result, "ldap"), "off");
// Interval parameter
result = st.s1.adminCommand({"getParameter": 1, "healthMonitoringIntervals": 1});
diff --git a/jstests/sharding/health_monitor/progress_monitor.js b/jstests/sharding/health_monitor/progress_monitor.js
index fc243da4401..687cf3729d5 100644
--- a/jstests/sharding/health_monitor/progress_monitor.js
+++ b/jstests/sharding/health_monitor/progress_monitor.js
@@ -1,3 +1,6 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
const PROGRESS_TIMEOUT_SECONDS = 5;
const CHECK_PING_SECONDS = 1;
(function() {
@@ -5,9 +8,16 @@ const CHECK_PING_SECONDS = 1;
const params = {
setParameter: {
- healthMonitoringIntensities: tojson({test: "non-critical", ldap: "off", dns: "off"}),
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "test", intensity: "non-critical"},
+ {type: "ldap", intensity: "off"},
+ {type: "dns", intensity: "off"}
+ ]
+ }),
healthMonitoringIntervals: tojson({test: 500}),
- progressMonitor: tojson({deadline: PROGRESS_TIMEOUT_SECONDS}),
+ progressMonitor:
+ tojson({interval: PROGRESS_TIMEOUT_SECONDS, deadline: PROGRESS_TIMEOUT_SECONDS}),
featureFlagHealthMonitoring: true
}
};
@@ -18,6 +28,8 @@ let st = new ShardingTest({
// After cluster startup, make sure both mongos's are available.
assert.commandWorked(st.s0.adminCommand({"ping": 1}));
assert.commandWorked(st.s1.adminCommand({"ping": 1}));
+assert.commandWorked(st.s1.adminCommand(
+ {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
// Set the failpoint on one of the mongos's to pause its healthchecks.
assert.commandWorked(
@@ -40,9 +52,13 @@ assert.soon(() => {
if (e.message.indexOf("network error") >= 0) {
return true;
} else {
- throw (e);
+ jsTestLog(`Failure: ${e}`);
+ sleep(1000);
+ return false;
}
}
+ sleep(1000);
+ return false;
}, "Pinging faulty mongos should fail with network error.", PROGRESS_TIMEOUT_SECONDS * 1000);
// Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
diff --git a/jstests/sharding/health_monitor/server_status_health.js b/jstests/sharding/health_monitor/server_status_health.js
index b224420b2f5..77c1c68485d 100644
--- a/jstests/sharding/health_monitor/server_status_health.js
+++ b/jstests/sharding/health_monitor/server_status_health.js
@@ -1,12 +1,26 @@
/**
* Tests server status has correct fault/facet information.
+ *
+ * @tags: [multiversion_incompatible]
*/
(function() {
'use strict';
+function changeObserverIntensity(observer, intensity) {
+ let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+ assert.commandWorked(
+ st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
const params = {
setParameter: {
- healthMonitoring: tojson({test: "off", ldap: "off", dns: "off"}),
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "test", intensity: "off"},
+ {type: "ldap", intensity: "off"},
+ {type: "dns", intensity: "off"}
+ ]
+ }),
featureFlagHealthMonitoring: true
}
};
@@ -23,8 +37,7 @@ print(tojson(result));
assert.eq(result.state, "Ok");
assert(result.enteredStateAtTime);
-assert.commandWorked(st.s0.adminCommand(
- {"setParameter": 1, healthMonitoring: {test: "critical", dns: 'off', ldap: 'off'}}));
+changeObserverIntensity('test', 'critical');
// Check server status after test health observer enabled and failpoint returns fault.
assert.commandWorked(st.s0.adminCommand({
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index be18f1e1795..dd6de2d4afc 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -88,6 +88,73 @@ void FaultManager::set(ServiceContext* svcCtx, std::unique_ptr<FaultManager> new
faultManager = std::move(newFaultManager);
}
+
+bool FaultManager::isInitialized() {
+ stdx::lock_guard lock(_stateMutex);
+ return _initialized;
+}
+
+
+// Start health checks if an observer is turned on via setParameter. Clean up if the observer is
+// turned off.
+void FaultManager::healthMonitoringIntensitiesUpdated(HealthObserverIntensities oldValue,
+ HealthObserverIntensities newValue) {
+ if (!hasGlobalServiceContext())
+ return;
+
+ auto manager = FaultManager::get(getGlobalServiceContext());
+ if (manager && manager->isInitialized()) {
+ auto cancellationToken = manager->_managerShuttingDownCancellationSource.token();
+ auto findByType =
+ [](const auto& values,
+ HealthObserverTypeEnum type) -> boost::optional<HealthObserverIntensitySetting> {
+ if (!values) {
+ return boost::none;
+ }
+ auto it = std::find_if(values->begin(),
+ values->end(),
+ [type](const HealthObserverIntensitySetting& setting) {
+ return setting.getType() == type;
+ });
+ if (it != values->end()) {
+ return *it;
+ }
+ return boost::none;
+ };
+
+ auto optionalNewValues = newValue.getValues();
+ if (!optionalNewValues) {
+ return; // Nothing was updated.
+ }
+ for (auto& setting : *optionalNewValues) {
+ auto oldSetting = findByType(oldValue.getValues(), setting.getType());
+ if (!oldSetting) {
+ continue;
+ }
+ if (cancellationToken.isCanceled()) {
+ break;
+ }
+ auto oldIntensity = oldSetting->getIntensity();
+ auto newIntensity = setting.getIntensity();
+ if (oldIntensity != newIntensity) {
+ if (oldIntensity == HealthObserverIntensityEnum::kOff) {
+ // off -> {critical, non-critical}
+ if (auto* observer =
+ manager->getHealthObserver(toFaultFacetType(setting.getType()));
+ observer != nullptr) {
+ manager->healthCheck(observer, cancellationToken);
+ }
+ } else if (newIntensity == HealthObserverIntensityEnum::kOff) {
+ // {critical, non-critical} -> off
+ // Resolve any faults for this observer with a synthetic health check result.
+ auto successfulHealthCheckResult = HealthCheckStatus(setting.getType());
+ manager->accept(successfulHealthCheckResult);
+ }
+ }
+ }
+ }
+}
+
FaultManager::TransientFaultDeadline::TransientFaultDeadline(
FaultManager* faultManager,
std::shared_ptr<executor::TaskExecutor> executor,
@@ -190,13 +257,15 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
});
- auto lk = stdx::lock_guard(_stateMutex);
- logMessageReceived(state(), status);
+ {
+ auto lk = stdx::lock_guard(_stateMutex);
+ logMessageReceived(state(), status);
- if (status.isActiveFault()) {
- _healthyObservations.erase(status.getType());
- } else {
- _healthyObservations.insert(status.getType());
+ if (status.isActiveFault()) {
+ _healthyObservations.erase(status.getType());
+ } else {
+ _healthyObservations.insert(status.getType());
+ }
}
updateWithCheckStatus(HealthCheckStatus(status));
@@ -210,8 +279,6 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
FaultState::kStartupCheck, FaultState::kStartupCheck, boost::none);
}
- // If the whole fault becomes resolved, garbage collect it
- // with proper locking.
std::shared_ptr<FaultInternal> faultToDelete;
{
auto lk = stdx::lock_guard(_mutex);
@@ -220,6 +287,7 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
}
}
+ auto lk = stdx::lock_guard(_stateMutex);
if (activeObserversTypes == _healthyObservations) {
return FaultState::kOk;
}
@@ -230,8 +298,10 @@ boost::optional<FaultState> FaultManager::handleOk(const OptionalMessageType& me
invariant(message);
HealthCheckStatus status = message.get();
- auto lk = stdx::lock_guard(_stateMutex);
- logMessageReceived(state(), status);
+ {
+ auto lk = stdx::lock_guard(_stateMutex);
+ logMessageReceived(state(), status);
+ }
if (!_config->isHealthObserverEnabled(status.getType())) {
return boost::none;
@@ -252,8 +322,11 @@ boost::optional<FaultState> FaultManager::handleTransientFault(const OptionalMes
}
HealthCheckStatus status = message.get();
- auto lk = stdx::lock_guard(_stateMutex);
- logMessageReceived(state(), status);
+
+ {
+ auto lk = stdx::lock_guard(_stateMutex);
+ logMessageReceived(state(), status);
+ }
updateWithCheckStatus(HealthCheckStatus(status));
@@ -264,6 +337,7 @@ boost::optional<FaultState> FaultManager::handleTransientFault(const OptionalMes
// If the whole fault becomes resolved, garbage collect it
// with proper locking.
+ auto lk = stdx::lock_guard(_mutex);
if (_fault && _fault->getFacets().empty()) {
_fault.reset();
return FaultState::kOk;
@@ -324,21 +398,25 @@ void FaultManager::schedulePeriodicHealthCheckThread() {
return;
}
- if (getActiveHealthObservers().size() == 0) {
- LOGV2_DEBUG(5936511, 2, "No active health observers are configured.");
+ auto observers = getActiveHealthObservers();
+ if (observers.size() == 0) {
+ LOGV2(5936511, "No active health observers are configured.");
setState(FaultState::kOk, HealthCheckStatus(FaultFacetType::kSystem));
+ return;
}
- auto observers = getHealthObservers();
+ str::stream listOfActiveObservers;
for (auto observer : observers) {
- LOGV2_DEBUG(
- 59365, 1, "starting health observer", "observerType"_attr = observer->getType());
+ LOGV2_DEBUG(5936501,
+ 1,
+ "starting health observer",
+ "observerType"_attr = str::stream() << observer->getType());
+ listOfActiveObservers << observer->getType() << " ";
- // TODO (SERVER-59368): The system should properly handle a health checker being turned
- // on/off
auto token = _managerShuttingDownCancellationSource.token();
healthCheck(observer, token);
}
+ LOGV2(5936804, "Health observers started", "detail"_attr = listOfActiveObservers);
}
FaultManager::~FaultManager() {
@@ -452,8 +530,9 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token
}
uassert(5936101,
- fmt::format("Failed to initialize periodic health check work. Reason: {}",
- periodicThreadCbHandleStatus.getStatus().codeString()),
+ str::stream() << "Failed to schedule periodic health check for "
+ << observer->getType() << ": "
+ << periodicThreadCbHandleStatus.getStatus().codeString(),
periodicThreadCbHandleStatus.isOK());
}
@@ -478,14 +557,12 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token
// If health observer is disabled, then do nothing and schedule another run (health observer may
// become enabled).
- // TODO (SERVER-59368): The system should properly handle a health checker being turned on/off
if (!_config->isHealthObserverEnabled(observer->getType())) {
schedulerCb();
return;
}
- // Run asynchronous health check. When complete, check for state transition (and perform if
- // necessary). Then schedule the next run.
+ // Run asynchronous health check. Send output to the state machine. Schedule next run.
auto healthCheckFuture = observer->periodicCheck(*this, _taskExecutor, token)
.thenRunOn(_taskExecutor)
.onCompletion([this, acceptNotOKStatus, schedulerCb](
@@ -564,13 +641,15 @@ void FaultManager::_init() {
_progressMonitor = std::make_unique<ProgressMonitor>(this, _svcCtx, _crashCb);
auto lk2 = stdx::lock_guard(_stateMutex);
- LOGV2(5956701,
- "Instantiated health observers, periodic health checking starts",
- "managerState"_attr = state(),
- "observersCount"_attr = _observers.size());
+ _initialized = true;
+ LOGV2_DEBUG(5956701,
+ 1,
+ "Instantiated health observers",
+ "managerState"_attr = str::stream() << state(),
+ "observersCount"_attr = _observers.size());
}
-std::vector<HealthObserver*> FaultManager::getHealthObservers() {
+std::vector<HealthObserver*> FaultManager::getHealthObservers() const {
std::vector<HealthObserver*> result;
stdx::lock_guard<Latch> lk(_mutex);
result.reserve(_observers.size());
@@ -581,7 +660,7 @@ std::vector<HealthObserver*> FaultManager::getHealthObservers() {
return result;
}
-std::vector<HealthObserver*> FaultManager::getActiveHealthObservers() {
+std::vector<HealthObserver*> FaultManager::getActiveHealthObservers() const {
auto allObservers = getHealthObservers();
std::vector<HealthObserver*> result;
result.reserve(allObservers.size());
@@ -593,6 +672,16 @@ std::vector<HealthObserver*> FaultManager::getActiveHealthObservers() {
return result;
}
+HealthObserver* FaultManager::getHealthObserver(FaultFacetType type) const {
+ stdx::lock_guard<Latch> lk(_mutex);
+ auto observerIt = std::find_if(
+ _observers.begin(), _observers.end(), [type](auto& o) { return o->getType() == type; });
+ if (observerIt != _observers.end()) {
+ return (*observerIt).get();
+ }
+ return nullptr;
+}
+
void FaultManager::progressMonitorCheckForTests(std::function<void(std::string cause)> crashCb) {
_progressMonitor->progressMonitorCheck(crashCb);
}
diff --git a/src/mongo/db/process_health/fault_manager.h b/src/mongo/db/process_health/fault_manager.h
index 597cfb54112..c627c18ee1d 100644
--- a/src/mongo/db/process_health/fault_manager.h
+++ b/src/mongo/db/process_health/fault_manager.h
@@ -93,12 +93,19 @@ public:
// specific flags.
SharedSemiFuture<void> startPeriodicHealthChecks();
+ bool isInitialized();
+
+
static FaultManager* get(ServiceContext* svcCtx);
// Replace the FaultManager for the 'svcCtx'. This functionality
// is exposed for testing and initial bootstrap.
static void set(ServiceContext* svcCtx, std::unique_ptr<FaultManager> newFaultManager);
+ // Signals that the intensity for a health observer has been updated.
+ static void healthMonitoringIntensitiesUpdated(HealthObserverIntensities oldValue,
+ HealthObserverIntensities newValue);
+
// Returns the current fault state for the server.
FaultState getFaultState() const;
@@ -107,7 +114,7 @@ public:
// All observers remain valid for the manager lifetime, thus returning
// just pointers is safe, as long as they are used while manager exists.
- std::vector<HealthObserver*> getHealthObservers();
+ std::vector<HealthObserver*> getHealthObservers() const;
// Gets the aggregate configuration for all process health environment.
FaultManagerConfig getConfig() const;
@@ -117,7 +124,8 @@ public:
protected:
// Returns all health observers not configured as Off
- std::vector<HealthObserver*> getActiveHealthObservers();
+ std::vector<HealthObserver*> getActiveHealthObservers() const;
+ HealthObserver* getHealthObserver(FaultFacetType type) const;
// Runs a particular health observer. Then attempts to transition states. Then schedules next
// run.
@@ -164,6 +172,7 @@ private:
mutable Mutex _stateMutex =
MONGO_MAKE_LATCH(HierarchicalAcquisitionLevel(0), "FaultManager::_stateMutex");
+ bool _initialized = false;
Date_t _lastTransitionTime;
// Responsible for transitioning the state of FaultManager to ActiveFault after a
diff --git a/src/mongo/db/process_health/fault_manager_config.cpp b/src/mongo/db/process_health/fault_manager_config.cpp
index 0243cee192a..c4a7442c67c 100644
--- a/src/mongo/db/process_health/fault_manager_config.cpp
+++ b/src/mongo/db/process_health/fault_manager_config.cpp
@@ -57,5 +57,19 @@ std::ostream& operator<<(std::ostream& os, const FaultState& state) {
return os << sb.stringData();
}
+// TODO(SERVER-62125): remove this conversion and use idl type everywhere
+FaultFacetType toFaultFacetType(HealthObserverTypeEnum type) {
+ switch (type) {
+ case HealthObserverTypeEnum::kLdap:
+ return FaultFacetType::kLdap;
+ case HealthObserverTypeEnum::kDns:
+ return FaultFacetType::kDns;
+ case HealthObserverTypeEnum::kTest:
+ return FaultFacetType::kTestObserver;
+ default:
+ MONGO_UNREACHABLE;
+ }
+}
+
} // namespace process_health
} // namespace mongo
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index d1d9007f29a..db218853da0 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -67,6 +67,9 @@ enum class FaultFacetType { kSystem, kMock1, kMock2, kTestObserver, kLdap, kDns
static const StringData FaultFacetTypeStrings[] = {
"kSystem", "kMock1", "kMock2", "kTestObserver", "kLdap", "kDns"};
+FaultFacetType toFaultFacetType(HealthObserverTypeEnum type);
+
+
static const StringData FaultFacetType_serializer(const FaultFacetType value) {
return FaultFacetTypeStrings[static_cast<int>(value)];
}
@@ -89,11 +92,43 @@ public:
HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) {
auto intensities = _getHealthObserverIntensities();
- if (type == FaultFacetType::kMock1 && _facetToIntensityMapForTest.contains(type)) {
- return _facetToIntensityMapForTest.at(type);
- }
- return _getPropertyByType(
- type, &intensities->_data, HealthObserverIntensityEnum::kCritical);
+
+ auto toObserverType = [](FaultFacetType type) -> boost::optional<HealthObserverTypeEnum> {
+ switch (type) {
+ case FaultFacetType::kLdap:
+ return HealthObserverTypeEnum::kLdap;
+ case FaultFacetType::kDns:
+ return HealthObserverTypeEnum::kDns;
+ case FaultFacetType::kTestObserver:
+ return HealthObserverTypeEnum::kTest;
+ default:
+ return boost::none;
+ }
+ };
+
+ auto getIntensity = [this, intensities, &toObserverType](FaultFacetType type) {
+ auto observerType = toObserverType(type);
+ if (observerType) {
+                auto values = intensities->_data->getValues();
+                if (values) {
+                    for (const auto& setting : *values) {
+ if (setting.getType() == observerType) {
+ return setting.getIntensity();
+ }
+ }
+ }
+ return HealthObserverIntensityEnum::kOff;
+ } else {
+ // TODO SERVER-61944: this is for kMock1 & kMock2. Remove this branch once mock
+ // types are deleted.
+ if (_facetToIntensityMapForTest.contains(type)) {
+ return _facetToIntensityMapForTest.at(type);
+ }
+ return HealthObserverIntensityEnum::kCritical;
+ }
+ };
+
+ return getIntensity(type);
}
bool isHealthObserverEnabled(FaultFacetType type) {
@@ -152,23 +187,31 @@ private:
template <typename T, typename R>
R _getPropertyByType(FaultFacetType type, synchronized_value<T>* data, R defaultValue) const {
+ // TODO: update this function with additional fault facets when they are added
+ boost::optional<R> result;
switch (type) {
case FaultFacetType::kLdap:
- return (*data)->getLdap();
+ result = (*data)->getLdap();
+ break;
case FaultFacetType::kDns:
- return (*data)->getDns();
+ result = (*data)->getDns();
+ break;
case FaultFacetType::kTestObserver:
- return (*data)->getTest();
+ result = (*data)->getTest();
+ break;
case FaultFacetType::kSystem:
- return defaultValue;
+ result = defaultValue;
+ break;
case FaultFacetType::kMock1:
- return defaultValue;
+ result = defaultValue;
+ break;
case FaultFacetType::kMock2:
- return defaultValue;
- // TODO: update this function with additional fault facets when they are added
+ result = defaultValue;
+ break;
default:
MONGO_UNREACHABLE;
}
+ return *result;
}
bool _periodicChecksDisabledForTests = false;
diff --git a/src/mongo/db/process_health/fault_manager_test.cpp b/src/mongo/db/process_health/fault_manager_test.cpp
index d216206c505..e3466699906 100644
--- a/src/mongo/db/process_health/fault_manager_test.cpp
+++ b/src/mongo/db/process_health/fault_manager_test.cpp
@@ -50,9 +50,9 @@ TEST(FaultManagerTest, Registration) {
TEST_F(FaultManagerTest, GetHealthObserverIntensity) {
auto config = manager().getConfig();
ASSERT(config.getHealthObserverIntensity(FaultFacetType::kLdap) ==
- HealthObserverIntensityEnum::kNonCritical);
+ HealthObserverIntensityEnum::kOff);
ASSERT(config.getHealthObserverIntensity(FaultFacetType::kDns) ==
- HealthObserverIntensityEnum::kNonCritical);
+ HealthObserverIntensityEnum::kOff);
}
} // namespace
diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h
index fc567c5b0fb..59d937846c7 100644
--- a/src/mongo/db/process_health/fault_manager_test_suite.h
+++ b/src/mongo/db/process_health/fault_manager_test_suite.h
@@ -236,7 +236,7 @@ public:
tickSource().advance(d);
}
- static inline const Seconds kWaitTimeout{30};
+ static inline const Seconds kWaitTimeout{10};
static inline const Milliseconds kSleepTime{1};
static inline const int kActiveFaultDurationSecs = 1;
@@ -251,7 +251,7 @@ public:
return;
sleepFor(kSleepTime);
}
- invariant(false);
+ ASSERT(false);
}
static inline const Milliseconds kCheckTimeIncrement{100};
diff --git a/src/mongo/db/process_health/health_check_status.h b/src/mongo/db/process_health/health_check_status.h
index 7d20016331d..5e5e26ff97d 100644
--- a/src/mongo/db/process_health/health_check_status.h
+++ b/src/mongo/db/process_health/health_check_status.h
@@ -60,6 +60,9 @@ public:
explicit HealthCheckStatus(FaultFacetType type)
: _type(type), _severity(0), _description("resolved"_sd) {}
+ explicit HealthCheckStatus(HealthObserverTypeEnum type)
+ : _type(toFaultFacetType(type)), _severity(0), _description("resolved"_sd) {}
+
HealthCheckStatus(const HealthCheckStatus&) = default;
HealthCheckStatus& operator=(const HealthCheckStatus&) = default;
HealthCheckStatus(HealthCheckStatus&&) = default;
@@ -114,6 +117,7 @@ private:
friend std::ostream& operator<<(std::ostream&, const HealthCheckStatus&);
friend StringBuilder& operator<<(StringBuilder& s, const HealthCheckStatus& hcs);
+
FaultFacetType _type;
double _severity;
std::string _description;
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
index 36d445cc386..8258d2da0e9 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
@@ -26,23 +26,66 @@
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
+#include <algorithm>
#include "mongo/bson/json.h"
+#include "mongo/db/process_health/fault_manager.h"
#include "mongo/db/process_health/health_monitoring_server_parameters_gen.h"
#include "mongo/db/process_health/health_observer.h"
namespace mongo {
+namespace {
+// Replaces values in oldIntensities with values in newIntensities while preserving all values in
+// oldIntensities not in newIntensities.
+HealthObserverIntensities mergeIntensities(const HealthObserverIntensities& oldIntensities,
+ const HealthObserverIntensities& newIntensities) {
+ using namespace std;
+ HealthObserverIntensities result = oldIntensities;
+ auto optionalOldValues = result.getValues();
+ auto optionalNewValues = newIntensities.getValues();
+ if (!optionalNewValues) {
+ return oldIntensities;
+ }
+ if (!optionalOldValues) {
+ result.setValues(*optionalNewValues);
+ return result;
+ }
+ for (const auto& setting : *optionalNewValues) {
+ auto it = find_if(begin(*optionalOldValues),
+ end(*optionalOldValues),
+ [&setting](const HealthObserverIntensitySetting& destSetting) {
+                              return destSetting.getType() == setting.getType();
+ });
+ if (it != optionalOldValues->end()) {
+ *it = setting;
+ } else {
+ optionalOldValues->emplace_back(setting);
+ }
+ }
+ result.setValues(*optionalOldValues);
+ return result;
+}
+} // namespace
+
Status HealthMonitoringIntensitiesServerParameter::setFromString(const std::string& value) {
- *_data = HealthObserverIntensities::parse(
+ auto oldValue = **_data;
+ auto newValue = HealthObserverIntensities::parse(
IDLParserErrorContext("health monitoring intensities"), fromjson(value));
+ newValue = mergeIntensities(oldValue, newValue);
+ process_health::FaultManager::healthMonitoringIntensitiesUpdated(oldValue, newValue);
+ **_data = newValue;
return Status::OK();
}
Status HealthMonitoringIntensitiesServerParameter::set(const BSONElement& newValueElement) {
- *_data = HealthObserverIntensities::parse(
+ auto oldValue = **_data;
+ auto newValue = HealthObserverIntensities::parse(
IDLParserErrorContext("health monitoring intensities"), newValueElement.Obj());
+ newValue = mergeIntensities(oldValue, newValue);
+ process_health::FaultManager::healthMonitoringIntensitiesUpdated(oldValue, newValue);
+ **_data = newValue;
return Status::OK();
}
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
index 66c17fd15df..c6b44e56701 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
@@ -42,24 +42,36 @@ enums:
kCritical: "critical"
kNonCritical: "non-critical"
+ HealthObserverType:
+ description: "Enum representing available health observer types"
+ type: string
+ values:
+ kLdap: "ldap"
+ kDns: "dns"
+ kTest: "test"
+
structs:
- HealthObserverIntensities:
- description: "A struct representing the health observer intensities."
+ HealthObserverIntensitySetting:
+ description: "One health observer intensity setting"
strict: true
fields:
- dns:
- description: "Intensity of DNS fault facet"
- type: HealthObserverIntensity
- default: kNonCritical
- ldap:
- description: "Intensity of LDAP fault facet"
- type: HealthObserverIntensity
- default: kNonCritical
- test:
- description: "Intensity of test fault facet"
+ type:
+ type: HealthObserverType
+ optional: false
+ intensity:
type: HealthObserverIntensity
+ optional: false
default: kOff
+ HealthObserverIntensities:
+ description: "A struct representing the health observer intensities."
+ strict: false
+ fields:
+ values:
+ description: "Array of health observer intensity settings"
+ type: array<HealthObserverIntensitySetting>
+ optional: true
+
HealthObserverIntervals:
description: "A struct representing the interval in milliseconds for each health observer."
strict: true
@@ -77,7 +89,7 @@ structs:
test:
description: "Test health observer health check interval."
type: int
- default: 1
+ default: 10
validator: { gt: 0 }
HealthObserverProgressMonitorConfig:
diff --git a/src/mongo/db/process_health/test_health_observer.cpp b/src/mongo/db/process_health/test_health_observer.cpp
index 254a70f217b..ae5747895ea 100644
--- a/src/mongo/db/process_health/test_health_observer.cpp
+++ b/src/mongo/db/process_health/test_health_observer.cpp
@@ -26,10 +26,13 @@
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kProcessHealth
#include "mongo/db/process_health/test_health_observer.h"
+
#include "mongo/db/commands/test_commands_enabled.h"
#include "mongo/db/process_health/health_observer_registration.h"
+#include "mongo/logv2/log.h"
namespace mongo {
namespace process_health {
@@ -37,6 +40,7 @@ MONGO_FAIL_POINT_DEFINE(hangTestHealthObserver);
MONGO_FAIL_POINT_DEFINE(testHealthObserver);
Future<HealthCheckStatus> TestHealthObserver::periodicCheckImpl(
PeriodicHealthCheckContext&& periodicCheckContext) {
+ LOGV2_DEBUG(5936801, 2, "Test health observer executing");
hangTestHealthObserver.pauseWhileSet();
auto result = Future<HealthCheckStatus>::makeReady(makeHealthyStatus());
@@ -50,6 +54,7 @@ Future<HealthCheckStatus> TestHealthObserver::periodicCheckImpl(
},
[&](const BSONObj& data) { return !data.isEmpty(); });
+ LOGV2_DEBUG(5936802, 2, "Test health observer returns", "result"_attr = result.get());
return result;
}
@@ -58,6 +63,7 @@ MONGO_INITIALIZER(TestHealthObserver)(InitializerContext*) {
// Failpoints can only be set when test commands are enabled, and so the test health observer
// is only useful in that case.
if (getTestCommandsEnabled()) {
+ LOGV2(5936803, "Test health observer instantiated");
HealthObserverRegistration::registerObserverFactory(
[](ServiceContext* svcCtx) { return std::make_unique<TestHealthObserver>(svcCtx); });
}