diff options
author | Davis Haupt <davis.haupt@mongodb.com> | 2021-12-15 15:23:20 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-28 23:18:25 +0000 |
commit | 86de78310d82cd0bafc6dbd5fcdbacfed6223c8d (patch) | |
tree | 3eab87def2e3e2074b2a040362f525f4361de25e | |
parent | 340c3773a0348b427b132023042d216cd701c5d9 (diff) | |
download | mongo-86de78310d82cd0bafc6dbd5fcdbacfed6223c8d.tar.gz |
SERVER-61873 add configurable health observer parameters
7 files changed, 239 insertions, 66 deletions
diff --git a/jstests/sharding/health_monitor/parameters.js b/jstests/sharding/health_monitor/parameters.js new file mode 100644 index 00000000000..7f0a66be44a --- /dev/null +++ b/jstests/sharding/health_monitor/parameters.js @@ -0,0 +1,87 @@ +(function() { +'use strict'; + +let CUSTOM_INTERVAL = 1337; +let CUSTOM_DEADLINE = 5; + +var st = new ShardingTest({ + mongos: [ + { + setParameter: { + healthMonitoringIntensities: tojson({dns: "off", ldap: "critical", test: "off"}), + } + }, + { + setParameter: { + healthMonitoringIntensities: tojson({dns: "off", ldap: "off"}), + progressMonitor: tojson({interval: CUSTOM_INTERVAL, deadline: CUSTOM_DEADLINE}), + healthMonitoringIntervals: tojson({test: CUSTOM_INTERVAL}) + } + } + ], + shards: 1, +}); + +// Intensity parameter +let result = st.s0.adminCommand({"getParameter": 1, "healthMonitoringIntensities": 1}); +assert.eq(result.healthMonitoringIntensities.dns, "off"); +assert.eq(result.healthMonitoringIntensities.ldap, "critical"); + +assert.commandFailed( + st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {dns: "INVALID"}})); +assert.commandFailed( + st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {invalid: "off"}})); + +assert.commandWorked(st.s0.adminCommand( + {"setParameter": 1, healthMonitoringIntensities: {dns: 'non-critical', ldap: 'off'}})); +result = + assert.commandWorked(st.s0.adminCommand({"getParameter": 1, healthMonitoringIntensities: 1})); +assert.eq(result.healthMonitoringIntensities.dns, "non-critical"); +assert.eq(result.healthMonitoringIntensities.ldap, "off"); + +// Interval parameter +result = st.s1.adminCommand({"getParameter": 1, "healthMonitoringIntervals": 1}); +assert.eq(result.healthMonitoringIntervals.test, CUSTOM_INTERVAL); + +assert.commandFailed(st.s1.adminCommand({"setParameter": 1, healthMonitoringIntervals: {dns: 0}})); +assert.commandFailed( + st.s1.adminCommand({"setParameter": 1, healthMonitoringIntervals: {invalid: 1000}})); + 
+assert.commandWorked(st.s1.adminCommand({ + "setParameter": 1, + healthMonitoringIntervals: {dns: NumberInt(2000), ldap: NumberInt(600000)} +})); +result = + assert.commandWorked(st.s1.adminCommand({"getParameter": 1, healthMonitoringIntervals: 1})); +assert.eq(result.healthMonitoringIntervals.dns, 2000); +assert.eq(result.healthMonitoringIntervals.ldap, 600000); + +// Check that custom liveness values were set properly. +result = st.s1.adminCommand({"getParameter": 1, "progressMonitor": 1}); +assert.eq(result.progressMonitor.interval, CUSTOM_INTERVAL); +assert.eq(result.progressMonitor.deadline, CUSTOM_DEADLINE); + +// Validation tests: intervals must be > 0. +assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {interval: 0}})); +assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {interval: -5}})); +assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {deadline: 0}})); +assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {deadline: -5}})); + +// Setting parameter properly during runtime. +assert.commandWorked(st.s1.adminCommand( + {"setParameter": 1, progressMonitor: {deadline: NumberInt(CUSTOM_DEADLINE + 1)}})); +result = st.s1.adminCommand({"getParameter": 1, "progressMonitor": 1}); +assert.eq(result.progressMonitor.deadline, CUSTOM_DEADLINE + 1); +// Setting only one sub-field will reset others to their default. 
+assert.eq(result.progressMonitor.interval, 50); + +assert.commandWorked(st.s1.adminCommand({ + "setParameter": 1, + progressMonitor: + {deadline: NumberInt(CUSTOM_DEADLINE + 1), interval: NumberInt(CUSTOM_INTERVAL)} +})); +result = st.s1.adminCommand({"getParameter": 1, "progressMonitor": 1}); +assert.eq(result.progressMonitor.deadline, CUSTOM_DEADLINE + 1); +assert.eq(result.progressMonitor.interval, CUSTOM_INTERVAL); +st.stop(); +}()); diff --git a/jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js b/jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js deleted file mode 100644 index 236dcaeb31d..00000000000 --- a/jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js +++ /dev/null @@ -1,32 +0,0 @@ - - -(function() { -'use strict'; - -var st = new ShardingTest({ - mongos: [{ - setParameter: { - healthMonitoring: tojson({dns: "off", ldap: "critical"}), - featureFlagHealthMonitoring: true - } - }], - shards: 1, -}); - -var result = st.s0.adminCommand({"getParameter": 1, "healthMonitoring": 1}); -print(tojson(result)); -assert.eq(result.healthMonitoring.dns, "off"); -assert.eq(result.healthMonitoring.ldap, "critical"); - -assert.commandFailed(st.s0.adminCommand({"setParameter": 1, healthMonitoring: {dns: "INVALID"}})); -assert.commandFailed(st.s0.adminCommand({"setParameter": 1, healthMonitoring: {invalid: "off"}})); - -assert.commandWorked( - st.s0.adminCommand({"setParameter": 1, healthMonitoring: {dns: 'non-critical', ldap: 'off'}})); -var result = st.s0.adminCommand({"getParameter": 1, healthMonitoring: 1}); -print(tojson(result)); -assert.eq(result.healthMonitoring.dns, "non-critical"); -assert.eq(result.healthMonitoring.ldap, "off"); - -st.stop(); -}()); diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp index 9b52ee34247..276e57c47b0 100644 --- a/src/mongo/db/process_health/fault_manager.cpp +++ b/src/mongo/db/process_health/fault_manager.cpp 
@@ -420,7 +420,8 @@ FaultFacetsContainerPtr FaultManager::getOrCreateFaultFacetsContainer() { void FaultManager::healthCheck(HealthObserver* observer, std::shared_ptr<AtomicWord<bool>> token) { auto schedulerCb = [this, observer, token] { - auto scheduledTime = _taskExecutor->now() + _config->kPeriodicHealthCheckInterval + + auto scheduledTime = _taskExecutor->now() + + _config->getPeriodicHealthCheckInterval(observer->getType()) + std::min(observer->healthCheckJitter(), FaultManagerConfig::kPeriodicHealthCheckMaxJitter); LOGV2_DEBUG(5939701, diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h index 8542dbf9ca5..1ba938a3d84 100644 --- a/src/mongo/db/process_health/fault_manager_config.h +++ b/src/mongo/db/process_health/fault_manager_config.h @@ -73,36 +73,16 @@ static const StringData FaultFacetType_serializer(const FaultFacetType value) { class FaultManagerConfig { public: - /* Default value of time between health checks - * TODO SERVER-61947 make this a property of health observers - */ - static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(1000)}; - /* Maximum possible jitter added to the time between health checks */ static auto inline constexpr kPeriodicHealthCheckMaxJitter{Milliseconds{100}}; HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) { - auto intensities = getHealthObserverIntensities(); - switch (type) { - case FaultFacetType::kLdap: - return intensities->_data->getLdap(); - case FaultFacetType::kDns: - return intensities->_data->getDns(); - // TODO: update this function with additional fault facets when they are added - case FaultFacetType::kTestObserver: - return intensities->_data->getTest(); - case FaultFacetType::kSystem: - return HealthObserverIntensityEnum::kCritical; - case FaultFacetType::kMock1: - if (_facetToIntensityMapForTest.contains(type)) { - return _facetToIntensityMapForTest.at(type); - } - return 
HealthObserverIntensityEnum::kCritical; - case FaultFacetType::kMock2: - return HealthObserverIntensityEnum::kCritical; - default: - MONGO_UNREACHABLE; + auto intensities = _getHealthObserverIntensities(); + if (type == FaultFacetType::kMock1 && _facetToIntensityMapForTest.contains(type)) { + return _facetToIntensityMapForTest.at(type); } + return _getPropertyByType( + type, &intensities->_data, HealthObserverIntensityEnum::kCritical); } bool isHealthObserverEnabled(FaultFacetType type) { @@ -119,16 +99,17 @@ public: return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load())); } - Milliseconds getPeriodicHealthCheckInterval() const { - return kPeriodicHealthCheckInterval; + Milliseconds getPeriodicHealthCheckInterval(FaultFacetType type) const { + auto intervals = _getHealthObserverIntervals(); + return Milliseconds(_getPropertyByType(type, &intervals->_data, 1000)); } Milliseconds getPeriodicLivenessCheckInterval() const { - return Milliseconds(50); + return Milliseconds(_getLivenessConfig()->_data->getInterval()); } Seconds getPeriodicLivenessDeadline() const { - return Seconds(300); + return Seconds(_getLivenessConfig()->_data->getDeadline()); } /** @returns true if the periodic checks are disabled for testing purposes. 
This is @@ -143,9 +124,40 @@ public: } private: - static HealthMonitoringIntensitiesServerParameter* getHealthObserverIntensities() { + static HealthMonitoringIntensitiesServerParameter* _getHealthObserverIntensities() { return ServerParameterSet::getGlobal()->get<HealthMonitoringIntensitiesServerParameter>( - "healthMonitoring"); + "healthMonitoringIntensities"); + } + + static PeriodicHealthCheckIntervalsServerParameter* _getHealthObserverIntervals() { + return ServerParameterSet::getGlobal()->get<PeriodicHealthCheckIntervalsServerParameter>( + "healthMonitoringIntervals"); + } + + static HealthMonitoringProgressMonitorServerParameter* _getLivenessConfig() { + return ServerParameterSet::getGlobal()->get<HealthMonitoringProgressMonitorServerParameter>( + "progressMonitor"); + } + + template <typename T, typename R> + R _getPropertyByType(FaultFacetType type, synchronized_value<T>* data, R defaultValue) const { + switch (type) { + case FaultFacetType::kLdap: + return (*data)->getLdap(); + case FaultFacetType::kDns: + return (*data)->getDns(); + case FaultFacetType::kTestObserver: + return (*data)->getTest(); + case FaultFacetType::kSystem: + return defaultValue; + case FaultFacetType::kMock1: + return defaultValue; + case FaultFacetType::kMock2: + return defaultValue; + // TODO: update this function with additional fault facets when they are added + default: + MONGO_UNREACHABLE; + } } bool _periodicChecksDisabledForTests = false; diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp index f4aefdf940b..36d445cc386 100644 --- a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp +++ b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp @@ -54,4 +54,44 @@ void HealthMonitoringIntensitiesServerParameter::append(OperationContext*, b.append(name, healthMonitoring.obj()); } +Status 
HealthMonitoringProgressMonitorServerParameter::setFromString(const std::string& value) { + *_data = HealthObserverProgressMonitorConfig::parse( + IDLParserErrorContext("health monitoring liveness"), fromjson(value)); + return Status::OK(); +} + +Status HealthMonitoringProgressMonitorServerParameter::set(const BSONElement& newValueElement) { + *_data = HealthObserverProgressMonitorConfig::parse( + IDLParserErrorContext("health monitoring liveness"), newValueElement.Obj()); + return Status::OK(); +} + +void HealthMonitoringProgressMonitorServerParameter::append(OperationContext*, + BSONObjBuilder& b, + const std::string& name) { + BSONObjBuilder healthMonitoring; + _data->serialize(&healthMonitoring); + b.append(name, healthMonitoring.obj()); +} + +Status PeriodicHealthCheckIntervalsServerParameter::setFromString(const std::string& value) { + *_data = HealthObserverIntervals::parse(IDLParserErrorContext("health monitoring intervals"), + fromjson(value)); + return Status::OK(); +} + +Status PeriodicHealthCheckIntervalsServerParameter::set(const BSONElement& newValueElement) { + *_data = HealthObserverIntervals::parse(IDLParserErrorContext("health monitoring intervals"), + newValueElement.Obj()); + return Status::OK(); +} + +void PeriodicHealthCheckIntervalsServerParameter::append(OperationContext*, + BSONObjBuilder& b, + const std::string& name) { + BSONObjBuilder healthMonitoring; + _data->serialize(&healthMonitoring); + b.append(name, healthMonitoring.obj()); +} + } // namespace mongo diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl index 779c9370d90..66c17fd15df 100644 --- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl +++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl @@ -60,14 +60,66 @@ structs: type: HealthObserverIntensity default: kOff + HealthObserverIntervals: + description: "A struct representing the interval in 
milliseconds for each health observer." + strict: true + fields: + dns: + description: "DNS health check interval." + type: int + default: 1000 + validator: { gt: 0 } + ldap: + description: "LDAP health check interval." + type: int + default: 10000 + validator: { gt: 0 } + test: + description: "Test health observer health check interval." + type: int + default: 1 + validator: { gt: 0 } + + HealthObserverProgressMonitorConfig: + description: "A struct representing configuration for health observer liveness checks." + strict: true + fields: + interval: + description: "Interval between liveness checks in milliseconds." + type: int + default: 50 + validator: { gt: 0 } + deadline: + description: "Deadline for liveness checks, after which process should exit, in seconds." + type: int + default: 300 + validator: { gt: 0 } + server_parameters: - healthMonitoring: + healthMonitoringIntensities: set_at: ["startup", "runtime"] description: "A server parameter for specifying the intensity of fault facets." cpp_class: name: "HealthMonitoringIntensitiesServerParameter" data: "synchronized_value<HealthObserverIntensities>" override_set: true + + progressMonitor: + set_at: ["startup", "runtime"] + description: "A server parameter for specifying the progress monitor (liveness check) configuration for health monitoring." + cpp_class: + name: "HealthMonitoringProgressMonitorServerParameter" + data: "synchronized_value<HealthObserverProgressMonitorConfig>" + override_set: true + + healthMonitoringIntervals: + set_at: [startup, runtime] + description: "A server parameter for specifying the interval in milliseconds between health checks." + cpp_class: + name: "PeriodicHealthCheckIntervalsServerParameter" + data: "synchronized_value<HealthObserverIntervals>" + override_set: true + activeFaultDurationSecs: description: "A server parameter for specifying the duration after which we transition to active fault." 
set_at: [startup, runtime] diff --git a/src/mongo/db/process_health/progress_monitor.cpp b/src/mongo/db/process_health/progress_monitor.cpp index 7c2fd4fe221..5eecd886cce 100644 --- a/src/mongo/db/process_health/progress_monitor.cpp +++ b/src/mongo/db/process_health/progress_monitor.cpp @@ -91,9 +91,22 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause) if (secondPass.empty()) { return; } + + auto longestIntervalHealthObserver = *std::max_element( + secondPass.begin(), secondPass.end(), [&](const auto& lhs, const auto& rhs) { + auto lhs_interval = + _faultManager->getConfig().getPeriodicHealthCheckInterval(lhs->getType()); + auto rhs_interval = + _faultManager->getConfig().getPeriodicHealthCheckInterval(rhs->getType()); + return lhs_interval < rhs_interval; + }); + + auto longestInterval = _faultManager->getConfig().getPeriodicHealthCheckInterval( + longestIntervalHealthObserver->getType()); + + sleepFor(longestInterval * 2); // The observer is enabled but did not run for a while. Sleep two cycles // and check again. Note: this should be rare. - sleepFor(_faultManager->getConfig().getPeriodicHealthCheckInterval() * 2); for (auto observer : secondPass) { const auto stats = observer->getStats(); if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType()) && |