author     Davis Haupt <davis.haupt@mongodb.com>             2021-12-15 15:23:20 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-12-28 23:18:25 +0000
commit     86de78310d82cd0bafc6dbd5fcdbacfed6223c8d (patch)
tree       3eab87def2e3e2074b2a040362f525f4361de25e
parent     340c3773a0348b427b132023042d216cd701c5d9 (diff)
download   mongo-86de78310d82cd0bafc6dbd5fcdbacfed6223c8d.tar.gz
SERVER-61873 add configurable health observer parameters
-rw-r--r--  jstests/sharding/health_monitor/parameters.js                               87
-rw-r--r--  jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js   32
-rw-r--r--  src/mongo/db/process_health/fault_manager.cpp                                3
-rw-r--r--  src/mongo/db/process_health/fault_manager_config.h                          74
-rw-r--r--  src/mongo/db/process_health/health_monitoring_server_parameters.cpp         40
-rw-r--r--  src/mongo/db/process_health/health_monitoring_server_parameters.idl         54
-rw-r--r--  src/mongo/db/process_health/progress_monitor.cpp                            15
7 files changed, 239 insertions(+), 66 deletions(-)
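
At a glance, the commit introduces three server parameters, healthMonitoringIntensities, healthMonitoringIntervals, and progressMonitor, all settable at startup and at runtime. A minimal runtime sketch (not part of the commit) against a mongos shell, mirroring the new test below; the numeric values are illustrative only:

// Minimal usage sketch: read and update the new parameters at runtime.
const admin = db.getSiblingDB("admin");
admin.runCommand({getParameter: 1, healthMonitoringIntensities: 1, healthMonitoringIntervals: 1, progressMonitor: 1});
// Each value is a sub-document that replaces the stored document as a whole.
admin.runCommand({setParameter: 1, healthMonitoringIntensities: {dns: "non-critical", ldap: "off"}});
admin.runCommand({setParameter: 1, healthMonitoringIntervals: {dns: NumberInt(2000), ldap: NumberInt(600000)}});
admin.runCommand({setParameter: 1, progressMonitor: {interval: NumberInt(1000), deadline: NumberInt(300)}});
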
diff --git a/jstests/sharding/health_monitor/parameters.js b/jstests/sharding/health_monitor/parameters.js
new file mode 100644
index 00000000000..7f0a66be44a
--- /dev/null
+++ b/jstests/sharding/health_monitor/parameters.js
@@ -0,0 +1,87 @@
+(function() {
+'use strict';
+
+let CUSTOM_INTERVAL = 1337;
+let CUSTOM_DEADLINE = 5;
+
+var st = new ShardingTest({
+    mongos: [
+        {
+            setParameter: {
+                healthMonitoringIntensities: tojson({dns: "off", ldap: "critical", test: "off"}),
+            }
+        },
+        {
+            setParameter: {
+                healthMonitoringIntensities: tojson({dns: "off", ldap: "off"}),
+                progressMonitor: tojson({interval: CUSTOM_INTERVAL, deadline: CUSTOM_DEADLINE}),
+                healthMonitoringIntervals: tojson({test: CUSTOM_INTERVAL})
+            }
+        }
+    ],
+    shards: 1,
+});
+
+// Intensity parameter
+let result = st.s0.adminCommand({"getParameter": 1, "healthMonitoringIntensities": 1});
+assert.eq(result.healthMonitoringIntensities.dns, "off");
+assert.eq(result.healthMonitoringIntensities.ldap, "critical");
+
+assert.commandFailed(
+ st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {dns: "INVALID"}}));
+assert.commandFailed(
+ st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {invalid: "off"}}));
+
+assert.commandWorked(st.s0.adminCommand(
+ {"setParameter": 1, healthMonitoringIntensities: {dns: 'non-critical', ldap: 'off'}}));
+result =
+ assert.commandWorked(st.s0.adminCommand({"getParameter": 1, healthMonitoringIntensities: 1}));
+assert.eq(result.healthMonitoringIntensities.dns, "non-critical");
+assert.eq(result.healthMonitoringIntensities.ldap, "off");
+
+// Interval parameter
+result = st.s1.adminCommand({"getParameter": 1, "healthMonitoringIntervals": 1});
+assert.eq(result.healthMonitoringIntervals.test, CUSTOM_INTERVAL);
+
+assert.commandFailed(st.s1.adminCommand({"setParameter": 1, healthMonitoringIntervals: {dns: 0}}));
+assert.commandFailed(
+ st.s1.adminCommand({"setParameter": 1, healthMonitoringIntervals: {invalid: 1000}}));
+
+assert.commandWorked(st.s1.adminCommand({
+ "setParameter": 1,
+ healthMonitoringIntervals: {dns: NumberInt(2000), ldap: NumberInt(600000)}
+}));
+result =
+ assert.commandWorked(st.s1.adminCommand({"getParameter": 1, healthMonitoringIntervals: 1}));
+assert.eq(result.healthMonitoringIntervals.dns, 2000);
+assert.eq(result.healthMonitoringIntervals.ldap, 600000);
+
+// Check that custom liveness values were set properly.
+result = st.s1.adminCommand({"getParameter": 1, "progressMonitor": 1});
+assert.eq(result.progressMonitor.interval, CUSTOM_INTERVAL);
+assert.eq(result.progressMonitor.deadline, CUSTOM_DEADLINE);
+
+// Validation tests: intervals and deadlines must be greater than 0.
+assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {interval: 0}}));
+assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {interval: -5}}));
+assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {deadline: 0}}));
+assert.commandFailed(st.s1.adminCommand({"setParameter": 1, progressMonitor: {deadline: -5}}));
+
+// Setting parameter properly during runtime.
+assert.commandWorked(st.s1.adminCommand(
+ {"setParameter": 1, progressMonitor: {deadline: NumberInt(CUSTOM_DEADLINE + 1)}}));
+result = st.s1.adminCommand({"getParameter": 1, "progressMonitor": 1});
+assert.eq(result.progressMonitor.deadline, CUSTOM_DEADLINE + 1);
+// Setting only one sub-field will reset others to their default.
+assert.eq(result.progressMonitor.interval, 50);
+
+assert.commandWorked(st.s1.adminCommand({
+ "setParameter": 1,
+ progressMonitor:
+ {deadline: NumberInt(CUSTOM_DEADLINE + 1), interval: NumberInt(CUSTOM_INTERVAL)}
+}));
+result = st.s1.adminCommand({"getParameter": 1, "progressMonitor": 1});
+assert.eq(result.progressMonitor.deadline, CUSTOM_DEADLINE + 1);
+assert.eq(result.progressMonitor.interval, CUSTOM_INTERVAL);
+st.stop();
+}());
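One behavior the test above pins down is that these are whole-document parameters: a runtime setParameter that names only some sub-fields resets the rest to their IDL defaults. A small sketch of that behavior (the deadline value is arbitrary; the interval default of 50 ms comes from the IDL further down):

// Sketch: partial updates replace the whole progressMonitor document, so
// omitted fields fall back to their defaults (interval -> 50 ms).
const admin = db.getSiblingDB("admin");
assert.commandWorked(admin.runCommand({setParameter: 1, progressMonitor: {deadline: NumberInt(600)}}));
const res = assert.commandWorked(admin.runCommand({getParameter: 1, progressMonitor: 1}));
assert.eq(res.progressMonitor.deadline, 600);
assert.eq(res.progressMonitor.interval, 50);  // reset to the default
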
diff --git a/jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js b/jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js
deleted file mode 100644
index 236dcaeb31d..00000000000
--- a/jstests/sharding/health_monitor/set_parameter_health_monitor_intensity.js
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
-(function() {
-'use strict';
-
-var st = new ShardingTest({
-    mongos: [{
-        setParameter: {
-            healthMonitoring: tojson({dns: "off", ldap: "critical"}),
-            featureFlagHealthMonitoring: true
-        }
-    }],
-    shards: 1,
-});
-
-var result = st.s0.adminCommand({"getParameter": 1, "healthMonitoring": 1});
-print(tojson(result));
-assert.eq(result.healthMonitoring.dns, "off");
-assert.eq(result.healthMonitoring.ldap, "critical");
-
-assert.commandFailed(st.s0.adminCommand({"setParameter": 1, healthMonitoring: {dns: "INVALID"}}));
-assert.commandFailed(st.s0.adminCommand({"setParameter": 1, healthMonitoring: {invalid: "off"}}));
-
-assert.commandWorked(
- st.s0.adminCommand({"setParameter": 1, healthMonitoring: {dns: 'non-critical', ldap: 'off'}}));
-var result = st.s0.adminCommand({"getParameter": 1, healthMonitoring: 1});
-print(tojson(result));
-assert.eq(result.healthMonitoring.dns, "non-critical");
-assert.eq(result.healthMonitoring.ldap, "off");
-
-st.stop();
-}());
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index 9b52ee34247..276e57c47b0 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -420,7 +420,8 @@ FaultFacetsContainerPtr FaultManager::getOrCreateFaultFacetsContainer() {
void FaultManager::healthCheck(HealthObserver* observer, std::shared_ptr<AtomicWord<bool>> token) {
auto schedulerCb = [this, observer, token] {
- auto scheduledTime = _taskExecutor->now() + _config->kPeriodicHealthCheckInterval +
+ auto scheduledTime = _taskExecutor->now() +
+ _config->getPeriodicHealthCheckInterval(observer->getType()) +
std::min(observer->healthCheckJitter(),
FaultManagerConfig::kPeriodicHealthCheckMaxJitter);
LOGV2_DEBUG(5939701,
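With this change the scheduling delay becomes per-observer: the next check fires at roughly now + the observer's configured interval + its jitter, with jitter capped at kPeriodicHealthCheckMaxJitter (100 ms). A toy illustration of that arithmetic, with hypothetical values:

// Illustration only: when the next health check for one observer would run.
const intervalMs = 2000;               // hypothetical healthMonitoringIntervals.dns value
const jitterMs = Math.min(37, 100);    // observer-supplied jitter, capped at 100 ms
const nextCheckAt = new Date(Date.now() + intervalMs + jitterMs);
print("next check at: " + nextCheckAt);
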
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 8542dbf9ca5..1ba938a3d84 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -73,36 +73,16 @@ static const StringData FaultFacetType_serializer(const FaultFacetType value) {
class FaultManagerConfig {
public:
- /* Default value of time between health checks
- * TODO SERVER-61947 make this a property of health observers
- */
- static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(1000)};
-
/* Maximum possible jitter added to the time between health checks */
static auto inline constexpr kPeriodicHealthCheckMaxJitter{Milliseconds{100}};
HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) {
- auto intensities = getHealthObserverIntensities();
- switch (type) {
- case FaultFacetType::kLdap:
- return intensities->_data->getLdap();
- case FaultFacetType::kDns:
- return intensities->_data->getDns();
- // TODO: update this function with additional fault facets when they are added
- case FaultFacetType::kTestObserver:
- return intensities->_data->getTest();
- case FaultFacetType::kSystem:
- return HealthObserverIntensityEnum::kCritical;
- case FaultFacetType::kMock1:
- if (_facetToIntensityMapForTest.contains(type)) {
- return _facetToIntensityMapForTest.at(type);
- }
- return HealthObserverIntensityEnum::kCritical;
- case FaultFacetType::kMock2:
- return HealthObserverIntensityEnum::kCritical;
- default:
- MONGO_UNREACHABLE;
+ auto intensities = _getHealthObserverIntensities();
+ if (type == FaultFacetType::kMock1 && _facetToIntensityMapForTest.contains(type)) {
+ return _facetToIntensityMapForTest.at(type);
}
+ return _getPropertyByType(
+ type, &intensities->_data, HealthObserverIntensityEnum::kCritical);
}
bool isHealthObserverEnabled(FaultFacetType type) {
@@ -119,16 +99,17 @@ public:
return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load()));
}
- Milliseconds getPeriodicHealthCheckInterval() const {
- return kPeriodicHealthCheckInterval;
+ Milliseconds getPeriodicHealthCheckInterval(FaultFacetType type) const {
+ auto intervals = _getHealthObserverIntervals();
+ return Milliseconds(_getPropertyByType(type, &intervals->_data, 1000));
}
Milliseconds getPeriodicLivenessCheckInterval() const {
- return Milliseconds(50);
+ return Milliseconds(_getLivenessConfig()->_data->getInterval());
}
Seconds getPeriodicLivenessDeadline() const {
- return Seconds(300);
+ return Seconds(_getLivenessConfig()->_data->getDeadline());
}
/** @returns true if the periodic checks are disabled for testing purposes. This is
@@ -143,9 +124,40 @@ public:
}
private:
- static HealthMonitoringIntensitiesServerParameter* getHealthObserverIntensities() {
+ static HealthMonitoringIntensitiesServerParameter* _getHealthObserverIntensities() {
return ServerParameterSet::getGlobal()->get<HealthMonitoringIntensitiesServerParameter>(
- "healthMonitoring");
+ "healthMonitoringIntensities");
+ }
+
+ static PeriodicHealthCheckIntervalsServerParameter* _getHealthObserverIntervals() {
+ return ServerParameterSet::getGlobal()->get<PeriodicHealthCheckIntervalsServerParameter>(
+ "healthMonitoringIntervals");
+ }
+
+ static HealthMonitoringProgressMonitorServerParameter* _getLivenessConfig() {
+ return ServerParameterSet::getGlobal()->get<HealthMonitoringProgressMonitorServerParameter>(
+ "progressMonitor");
+ }
+
+ template <typename T, typename R>
+ R _getPropertyByType(FaultFacetType type, synchronized_value<T>* data, R defaultValue) const {
+ switch (type) {
+ case FaultFacetType::kLdap:
+ return (*data)->getLdap();
+ case FaultFacetType::kDns:
+ return (*data)->getDns();
+ case FaultFacetType::kTestObserver:
+ return (*data)->getTest();
+ case FaultFacetType::kSystem:
+ return defaultValue;
+ case FaultFacetType::kMock1:
+ return defaultValue;
+ case FaultFacetType::kMock2:
+ return defaultValue;
+ // TODO: update this function with additional fault facets when they are added
+ default:
+ MONGO_UNREACHABLE;
+ }
}
bool _periodicChecksDisabledForTests = false;
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
index f4aefdf940b..36d445cc386 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
@@ -54,4 +54,44 @@ void HealthMonitoringIntensitiesServerParameter::append(OperationContext*,
b.append(name, healthMonitoring.obj());
}
+Status HealthMonitoringProgressMonitorServerParameter::setFromString(const std::string& value) {
+ *_data = HealthObserverProgressMonitorConfig::parse(
+ IDLParserErrorContext("health monitoring liveness"), fromjson(value));
+ return Status::OK();
+}
+
+Status HealthMonitoringProgressMonitorServerParameter::set(const BSONElement& newValueElement) {
+ *_data = HealthObserverProgressMonitorConfig::parse(
+ IDLParserErrorContext("health monitoring liveness"), newValueElement.Obj());
+ return Status::OK();
+}
+
+void HealthMonitoringProgressMonitorServerParameter::append(OperationContext*,
+ BSONObjBuilder& b,
+ const std::string& name) {
+ BSONObjBuilder healthMonitoring;
+ _data->serialize(&healthMonitoring);
+ b.append(name, healthMonitoring.obj());
+}
+
+Status PeriodicHealthCheckIntervalsServerParameter::setFromString(const std::string& value) {
+ *_data = HealthObserverIntervals::parse(IDLParserErrorContext("health monitoring intervals"),
+ fromjson(value));
+ return Status::OK();
+}
+
+Status PeriodicHealthCheckIntervalsServerParameter::set(const BSONElement& newValueElement) {
+ *_data = HealthObserverIntervals::parse(IDLParserErrorContext("health monitoring intervals"),
+ newValueElement.Obj());
+ return Status::OK();
+}
+
+void PeriodicHealthCheckIntervalsServerParameter::append(OperationContext*,
+ BSONObjBuilder& b,
+ const std::string& name) {
+ BSONObjBuilder healthMonitoring;
+ _data->serialize(&healthMonitoring);
+ b.append(name, healthMonitoring.obj());
+}
+
} // namespace mongo
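For context, setFromString() handles the startup form of these parameters, where the value arrives as a JSON string and is parsed with fromjson(), while set() handles the BSON object a runtime setParameter supplies. A hedged sketch of the two forms (the startup command line is illustrative only):

// Startup form: a JSON string, e.g. on the command line (illustrative):
//   mongos --setParameter 'healthMonitoringIntervals={"dns": 2000, "ldap": 600000}' ...
// or from a ShardingTest, as the new jstest does, via tojson(...).
//
// Runtime form: a BSON object, handled by set():
const admin = db.getSiblingDB("admin");
admin.runCommand({setParameter: 1, healthMonitoringIntervals: {dns: NumberInt(2000), ldap: NumberInt(600000)}});
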
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
index 779c9370d90..66c17fd15df 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
@@ -60,14 +60,66 @@ structs:
                 type: HealthObserverIntensity
                 default: kOff
+    HealthObserverIntervals:
+        description: "A struct representing the interval in milliseconds for each health observer."
+        strict: true
+        fields:
+            dns:
+                description: "DNS health check interval."
+                type: int
+                default: 1000
+                validator: { gt: 0 }
+            ldap:
+                description: "LDAP health check interval."
+                type: int
+                default: 10000
+                validator: { gt: 0 }
+            test:
+                description: "Test health observer health check interval."
+                type: int
+                default: 1
+                validator: { gt: 0 }
+
+    HealthObserverProgressMonitorConfig:
+        description: "A struct representing configuration for health observer liveness checks."
+        strict: true
+        fields:
+            interval:
+                description: "Interval between liveness checks in milliseconds."
+                type: int
+                default: 50
+                validator: { gt: 0 }
+            deadline:
+                description: "Deadline in seconds for liveness checks, after which the process should exit."
+                type: int
+                default: 300
+                validator: { gt: 0 }
+
 server_parameters:
-    healthMonitoring:
+    healthMonitoringIntensities:
         set_at: ["startup", "runtime"]
         description: "A server parameter for specifying the intensity of fault facets."
         cpp_class:
             name: "HealthMonitoringIntensitiesServerParameter"
             data: "synchronized_value<HealthObserverIntensities>"
         override_set: true
+
+    progressMonitor:
+        set_at: ["startup", "runtime"]
+        description: "A server parameter for configuring health observer liveness (progress monitor) checks."
+        cpp_class:
+            name: "HealthMonitoringProgressMonitorServerParameter"
+            data: "synchronized_value<HealthObserverProgressMonitorConfig>"
+        override_set: true
+
+    healthMonitoringIntervals:
+        set_at: [startup, runtime]
+        description: "A server parameter for specifying the interval in milliseconds between health checks."
+        cpp_class:
+            name: "PeriodicHealthCheckIntervalsServerParameter"
+            data: "synchronized_value<HealthObserverIntervals>"
+        override_set: true
+
     activeFaultDurationSecs:
         description: "A server parameter for specifying the duration after which we transition to active fault."
         set_at: [startup, runtime]
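Taken together, the IDL defaults above mean an unconfigured node should report dns: 1000, ldap: 10000, test: 1 for the intervals and interval: 50, deadline: 300 for the progress monitor, and the gt: 0 validators reject non-positive values. A sketch, assuming nothing has been overridden:

// Sketch: expected defaults (per the IDL above) and validator behavior.
const admin = db.getSiblingDB("admin");
const res = assert.commandWorked(
    admin.runCommand({getParameter: 1, healthMonitoringIntervals: 1, progressMonitor: 1}));
assert.eq(res.healthMonitoringIntervals.ldap, 10000);
assert.eq(res.progressMonitor.deadline, 300);
assert.commandFailed(admin.runCommand({setParameter: 1, progressMonitor: {interval: NumberInt(0)}}));
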
diff --git a/src/mongo/db/process_health/progress_monitor.cpp b/src/mongo/db/process_health/progress_monitor.cpp
index 7c2fd4fe221..5eecd886cce 100644
--- a/src/mongo/db/process_health/progress_monitor.cpp
+++ b/src/mongo/db/process_health/progress_monitor.cpp
@@ -91,9 +91,22 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
if (secondPass.empty()) {
return;
}
+
+ auto longestIntervalHealthObserver = *std::max_element(
+ secondPass.begin(), secondPass.end(), [&](const auto& lhs, const auto& rhs) {
+ auto lhs_interval =
+ _faultManager->getConfig().getPeriodicHealthCheckInterval(lhs->getType());
+ auto rhs_interval =
+ _faultManager->getConfig().getPeriodicHealthCheckInterval(rhs->getType());
+ return lhs_interval < rhs_interval;
+ });
+
+ auto longestInterval = _faultManager->getConfig().getPeriodicHealthCheckInterval(
+ longestIntervalHealthObserver->getType());
+
+ sleepFor(longestInterval * 2);
// The observer is enabled but did not run for a while. Sleep two cycles
// and check again. Note: this should be rare.
- sleepFor(_faultManager->getConfig().getPeriodicHealthCheckInterval() * 2);
for (auto observer : secondPass) {
const auto stats = observer->getStats();
if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType()) &&
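
The second-pass wait is now derived from configuration rather than a fixed interval: the monitor sleeps for twice the longest check interval among the observers that looked stuck on the first pass. A small illustration of the arithmetic, with hypothetical interval values:

// Illustration only: second-pass sleep for a hypothetical set of stuck observers.
const intervals = {dns: 2000, ldap: 600000};             // ms, hypothetical configured values
const longest = Math.max(intervals.dns, intervals.ldap);
const secondPassSleepMs = longest * 2;                   // 1200000 ms, i.e. 20 minutes
print("second-pass sleep: " + secondPassSleepMs + " ms");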