author     LaMont Nelson <lamont.nelson@mongodb.com>        2022-01-25 03:15:50 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-01-25 03:43:11 +0000
commit     c438071c21ea9241ad451c2af8d5766f07f5debe (patch)
tree       5018a66faa35fd7b2c0bec11e8ae64d569fef1ba
parent     30b290f315d0c30b44b4f6c746e5e8235c66f7c9 (diff)
download   mongo-c438071c21ea9241ad451c2af8d5766f07f5debe.tar.gz
SERVER-62904: Fault Manager progress checker should not fault unless the offending health observer is marked critical
-rw-r--r--  jstests/sharding/health_monitor/progress_monitor.js                               21
-rw-r--r--  jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js  80
-rw-r--r--  src/mongo/db/process_health/fault_manager_config.h                                  2
-rw-r--r--  src/mongo/db/process_health/progress_monitor.cpp                                   20
-rw-r--r--  src/mongo/db/process_health/test_health_observer.h                                   2
5 files changed, 109 insertions, 16 deletions
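
The core behavioral change is in src/mongo/db/process_health/progress_monitor.cpp: the progress checker now escalates to a crash only for observers whose configured intensity is critical, and skips (with a debug log) any observer that is off or non-critical. The following is a minimal standalone sketch of that rule; the types and names are simplified stand-ins, not MongoDB's actual FaultManager/HealthObserver classes.

#include <chrono>
#include <functional>
#include <string>
#include <vector>

enum class Intensity { kOff, kNonCritical, kCritical };

struct Observer {
    std::string type;
    Intensity intensity;
    bool currentlyRunningHealthCheck;
    std::chrono::steady_clock::time_point lastTimeCheckStarted;
};

// Only observers configured as critical may trigger the crash callback when
// their health check stalls past the liveness deadline.
void progressMonitorCheck(const std::vector<Observer>& observers,
                          std::chrono::milliseconds deadline,
                          const std::function<void(std::string)>& crashCb) {
    const auto now = std::chrono::steady_clock::now();
    for (const auto& o : observers) {
        if (o.intensity != Intensity::kCritical) {
            // Non-critical (or disabled) observers are skipped: a stalled check
            // is logged by the real implementation but never crashes the process.
            continue;
        }
        if (o.currentlyRunningHealthCheck && now - o.lastTimeCheckStarted > deadline) {
            crashCb("Health observer " + o.type + " is still running past its deadline");
        }
    }
}

int main() {
    // A non-critical "test" observer stuck for a minute: no crash is requested.
    std::vector<Observer> observers{
        {"test", Intensity::kNonCritical, true,
         std::chrono::steady_clock::now() - std::chrono::seconds(60)}};
    progressMonitorCheck(observers, std::chrono::milliseconds(5000),
                         [](std::string) { /* would terminate the process */ });
}

The two jstests below exercise exactly these two paths: the existing test keeps the observer critical and expects the mongos to be taken down, while the new test starts it as non-critical and expects the process to survive the stalled check.
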
diff --git a/jstests/sharding/health_monitor/progress_monitor.js b/jstests/sharding/health_monitor/progress_monitor.js
index 4ef70928685..ede3f98d3ae 100644
--- a/jstests/sharding/health_monitor/progress_monitor.js
+++ b/jstests/sharding/health_monitor/progress_monitor.js
@@ -2,7 +2,7 @@
* @tags: [multiversion_incompatible]
*/
const PROGRESS_TIMEOUT_SECONDS = 5;
-const CHECK_PING_SECONDS = 1;
+const monitoringIntervalMs = 500;
(function() {
'use strict';
@@ -10,14 +10,13 @@ const params = {
setParameter: {
healthMonitoringIntensities: tojson({
values: [
- {type: "test", intensity: "non-critical"},
- {type: "ldap", intensity: "off"},
- {type: "dns", intensity: "off"}
+ {type: "test", intensity: "critical"},
]
}),
- healthMonitoringIntervals: tojson({values: [{type: "test", interval: NumberInt(500)}]}),
+ healthMonitoringIntervals:
+ tojson({values: [{type: "test", interval: NumberInt(monitoringIntervalMs)}]}),
progressMonitor:
- tojson({interval: PROGRESS_TIMEOUT_SECONDS, deadline: PROGRESS_TIMEOUT_SECONDS}),
+ tojson({interval: PROGRESS_TIMEOUT_SECONDS * 1000, deadline: PROGRESS_TIMEOUT_SECONDS}),
featureFlagHealthMonitoring: true
}
};
@@ -32,13 +31,13 @@ assert.commandWorked(st.s1.adminCommand(
{"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
// Set the failpoint on one of the mongos's to pause its healthchecks.
+jsTestLog("hang test health observer on " + st.s1.host);
assert.commandWorked(
st.s1.adminCommand({"configureFailPoint": 'hangTestHealthObserver', "mode": "alwaysOn"}));
-sleep(CHECK_PING_SECONDS * 1000);
-// Make sure the failpoint on its own doesn't bring down the server.
-assert.commandWorked(st.s1.adminCommand({"ping": 1}));
+
// Wait for the progress monitor timeout to elapse.
-sleep(PROGRESS_TIMEOUT_SECONDS * 1000);
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+jsTestLog("Done sleeping");
assert.soon(() => {
try {
@@ -59,7 +58,7 @@ assert.soon(() => {
}
sleep(1000);
return false;
-}, "Pinging faulty mongos should fail with network error.", PROGRESS_TIMEOUT_SECONDS * 1000);
+}, "Pinging faulty mongos should fail with network error.");
// Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
st.stop({skipValidatingExitCode: true});
diff --git a/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js b/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js
new file mode 100644
index 00000000000..5b4b582f164
--- /dev/null
+++ b/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js
@@ -0,0 +1,80 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
+const PROGRESS_TIMEOUT_SECONDS = 5;
+(function() {
+'use strict';
+
+const params = {
+ setParameter: {
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "test", intensity: "non-critical"},
+ ]
+ }),
+ healthMonitoringIntervals: tojson({values: [{type: "test", interval: NumberInt(500)}]}),
+ progressMonitor:
+ tojson({interval: PROGRESS_TIMEOUT_SECONDS * 1000, deadline: PROGRESS_TIMEOUT_SECONDS}),
+ featureFlagHealthMonitoring: true
+ }
+};
+let st = new ShardingTest({
+ mongos: [params, params],
+ shards: 1,
+});
+
+function changeObserverIntensity(conn, observer, intensity) {
+ let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+ assert.commandWorked(
+ conn.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
+// After cluster startup, make sure both mongos's are available.
+let pingServers = (servers) => {
+ servers.forEach((server) => {
+ assert.commandWorked(server.adminCommand({"ping": 1}));
+ });
+};
+
+pingServers([st.s0, st.s1]);
+assert.commandWorked(st.s1.adminCommand(
+ {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
+
+jsTestLog("Block test health observer on " + st.s1.host);
+assert.commandWorked(
+ st.s1.adminCommand({"configureFailPoint": 'hangTestHealthObserver', "mode": "alwaysOn"}));
+
+// Wait for the progress monitor timeout to elapse.
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+
+// Both mongos processes should still be alive.
+jsTestLog("Expect mongos processes to still be alive");
+pingServers([st.s0, st.s1]);
+
+jsTestLog("Change observer to critical");
+changeObserverIntensity(st.s1, 'test', 'critical');
+
+// Wait for the progress monitor timeout to elapse.
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+
+jsTestLog("Done sleeping");
+// The first mongos should still be alive.
+pingServers([st.s0]);
+
+let pingNetworkError = false;
+try {
+ pingServers([st.s1]);
+} catch (ex) {
+ if (ex.message.indexOf("network error") >= 0) {
+ print("Got expected error: " + tojson(ex));
+ pingNetworkError = true;
+ } else {
+ jsTestLog(`Unexpected failure: ${ex}`);
+ assert(false, "expected ping on " + st.s1.host + " to fail with network error.");
+ }
+}
+assert(pingNetworkError, "expected " + st.s1.host + " to be killed");
+
+// Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
+st.stop({skipValidatingExitCode: true});
+})();
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 961461111f1..a2c7197920a 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -156,7 +156,7 @@ public:
// If the server persists in TransientFault for more than this duration
// it will move to the ActiveFault state and terminate.
Milliseconds getActiveFaultDuration() const {
- return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load()));
+ return Seconds(mongo::gActiveFaultDurationSecs.load());
}
Milliseconds getPeriodicHealthCheckInterval(FaultFacetType type) const {
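
This hunk, like the one in test_health_observer.h further down, drops an explicit Milliseconds(...) wrapper and returns Seconds directly from a function declared to return Milliseconds, relying on the implicit, lossless widening conversion between duration units. std::chrono behaves the same way; a small illustrative sketch (std::chrono stand-ins rather than mongo::Duration):

#include <chrono>
#include <iostream>

// Returning a coarser duration from a function declared with a finer unit is an
// implicit, lossless conversion (seconds -> milliseconds), so the explicit
// Milliseconds(Seconds(...)) wrapper is redundant.
std::chrono::milliseconds getObserverTimeout() {
    return std::chrono::seconds(30);  // converts to 30000 ms implicitly
}

int main() {
    std::cout << getObserverTimeout().count() << " ms\n";  // prints "30000 ms"
}
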
diff --git a/src/mongo/db/process_health/progress_monitor.cpp b/src/mongo/db/process_health/progress_monitor.cpp
index d1d1ec13f47..c8552ef8f3e 100644
--- a/src/mongo/db/process_health/progress_monitor.cpp
+++ b/src/mongo/db/process_health/progress_monitor.cpp
@@ -32,6 +32,7 @@
#include "mongo/db/process_health/progress_monitor.h"
#include "mongo/db/process_health/fault_manager.h"
+#include "mongo/db/process_health/health_monitoring_server_parameters_gen.h"
#include "mongo/db/process_health/health_observer.h"
#include "mongo/logv2/log.h"
@@ -65,7 +66,16 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
// Check the liveness of every health observer.
for (auto observer : observers) {
const auto stats = observer->getStats();
- if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType())) {
+ const auto observerType = observer->getType();
+ const auto observerIntensity =
+ _faultManager->getConfig().getHealthObserverIntensity(observerType);
+
+ if (observerIntensity != HealthObserverIntensityEnum::kCritical) {
+ LOGV2_DEBUG(6290401,
+ 2,
+ "Skip checking progress of not critical health observer",
+ "observerType"_attr = (str::stream() << observerType),
+ "intensity"_attr = HealthObserverIntensity_serializer(observerIntensity));
continue;
}
@@ -81,6 +91,7 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
if (stats.currentlyRunningHealthCheck &&
now - stats.lastTimeCheckStarted >
_faultManager->getConfig().getPeriodicLivenessDeadline()) {
+
// Crash because this health checker is running for too long.
crashCb(str::stream() << "Health observer " << observer->getType()
<< " is still running since "
@@ -109,10 +120,12 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
// and check again. Note: this should be rare.
for (auto observer : secondPass) {
const auto stats = observer->getStats();
+
if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType()) &&
!stats.currentlyRunningHealthCheck &&
now - stats.lastTimeCheckStarted >
_faultManager->getConfig().getPeriodicLivenessDeadline() * 2) {
+
// Crash because this health checker was never started.
crashCb(str::stream() << "Health observer " << observer->getType()
<< " did not run since "
@@ -129,8 +142,9 @@ void ProgressMonitor::_progressMonitorLoop() {
while (!_terminate.load()) {
progressMonitorCheck(_crashCb);
- const auto interval =
- Microseconds(_faultManager->getConfig().getPeriodicLivenessCheckInterval());
+ const auto interval = duration_cast<Microseconds>(
+ _faultManager->getConfig().getPeriodicLivenessCheckInterval());
+
// Breaking up the sleeping interval to check for `_terminate` more often.
for (int i = 0; i < kSleepsPerInterval && !_terminate.load(); ++i) {
sleepFor(interval / kSleepsPerInterval);
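
The surrounding loop also shows the sleep-splitting pattern: rather than sleeping for the whole liveness interval at once, it sleeps in kSleepsPerInterval slices so a _terminate request is honored within one slice. A standalone sketch of that pattern, with simplified hypothetical names apart from kSleepsPerInterval:

#include <atomic>
#include <chrono>
#include <thread>

constexpr int kSleepsPerInterval = 10;

// Sleep-splitting: divide the interval into slices so the loop notices a
// termination request within one slice instead of after a full interval.
void progressMonitorLoop(std::atomic<bool>& terminate, std::chrono::microseconds interval) {
    while (!terminate.load()) {
        // ... the liveness check (progressMonitorCheck) would run here ...
        for (int i = 0; i < kSleepsPerInterval && !terminate.load(); ++i) {
            std::this_thread::sleep_for(interval / kSleepsPerInterval);
        }
    }
}

int main() {
    std::atomic<bool> terminate{false};
    std::thread monitor(progressMonitorLoop, std::ref(terminate),
                        std::chrono::microseconds(100000));
    std::this_thread::sleep_for(std::chrono::milliseconds(250));
    terminate.store(true);  // observed within one slice of the interval
    monitor.join();
}
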
diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h
index f325ac71534..0c23df7fb42 100644
--- a/src/mongo/db/process_health/test_health_observer.h
+++ b/src/mongo/db/process_health/test_health_observer.h
@@ -46,7 +46,7 @@ protected:
}
Milliseconds getObserverTimeout() const override {
- return Milliseconds(Seconds(30));
+ return Seconds(30);
}
Future<HealthCheckStatus> periodicCheckImpl(