author    | LaMont Nelson <lamont.nelson@mongodb.com>        | 2022-01-25 03:15:50 +0000
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-25 03:43:11 +0000
commit    | c438071c21ea9241ad451c2af8d5766f07f5debe (patch)
tree      | 5018a66faa35fd7b2c0bec11e8ae64d569fef1ba
parent    | 30b290f315d0c30b44b4f6c746e5e8235c66f7c9 (diff)
download  | mongo-c438071c21ea9241ad451c2af8d5766f07f5debe.tar.gz
SERVER-62904: Fault Manager progress checker should not fault unless the offending health observer is marked critical
5 files changed, 109 insertions, 16 deletions
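In outline, the change gates the progress monitor's crash path on observer intensity: a health observer whose check has stalled past the liveness deadline only takes the process down if that observer is configured as critical; otherwise the check is skipped and logged. The sketch below illustrates that gating idea in isolation. It is a minimal stand-alone example: the type and function names (Observer, Intensity, progressMonitorCheck, crashCb) are stand-ins, not the actual FaultManager/HealthObserver interfaces in the diff.

    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for the real health-observer types.
    enum class Intensity { kOff, kNonCritical, kCritical };

    struct ObserverStats {
        bool currentlyRunningHealthCheck = false;
        std::chrono::steady_clock::time_point lastTimeCheckStarted;
    };

    struct Observer {
        std::string type;
        Intensity intensity;
        ObserverStats stats;
    };

    // Sketch of the progress check: only a *critical* observer that has exceeded
    // the liveness deadline triggers the crash callback; non-critical observers
    // are skipped with a log line instead.
    void progressMonitorCheck(const std::vector<Observer>& observers,
                              std::chrono::milliseconds livenessDeadline,
                              const std::function<void(std::string)>& crashCb) {
        const auto now = std::chrono::steady_clock::now();
        for (const auto& observer : observers) {
            if (observer.intensity != Intensity::kCritical) {
                std::cout << "Skip checking progress of non-critical health observer: "
                          << observer.type << "\n";
                continue;
            }
            if (observer.stats.currentlyRunningHealthCheck &&
                now - observer.stats.lastTimeCheckStarted > livenessDeadline) {
                crashCb("Health observer " + observer.type +
                        " is still running past its liveness deadline");
            }
        }
    }

    int main() {
        // One observer whose health check started 60 seconds ago and is still running.
        std::vector<Observer> observers{
            {"test",
             Intensity::kNonCritical,
             {true, std::chrono::steady_clock::now() - std::chrono::seconds(60)}}};

        // Because the observer is non-critical, the stalled check is skipped and
        // the crash callback never fires.
        progressMonitorCheck(observers, std::chrono::milliseconds(5000), [](std::string cause) {
            std::cout << "would crash: " << cause << "\n";
        });
        return 0;
    }

The new jstest in this commit exercises the same behavior end to end: with the test observer at "non-critical" intensity, a hung health check leaves both mongos processes alive, and only after the observer is switched to "critical" does the progress monitor terminate the affected mongos.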
diff --git a/jstests/sharding/health_monitor/progress_monitor.js b/jstests/sharding/health_monitor/progress_monitor.js
index 4ef70928685..ede3f98d3ae 100644
--- a/jstests/sharding/health_monitor/progress_monitor.js
+++ b/jstests/sharding/health_monitor/progress_monitor.js
@@ -2,7 +2,7 @@
  * @tags: [multiversion_incompatible]
  */
 const PROGRESS_TIMEOUT_SECONDS = 5;
-const CHECK_PING_SECONDS = 1;
+const monitoringIntervalMs = 500;
 (function() {
 'use strict';
 
@@ -10,14 +10,13 @@ const params = {
     setParameter: {
         healthMonitoringIntensities: tojson({
             values: [
-                {type: "test", intensity: "non-critical"},
-                {type: "ldap", intensity: "off"},
-                {type: "dns", intensity: "off"}
+                {type: "test", intensity: "critical"},
             ]
         }),
-        healthMonitoringIntervals: tojson({values: [{type: "test", interval: NumberInt(500)}]}),
+        healthMonitoringIntervals:
+            tojson({values: [{type: "test", interval: NumberInt(monitoringIntervalMs)}]}),
         progressMonitor:
-            tojson({interval: PROGRESS_TIMEOUT_SECONDS, deadline: PROGRESS_TIMEOUT_SECONDS}),
+            tojson({interval: PROGRESS_TIMEOUT_SECONDS * 1000, deadline: PROGRESS_TIMEOUT_SECONDS}),
         featureFlagHealthMonitoring: true
     }
 };
@@ -32,13 +31,13 @@ assert.commandWorked(st.s1.adminCommand(
     {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
 
 // Set the failpoint on one of the mongos's to pause its healthchecks.
+jsTestLog("hang test health observer on " + st.s1.host);
 assert.commandWorked(
     st.s1.adminCommand({"configureFailPoint": 'hangTestHealthObserver', "mode": "alwaysOn"}));
-sleep(CHECK_PING_SECONDS * 1000);
-// Make sure the failpoint on its own doesn't bring down the server.
-assert.commandWorked(st.s1.adminCommand({"ping": 1}));
+
 // Wait for the progress monitor timeout to elapse.
-sleep(PROGRESS_TIMEOUT_SECONDS * 1000);
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+jsTestLog("Done sleeping");
 
 assert.soon(() => {
     try {
@@ -59,7 +58,7 @@ assert.soon(() => {
     }
     sleep(1000);
     return false;
-}, "Pinging faulty mongos should fail with network error.", PROGRESS_TIMEOUT_SECONDS * 1000);
+}, "Pinging faulty mongos should fail with network error.");
 
 // Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
 st.stop({skipValidatingExitCode: true});
diff --git a/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js b/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js
new file mode 100644
index 00000000000..5b4b582f164
--- /dev/null
+++ b/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js
@@ -0,0 +1,80 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
+const PROGRESS_TIMEOUT_SECONDS = 5;
+(function() {
+'use strict';
+
+const params = {
+    setParameter: {
+        healthMonitoringIntensities: tojson({
+            values: [
+                {type: "test", intensity: "non-critical"},
+            ]
+        }),
+        healthMonitoringIntervals: tojson({values: [{type: "test", interval: NumberInt(500)}]}),
+        progressMonitor:
+            tojson({interval: PROGRESS_TIMEOUT_SECONDS * 1000, deadline: PROGRESS_TIMEOUT_SECONDS}),
+        featureFlagHealthMonitoring: true
+    }
+};
+let st = new ShardingTest({
+    mongos: [params, params],
+    shards: 1,
+});
+
+function changeObserverIntensity(conn, observer, intensity) {
+    let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+    assert.commandWorked(
+        conn.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
+// After cluster startup, make sure both mongos's are available.
+let pingServers = (servers) => {
+    servers.forEach((server) => {
+        assert.commandWorked(server.adminCommand({"ping": 1}));
+    });
+};
+
+pingServers([st.s0, st.s1]);
+assert.commandWorked(st.s1.adminCommand(
+    {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
+
+jsTestLog("Block test health observer on " + st.s1.host);
+assert.commandWorked(
+    st.s1.adminCommand({"configureFailPoint": 'hangTestHealthObserver', "mode": "alwaysOn"}));
+
+// Wait for the progress monitor timeout to elapse.
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+
+// Servers should be still be alive.
+jsTestLog("Expect monogs processes to still be alive");
+pingServers([st.s0, st.s1]);
+
+jsTestLog("Change observer to critical");
+changeObserverIntensity(st.s1, 'test', 'critical');
+
+// Wait for the progress monitor timeout to elapse.
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+
+jsTestLog("Done sleeping");
+// Servers should be still be alive.
+pingServers([st.s0]);
+
+let pingNetworkError = false;
+try {
+    pingServers([st.s1]);
+} catch (ex) {
+    if (ex.message.indexOf("network error") >= 0) {
+        print("Got expected error: " + tojson(ex));
+        pingNetworkError = true;
+    } else {
+        jsTestLog(`Unexpected failure: ${ex}`);
+        assert(false, "expected ping on " + st.s1.host + " to fail with network error.");
+    }
+}
+assert(pingNetworkError, "expected " + st.s1.host + "to be killed");
+
+// Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
+st.stop({skipValidatingExitCode: true});
+})();
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 961461111f1..a2c7197920a 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -156,7 +156,7 @@ public:
     // If the server persists in TransientFault for more than this duration
     // it will move to the ActiveFault state and terminate.
     Milliseconds getActiveFaultDuration() const {
-        return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load()));
+        return Seconds(mongo::gActiveFaultDurationSecs.load());
     }
 
     Milliseconds getPeriodicHealthCheckInterval(FaultFacetType type) const {
diff --git a/src/mongo/db/process_health/progress_monitor.cpp b/src/mongo/db/process_health/progress_monitor.cpp
index d1d1ec13f47..c8552ef8f3e 100644
--- a/src/mongo/db/process_health/progress_monitor.cpp
+++ b/src/mongo/db/process_health/progress_monitor.cpp
@@ -32,6 +32,7 @@
 #include "mongo/db/process_health/progress_monitor.h"
 
 #include "mongo/db/process_health/fault_manager.h"
+#include "mongo/db/process_health/health_monitoring_server_parameters_gen.h"
 #include "mongo/db/process_health/health_observer.h"
 #include "mongo/logv2/log.h"
 
@@ -65,7 +66,16 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
     // Check the liveness of every health observer.
     for (auto observer : observers) {
         const auto stats = observer->getStats();
-        if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType())) {
+        const auto observerType = observer->getType();
+        const auto observerIntensity =
+            _faultManager->getConfig().getHealthObserverIntensity(observerType);
+
+        if (observerIntensity != HealthObserverIntensityEnum::kCritical) {
+            LOGV2_DEBUG(6290401,
+                        2,
+                        "Skip checking progress of not critical health observer",
+                        "observerType"_attr = (str::stream() << observerType),
+                        "intensity"_attr = HealthObserverIntensity_serializer(observerIntensity));
             continue;
         }
 
@@ -81,6 +91,7 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
 
         if (stats.currentlyRunningHealthCheck &&
             now - stats.lastTimeCheckStarted >
                 _faultManager->getConfig().getPeriodicLivenessDeadline()) {
+            // Crash because this health checker is running for too long.
             crashCb(str::stream() << "Health observer " << observer->getType()
                                   << " is still running since "
@@ -109,10 +120,12 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
 
     // and check again. Note: this should be rare.
     for (auto observer : secondPass) {
         const auto stats = observer->getStats();
+
         if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType()) &&
             !stats.currentlyRunningHealthCheck &&
             now - stats.lastTimeCheckStarted >
                 _faultManager->getConfig().getPeriodicLivenessDeadline() * 2) {
+            // Crash because this health checker was never started.
             crashCb(str::stream() << "Health observer " << observer->getType()
                                   << " did not run since "
@@ -129,8 +142,9 @@ void ProgressMonitor::_progressMonitorLoop() {
     while (!_terminate.load()) {
         progressMonitorCheck(_crashCb);
 
-        const auto interval =
-            Microseconds(_faultManager->getConfig().getPeriodicLivenessCheckInterval());
+        const auto interval = duration_cast<Microseconds>(
+            _faultManager->getConfig().getPeriodicLivenessCheckInterval());
+
         // Breaking up the sleeping interval to check for `_terminate` more often.
         for (int i = 0; i < kSleepsPerInterval && !_terminate.load(); ++i) {
             sleepFor(interval / kSleepsPerInterval);
diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h
index f325ac71534..0c23df7fb42 100644
--- a/src/mongo/db/process_health/test_health_observer.h
+++ b/src/mongo/db/process_health/test_health_observer.h
@@ -46,7 +46,7 @@ protected:
     }
 
     Milliseconds getObserverTimeout() const override {
-        return Milliseconds(Seconds(30));
+        return Seconds(30);
     }
 
     Future<HealthCheckStatus> periodicCheckImpl(
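A side note on the duration changes above: the header getters now return Seconds(...) directly from functions declared to return Milliseconds, while _progressMonitorLoop switches to an explicit duration_cast<Microseconds>(...). The general rule the commit leans on is that converting a duration to a finer unit is lossless and can be implicit, whereas a conversion that could truncate must be spelled out. The snippet below shows the analogous behavior with std::chrono; it is only an analogy, as MongoDB's Duration type is similar in spirit but not identical to std::chrono::duration.

    #include <chrono>
    #include <iostream>

    int main() {
        using namespace std::chrono;

        // Widening (seconds -> milliseconds) is lossless, so the conversion is implicit.
        seconds activeFaultDuration{120};
        milliseconds asMillis = activeFaultDuration;  // no cast needed

        // Narrowing (milliseconds -> seconds) may truncate, so it requires duration_cast.
        milliseconds checkInterval{1500};
        seconds truncated = duration_cast<seconds>(checkInterval);  // 1s, fraction dropped

        std::cout << asMillis.count() << "ms, " << truncated.count() << "s\n";
        return 0;
    }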