author    | LaMont Nelson <lamont.nelson@mongodb.com>        | 2022-01-25 03:15:50 +0000
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-25 03:43:11 +0000
commit    | c438071c21ea9241ad451c2af8d5766f07f5debe (patch)
tree      | 5018a66faa35fd7b2c0bec11e8ae64d569fef1ba
parent    | 30b290f315d0c30b44b4f6c746e5e8235c66f7c9 (diff)
download  | mongo-c438071c21ea9241ad451c2af8d5766f07f5debe.tar.gz
SERVER-62904: Fault Manager progress checker should not fault unless the offending health observer is marked critical
5 files changed, 109 insertions, 16 deletions
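In outline, the change gates the progress monitor's crash path on observer intensity: a health observer whose check has stalled past the liveness deadline only takes the process down if that observer is configured as critical; otherwise the check is skipped and logged. The sketch below illustrates that gating idea in isolation. It is a minimal stand-alone example: the type and function names (Observer, Intensity, progressMonitorCheck, crashCb) are stand-ins, not the actual FaultManager/HealthObserver interfaces in the diff.

    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for the real health-observer types.
    enum class Intensity { kOff, kNonCritical, kCritical };

    struct ObserverStats {
        bool currentlyRunningHealthCheck = false;
        std::chrono::steady_clock::time_point lastTimeCheckStarted;
    };

    struct Observer {
        std::string type;
        Intensity intensity;
        ObserverStats stats;
    };

    // Sketch of the progress check: only a *critical* observer that has exceeded
    // the liveness deadline triggers the crash callback; non-critical observers
    // are skipped with a log line instead.
    void progressMonitorCheck(const std::vector<Observer>& observers,
                              std::chrono::milliseconds livenessDeadline,
                              const std::function<void(std::string)>& crashCb) {
        const auto now = std::chrono::steady_clock::now();
        for (const auto& observer : observers) {
            if (observer.intensity != Intensity::kCritical) {
                std::cout << "Skip checking progress of non-critical health observer: "
                          << observer.type << "\n";
                continue;
            }
            if (observer.stats.currentlyRunningHealthCheck &&
                now - observer.stats.lastTimeCheckStarted > livenessDeadline) {
                crashCb("Health observer " + observer.type +
                        " is still running past its liveness deadline");
            }
        }
    }

    int main() {
        // One observer whose health check started 60 seconds ago and is still running.
        std::vector<Observer> observers{
            {"test",
             Intensity::kNonCritical,
             {true, std::chrono::steady_clock::now() - std::chrono::seconds(60)}}};

        // Because the observer is non-critical, the stalled check is skipped and
        // the crash callback never fires.
        progressMonitorCheck(observers, std::chrono::milliseconds(5000), [](std::string cause) {
            std::cout << "would crash: " << cause << "\n";
        });
        return 0;
    }

The new jstest in this commit exercises the same behavior end to end: with the test observer at "non-critical" intensity, a hung health check leaves both mongos processes alive, and only after the observer is switched to "critical" does the progress monitor terminate the affected mongos.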
diff --git a/jstests/sharding/health_monitor/progress_monitor.js b/jstests/sharding/health_monitor/progress_monitor.js
index 4ef70928685..ede3f98d3ae 100644
--- a/jstests/sharding/health_monitor/progress_monitor.js
+++ b/jstests/sharding/health_monitor/progress_monitor.js
@@ -2,7 +2,7 @@
  * @tags: [multiversion_incompatible]
  */
 const PROGRESS_TIMEOUT_SECONDS = 5;
-const CHECK_PING_SECONDS = 1;
+const monitoringIntervalMs = 500;
 (function() {
 'use strict';
 
@@ -10,14 +10,13 @@ const params = {
     setParameter: {
         healthMonitoringIntensities: tojson({
             values: [
-                {type: "test", intensity: "non-critical"},
-                {type: "ldap", intensity: "off"},
-                {type: "dns", intensity: "off"}
+                {type: "test", intensity: "critical"},
             ]
         }),
-        healthMonitoringIntervals: tojson({values: [{type: "test", interval: NumberInt(500)}]}),
+        healthMonitoringIntervals:
+            tojson({values: [{type: "test", interval: NumberInt(monitoringIntervalMs)}]}),
         progressMonitor:
-            tojson({interval: PROGRESS_TIMEOUT_SECONDS, deadline: PROGRESS_TIMEOUT_SECONDS}),
+            tojson({interval: PROGRESS_TIMEOUT_SECONDS * 1000, deadline: PROGRESS_TIMEOUT_SECONDS}),
         featureFlagHealthMonitoring: true
     }
 };
@@ -32,13 +31,13 @@ assert.commandWorked(st.s1.adminCommand(
     {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
 
 // Set the failpoint on one of the mongos's to pause its healthchecks.
+jsTestLog("hang test health observer on " + st.s1.host);
 assert.commandWorked(
     st.s1.adminCommand({"configureFailPoint": 'hangTestHealthObserver', "mode": "alwaysOn"}));
-sleep(CHECK_PING_SECONDS * 1000);
-// Make sure the failpoint on its own doesn't bring down the server.
-assert.commandWorked(st.s1.adminCommand({"ping": 1}));
+
 // Wait for the progress monitor timeout to elapse.
-sleep(PROGRESS_TIMEOUT_SECONDS * 1000);
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+jsTestLog("Done sleeping");
 
 assert.soon(() => {
     try {
@@ -59,7 +58,7 @@ assert.soon(() => {
     }
     sleep(1000);
     return false;
-}, "Pinging faulty mongos should fail with network error.", PROGRESS_TIMEOUT_SECONDS * 1000);
+}, "Pinging faulty mongos should fail with network error.");
 
 // Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
 st.stop({skipValidatingExitCode: true});
diff --git a/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js b/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js
new file mode 100644
index 00000000000..5b4b582f164
--- /dev/null
+++ b/jstests/sharding/health_monitor/progress_monitor_doesnt_crash_on_not_critical.js
@@ -0,0 +1,80 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
+const PROGRESS_TIMEOUT_SECONDS = 5;
+(function() {
+'use strict';
+
+const params = {
+    setParameter: {
+        healthMonitoringIntensities: tojson({
+            values: [
+                {type: "test", intensity: "non-critical"},
+            ]
+        }),
+        healthMonitoringIntervals: tojson({values: [{type: "test", interval: NumberInt(500)}]}),
+        progressMonitor:
+            tojson({interval: PROGRESS_TIMEOUT_SECONDS * 1000, deadline: PROGRESS_TIMEOUT_SECONDS}),
+        featureFlagHealthMonitoring: true
+    }
+};
+let st = new ShardingTest({
+    mongos: [params, params],
+    shards: 1,
+});
+
+function changeObserverIntensity(conn, observer, intensity) {
+    let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+    assert.commandWorked(
+        conn.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
+// After cluster startup, make sure both mongos's are available.
+let pingServers = (servers) => {
+    servers.forEach((server) => {
+        assert.commandWorked(server.adminCommand({"ping": 1}));
+    });
+};
+
+pingServers([st.s0, st.s1]);
+assert.commandWorked(st.s1.adminCommand(
+    {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
+
+jsTestLog("Block test health observer on " + st.s1.host);
+assert.commandWorked(
+    st.s1.adminCommand({"configureFailPoint": 'hangTestHealthObserver', "mode": "alwaysOn"}));
+
+// Wait for the progress monitor timeout to elapse.
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+
+// Servers should be still be alive.
+jsTestLog("Expect monogs processes to still be alive");
+pingServers([st.s0, st.s1]);
+
+jsTestLog("Change observer to critical");
+changeObserverIntensity(st.s1, 'test', 'critical');
+
+// Wait for the progress monitor timeout to elapse.
+sleep(1.1 * PROGRESS_TIMEOUT_SECONDS * 1000);
+
+jsTestLog("Done sleeping");
+// Servers should be still be alive.
+pingServers([st.s0]);
+
+let pingNetworkError = false;
+try {
+    pingServers([st.s1]);
+} catch (ex) {
+    if (ex.message.indexOf("network error") >= 0) {
+        print("Got expected error: " + tojson(ex));
+        pingNetworkError = true;
+    } else {
+        jsTestLog(`Unexpected failure: ${ex}`);
+        assert(false, "expected ping on " + st.s1.host + " to fail with network error.");
+    }
+}
+assert(pingNetworkError, "expected " + st.s1.host + "to be killed");
+
+// Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
+st.stop({skipValidatingExitCode: true});
+})();
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 961461111f1..a2c7197920a 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -156,7 +156,7 @@ public:
     // If the server persists in TransientFault for more than this duration
     // it will move to the ActiveFault state and terminate.
     Milliseconds getActiveFaultDuration() const {
-        return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load()));
+        return Seconds(mongo::gActiveFaultDurationSecs.load());
     }
 
     Milliseconds getPeriodicHealthCheckInterval(FaultFacetType type) const {
diff --git a/src/mongo/db/process_health/progress_monitor.cpp b/src/mongo/db/process_health/progress_monitor.cpp
index d1d1ec13f47..c8552ef8f3e 100644
--- a/src/mongo/db/process_health/progress_monitor.cpp
+++ b/src/mongo/db/process_health/progress_monitor.cpp
@@ -32,6 +32,7 @@
 #include "mongo/db/process_health/progress_monitor.h"
 
 #include "mongo/db/process_health/fault_manager.h"
+#include "mongo/db/process_health/health_monitoring_server_parameters_gen.h"
 #include "mongo/db/process_health/health_observer.h"
 #include "mongo/logv2/log.h"
 
@@ -65,7 +66,16 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
     // Check the liveness of every health observer.
     for (auto observer : observers) {
         const auto stats = observer->getStats();
-        if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType())) {
+        const auto observerType = observer->getType();
+        const auto observerIntensity =
+            _faultManager->getConfig().getHealthObserverIntensity(observerType);
+
+        if (observerIntensity != HealthObserverIntensityEnum::kCritical) {
+            LOGV2_DEBUG(6290401,
+                        2,
+                        "Skip checking progress of not critical health observer",
+                        "observerType"_attr = (str::stream() << observerType),
+                        "intensity"_attr = HealthObserverIntensity_serializer(observerIntensity));
             continue;
         }
 
@@ -81,6 +91,7 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
 
         if (stats.currentlyRunningHealthCheck &&
             now - stats.lastTimeCheckStarted >
                 _faultManager->getConfig().getPeriodicLivenessDeadline()) {
+            // Crash because this health checker is running for too long.
             crashCb(str::stream() << "Health observer " << observer->getType()
                                   << " is still running since "
@@ -109,10 +120,12 @@ void ProgressMonitor::progressMonitorCheck(std::function<void(std::string cause)
 
     // and check again. Note: this should be rare.
     for (auto observer : secondPass) {
         const auto stats = observer->getStats();
+
         if (!_faultManager->getConfig().isHealthObserverEnabled(observer->getType()) &&
             !stats.currentlyRunningHealthCheck &&
             now - stats.lastTimeCheckStarted >
                 _faultManager->getConfig().getPeriodicLivenessDeadline() * 2) {
+            // Crash because this health checker was never started.
             crashCb(str::stream() << "Health observer " << observer->getType()
                                   << " did not run since "
@@ -129,8 +142,9 @@ void ProgressMonitor::_progressMonitorLoop() {
     while (!_terminate.load()) {
         progressMonitorCheck(_crashCb);
 
-        const auto interval =
-            Microseconds(_faultManager->getConfig().getPeriodicLivenessCheckInterval());
+        const auto interval = duration_cast<Microseconds>(
+            _faultManager->getConfig().getPeriodicLivenessCheckInterval());
+
         // Breaking up the sleeping interval to check for `_terminate` more often.
         for (int i = 0; i < kSleepsPerInterval && !_terminate.load(); ++i) {
             sleepFor(interval / kSleepsPerInterval);
diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h
index f325ac71534..0c23df7fb42 100644
--- a/src/mongo/db/process_health/test_health_observer.h
+++ b/src/mongo/db/process_health/test_health_observer.h
@@ -46,7 +46,7 @@ protected:
     }
 
     Milliseconds getObserverTimeout() const override {
-        return Milliseconds(Seconds(30));
+        return Seconds(30);
     }
 
     Future<HealthCheckStatus> periodicCheckImpl(
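A side note on the duration changes above: the header getters now return Seconds(...) directly from functions declared to return Milliseconds, while _progressMonitorLoop switches to an explicit duration_cast<Microseconds>(...). The general rule the commit leans on is that converting a duration to a finer unit is lossless and can be implicit, whereas a conversion that could truncate must be spelled out. The snippet below shows the analogous behavior with std::chrono; it is only an analogy, as MongoDB's Duration type is similar in spirit but not identical to std::chrono::duration.

    #include <chrono>
    #include <iostream>

    int main() {
        using namespace std::chrono;

        // Widening (seconds -> milliseconds) is lossless, so the conversion is implicit.
        seconds activeFaultDuration{120};
        milliseconds asMillis = activeFaultDuration;  // no cast needed

        // Narrowing (milliseconds -> seconds) may truncate, so it requires duration_cast.
        milliseconds checkInterval{1500};
        seconds truncated = duration_cast<seconds>(checkInterval);  // 1s, fraction dropped

        std::cout << asMillis.count() << "ms, " << truncated.count() << "s\n";
        return 0;
    }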