From 1497a7d78706a502b98eeb7fa121bf8ec311029c Mon Sep 17 00:00:00 2001 From: Kshitij Gupta Date: Tue, 1 Feb 2022 21:50:12 +0000 Subject: SERVER-63013: When health observer returns status failure it will never run again. (cherry picked from commit 458895d6629e5bc700c2d6f8d84c8f91c58eab94) --- jstests/sharding/health_monitor/status_failure.js | 70 ++++++++++++++++++++++ .../config_server_health_observer.cpp | 4 +- .../db/process_health/health_observer_base.cpp | 24 ++++---- src/mongo/db/process_health/health_observer_base.h | 5 +- src/mongo/db/process_health/health_observer_mock.h | 2 +- .../db/process_health/test_health_observer.cpp | 7 ++- src/mongo/db/process_health/test_health_observer.h | 2 +- 7 files changed, 93 insertions(+), 21 deletions(-) create mode 100644 jstests/sharding/health_monitor/status_failure.js diff --git a/jstests/sharding/health_monitor/status_failure.js b/jstests/sharding/health_monitor/status_failure.js new file mode 100644 index 00000000000..9d65180b2f7 --- /dev/null +++ b/jstests/sharding/health_monitor/status_failure.js @@ -0,0 +1,70 @@ +/** + * Tests behavior of fault manager when health observer returns status failure. + * + * @tags: [multiversion_incompatible] + */ +(function() { +'use strict'; + +const kWaitForCompletedChecksCount = 12; +const kWaitForPassedChecksCount = 8; + +const params = { + setParameter: { + healthMonitoringIntensities: tojson({ + values: [ + {type: "test", intensity: "critical"}, + {type: "ldap", intensity: "off"}, + {type: "dns", intensity: "off"} + ] + }), + featureFlagHealthMonitoring: true + } +}; + +let st = new ShardingTest({ + mongos: [params], + shards: 1, +}); + +let result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; +assert.eq(result.state, "Ok"); + +const checkServerStats = function() { + while (true) { + // Failpoint returns status failure. + assert.commandWorked(st.s0.adminCommand( + {"configureFailPoint": 'statusFailureTestHealthObserver', "mode": "alwaysOn"})); + + assert.soon(() => { + result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; + return result.state == "TransientFault" && + result.faultInformation.facets.testObserver.description.includes( + "Status failure in test health observer"); + }); + + assert.commandWorked(st.s0.adminCommand( + {"configureFailPoint": 'statusFailureTestHealthObserver', "mode": "off"})); + + assert.soon(() => { + result = + assert.commandWorked(st.s0.adminCommand({serverStatus: 1, health: {details: true}})) + .health; + return result.state == "Ok"; + }); + + print(`Server status: ${tojson(result)}`); + // Wait for: at least kWaitForPassedChecksCount checks completed. + if (result.testObserver.totalChecks >= kWaitForCompletedChecksCount && + result.testObserver.totalChecks - result.testObserver.totalChecksWithFailure >= + kWaitForPassedChecksCount) { + break; + } + sleep(1000); + } +}; + +checkServerStats(); + +st.stop(); +})(); diff --git a/src/mongo/db/process_health/config_server_health_observer.cpp b/src/mongo/db/process_health/config_server_health_observer.cpp index e981e9a268b..37f1a8644d4 100644 --- a/src/mongo/db/process_health/config_server_health_observer.cpp +++ b/src/mongo/db/process_health/config_server_health_observer.cpp @@ -85,7 +85,7 @@ public: * previous one is filled, thus synchronization can be relaxed. */ Future periodicCheckImpl( - PeriodicHealthCheckContext&& periodicCheckContext) override; + PeriodicHealthCheckContext&& periodicCheckContext) noexcept override; private: // Collects the results of one check. @@ -144,7 +144,7 @@ ConfigServerHealthObserver::ConfigServerHealthObserver(ServiceContext* svcCtx) : HealthObserverBase(svcCtx) {} Future ConfigServerHealthObserver::periodicCheckImpl( - PeriodicHealthCheckContext&& periodicCheckContext) { + PeriodicHealthCheckContext&& periodicCheckContext) noexcept { // The chain is not capturing 'this' for the case the network call outlives the observer. return _checkImpl(std::move(periodicCheckContext)) .then([type = getType()](CheckResult result) mutable -> Future { diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp index ca032799018..fe3509835df 100644 --- a/src/mongo/db/process_health/health_observer_base.cpp +++ b/src/mongo/db/process_health/health_observer_base.cpp @@ -42,16 +42,14 @@ HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx) : _svcCtx(svcCtx), _rand(PseudoRandom(SecureRandom().nextInt64())) {} SharedSemiFuture HealthObserverBase::periodicCheck( - std::shared_ptr taskExecutor, CancellationToken token) { + std::shared_ptr taskExecutor, CancellationToken token) noexcept { // If we have reached here, the intensity of this health observer must not be off { - auto lk = stdx::lock_guard(_mutex); - if (_currentlyRunningHealthCheck) { - return _deadlineFuture->get(); - } LOGV2_DEBUG(6007902, 2, "Start periodic health check", "observerType"_attr = getType()); const auto now = _svcCtx->getPreciseClockSource()->now(); + + auto lk = stdx::lock_guard(_mutex); _lastTimeTheCheckWasRun = now; _currentlyRunningHealthCheck = true; } @@ -60,21 +58,19 @@ SharedSemiFuture HealthObserverBase::periodicCheck( taskExecutor, periodicCheckImpl({token, taskExecutor}) .onCompletion([this](StatusWith status) { - if (!status.isOK()) { - return status; - } - - auto healthStatus = status.getValue(); - const auto now = _svcCtx->getPreciseClockSource()->now(); + auto lk = stdx::lock_guard(_mutex); ++_completedChecksCount; invariant(_currentlyRunningHealthCheck); - if (!HealthCheckStatus::isResolved(healthStatus.getSeverity())) { - ++_completedChecksWithFaultCount; - } _currentlyRunningHealthCheck = false; _lastTimeCheckCompleted = now; + + if (!status.isOK() || + !HealthCheckStatus::isResolved(status.getValue().getSeverity())) { + ++_completedChecksWithFaultCount; + } + return status; }), getObserverTimeout()); diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h index e2d53ad68c4..18f24eb8540 100644 --- a/src/mongo/db/process_health/health_observer_base.h +++ b/src/mongo/db/process_health/health_observer_base.h @@ -62,7 +62,8 @@ public: // Implements the common logic for periodic checks. // Every observer should implement periodicCheckImpl() for specific tests. SharedSemiFuture periodicCheck( - std::shared_ptr taskExecutor, CancellationToken token) override; + std::shared_ptr taskExecutor, + CancellationToken token) noexcept override; HealthCheckStatus makeHealthyStatus() const; static HealthCheckStatus makeHealthyStatusWithType(FaultFacetType type); @@ -90,7 +91,7 @@ protected: * @return The result of a complete health check */ virtual Future periodicCheckImpl( - PeriodicHealthCheckContext&& periodicCheckContext) = 0; + PeriodicHealthCheckContext&& periodicCheckContext) noexcept = 0; HealthObserverLivenessStats getStatsLocked(WithLock) const; diff --git a/src/mongo/db/process_health/health_observer_mock.h b/src/mongo/db/process_health/health_observer_mock.h index 644cfa3c247..80051dc5eac 100644 --- a/src/mongo/db/process_health/health_observer_mock.h +++ b/src/mongo/db/process_health/health_observer_mock.h @@ -67,7 +67,7 @@ protected: } Future periodicCheckImpl( - PeriodicHealthCheckContext&& periodicCheckContext) override { + PeriodicHealthCheckContext&& periodicCheckContext) noexcept override { auto completionPf = makePromiseFuture(); diff --git a/src/mongo/db/process_health/test_health_observer.cpp b/src/mongo/db/process_health/test_health_observer.cpp index 41b2087372c..ece51a11165 100644 --- a/src/mongo/db/process_health/test_health_observer.cpp +++ b/src/mongo/db/process_health/test_health_observer.cpp @@ -39,11 +39,16 @@ namespace process_health { MONGO_FAIL_POINT_DEFINE(hangTestHealthObserver); MONGO_FAIL_POINT_DEFINE(testHealthObserver); MONGO_FAIL_POINT_DEFINE(badConfigTestHealthObserver); +MONGO_FAIL_POINT_DEFINE(statusFailureTestHealthObserver); Future TestHealthObserver::periodicCheckImpl( - PeriodicHealthCheckContext&& periodicCheckContext) { + PeriodicHealthCheckContext&& periodicCheckContext) noexcept { LOGV2_DEBUG(5936801, 2, "Test health observer executing"); hangTestHealthObserver.pauseWhileSet(); + if (statusFailureTestHealthObserver.shouldFail()) { + return Status(ErrorCodes::BadValue, "Status failure in test health observer"); + } + auto result = Future::makeReady(makeHealthyStatus()); testHealthObserver.executeIf( diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h index 0c23df7fb42..428d57f8e9d 100644 --- a/src/mongo/db/process_health/test_health_observer.h +++ b/src/mongo/db/process_health/test_health_observer.h @@ -50,7 +50,7 @@ protected: } Future periodicCheckImpl( - PeriodicHealthCheckContext&& periodicCheckContext) override; + PeriodicHealthCheckContext&& periodicCheckContext) noexcept override; bool isConfigured() const override; }; -- cgit v1.2.1