From 907d0bff8a62b6a2c7360ed96b2d4a780bfde936 Mon Sep 17 00:00:00 2001 From: Kshitij Gupta Date: Mon, 13 Dec 2021 21:03:13 +0000 Subject: SERVER-59382: Enforce non-critical facets not entering ActiveFault state --- src/mongo/db/process_health/fault_manager.cpp | 14 +++--- src/mongo/db/process_health/fault_manager_config.h | 14 ++---- .../db/process_health/fault_manager_test_suite.h | 6 +++ .../db/process_health/fault_state_machine_test.cpp | 57 ++++++++++++++++------ .../health_monitoring_server_parameters.idl | 9 ++++ 5 files changed, 69 insertions(+), 31 deletions(-) (limited to 'src/mongo/db/process_health') diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp index 0d0618f3da9..c20d06b1e9d 100644 --- a/src/mongo/db/process_health/fault_manager.cpp +++ b/src/mongo/db/process_health/fault_manager.cpp @@ -200,12 +200,12 @@ boost::optional FaultManager::handleStartupCheck(const OptionalMessa } updateWithCheckStatus(HealthCheckStatus(status)); - auto optionalActiveFault = getFaultFacetsContainer(); - if (optionalActiveFault) { - optionalActiveFault->garbageCollectResolvedFacets(); + auto optionalFault = getFaultFacetsContainer(); + if (optionalFault) { + optionalFault->garbageCollectResolvedFacets(); } - if (optionalActiveFault && hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) { + if (optionalFault) { setTransientFaultDeadline( FaultState::kStartupCheck, FaultState::kStartupCheck, boost::none); } @@ -298,8 +298,10 @@ void FaultManager::logCurrentState(FaultState, FaultState newState, const Option } void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) { - _transientFaultDeadline = std::make_unique( - this, _taskExecutor, _config->getActiveFaultDuration()); + if (hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) { + _transientFaultDeadline = std::make_unique( + this, _taskExecutor, _config->getActiveFaultDuration()); + } } void FaultManager::clearTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) { diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h index 92834acafad..8542dbf9ca5 100644 --- a/src/mongo/db/process_health/fault_manager_config.h +++ b/src/mongo/db/process_health/fault_manager_config.h @@ -113,8 +113,10 @@ public: _facetToIntensityMapForTest.insert({type, intensity}); } + // If the server persists in TransientFault for more than this duration + // it will move to the ActiveFault state and terminate. Milliseconds getActiveFaultDuration() const { - return _activeFaultDuration; + return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load())); } Milliseconds getPeriodicHealthCheckInterval() const { @@ -140,15 +142,6 @@ public: _periodicChecksDisabledForTests = true; } - void setActiveFaultDurationForTests(Milliseconds duration) { - _activeFaultDuration = duration; - } - -protected: - // If the server persists in TransientFault for more than this duration - // it will move to the ActiveFault state and terminate. - static inline const auto kActiveFaultDuration = Seconds(120); - private: static HealthMonitoringIntensitiesServerParameter* getHealthObserverIntensities() { return ServerParameterSet::getGlobal()->get( @@ -156,7 +149,6 @@ private: } bool _periodicChecksDisabledForTests = false; - Milliseconds _activeFaultDuration = kActiveFaultDuration; stdx::unordered_map _facetToIntensityMapForTest; }; diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h index 40667e394ce..581d4c5a651 100644 --- a/src/mongo/db/process_health/fault_manager_test_suite.h +++ b/src/mongo/db/process_health/fault_manager_test_suite.h @@ -231,6 +231,12 @@ public: static inline const Seconds kWaitTimeout{30}; static inline const Milliseconds kSleepTime{1}; + + static inline const int kActiveFaultDurationSecs = 1; + + RAIIServerParameterControllerForTest serverParamController{"activeFaultDurationSecs", + kActiveFaultDurationSecs}; + void assertSoon(std::function predicate, Milliseconds timeout = kWaitTimeout) { Timer t; while (t.elapsed() < timeout) { diff --git a/src/mongo/db/process_health/fault_state_machine_test.cpp b/src/mongo/db/process_health/fault_state_machine_test.cpp index c1d3c5efc59..8a4b639d576 100644 --- a/src/mongo/db/process_health/fault_state_machine_test.cpp +++ b/src/mongo/db/process_health/fault_state_machine_test.cpp @@ -214,10 +214,6 @@ TEST_F(FaultManagerTest, OneFacetIsResolved) { DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransientFault, "Fatal") { RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true}; auto faultFacetType = FaultFacetType::kMock1; - auto config = test::getConfigWithDisabledPeriodicChecks(); - auto activeFaultDuration = Milliseconds(100); - config->setActiveFaultDurationForTests(activeFaultDuration); - resetManager(std::move(config)); registerMockHealthObserver(faultFacetType, [] { return 1.1; }); auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); @@ -227,35 +223,68 @@ DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransient manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); ASSERT(manager().getFaultState() == FaultState::kTransientFault); - advanceTime(activeFaultDuration); + advanceTime(Seconds(kActiveFaultDurationSecs)); waitForTransitionIntoState(FaultState::kActiveFault); } -DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") { +TEST_F(FaultManagerTest, + NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromTransientFault) { RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true}; auto faultFacetType = FaultFacetType::kMock1; auto config = test::getConfigWithDisabledPeriodicChecks(); - auto activeFaultDuration = Milliseconds(100); - config->setActiveFaultDurationForTests(activeFaultDuration); + config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical); resetManager(std::move(config)); + registerMockHealthObserver(faultFacetType, [] { return 1.1; }); + auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); + manager().acceptTest(HealthCheckStatus(faultFacetType)); + ASSERT(manager().getFaultState() == FaultState::kOk); + + manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); + ASSERT(manager().getFaultState() == FaultState::kTransientFault); + + advanceTime(Seconds(kActiveFaultDurationSecs)); + // Should be enough time to move to Active fault if we were going to crash. + sleepFor(Seconds(1)); + ASSERT(manager().getFaultState() == FaultState::kTransientFault); +} + +DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") { + RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true}; + auto faultFacetType = FaultFacetType::kMock1; + registerMockHealthObserver(faultFacetType, [] { return 1.1; }); auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); ASSERT(manager().getFaultState() == FaultState::kStartupCheck); - advanceTime(activeFaultDuration); + advanceTime(Seconds(kActiveFaultDurationSecs)); waitForTransitionIntoState(FaultState::kActiveFault); } -TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) { +TEST_F(FaultManagerTest, + NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromStartupCheck) { RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true}; auto faultFacetType = FaultFacetType::kMock1; auto config = test::getConfigWithDisabledPeriodicChecks(); - auto activeFaultDuration = Milliseconds(100); - config->setActiveFaultDurationForTests(activeFaultDuration); + config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical); resetManager(std::move(config)); + registerMockHealthObserver(faultFacetType, [] { return 1.1; }); + auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); + manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); + ASSERT(manager().getFaultState() == FaultState::kStartupCheck); + + advanceTime(Seconds(kActiveFaultDurationSecs) * 10); + // Should be enough time to move to Active fault if we were going to crash. + sleepFor(Seconds(1)); + ASSERT(manager().getFaultState() == FaultState::kStartupCheck); +} + +TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) { + RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true}; + auto faultFacetType = FaultFacetType::kMock1; + registerMockHealthObserver(faultFacetType, [] { return 1.1; }); auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); manager().acceptTest(HealthCheckStatus(faultFacetType)); @@ -264,10 +293,10 @@ TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) { manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); ASSERT(manager().getFaultState() == FaultState::kTransientFault); - advanceTime(activeFaultDuration / 2); + advanceTime(Seconds(kActiveFaultDurationSecs / 2)); manager().acceptTest(HealthCheckStatus(faultFacetType)); - advanceTime(activeFaultDuration); + advanceTime(Seconds(kActiveFaultDurationSecs)); ASSERT(manager().getFaultState() == FaultState::kOk); } diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl index 42adb48c2d3..779c9370d90 100644 --- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl +++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl @@ -68,3 +68,12 @@ server_parameters: name: "HealthMonitoringIntensitiesServerParameter" data: "synchronized_value" override_set: true + activeFaultDurationSecs: + description: "A server parameter for specifying the duration after which we transition to active fault." + set_at: [startup, runtime] + cpp_vartype: AtomicWord + cpp_varname: gActiveFaultDurationSecs + default: + expr: 120 + validator: + gte: 0 -- cgit v1.2.1