diff options
author | Kshitij Gupta <kshitij.gupta@mongodb.com> | 2021-12-13 21:03:13 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-28 23:18:25 +0000 |
commit | b09900c6482d450cda67787bef1d548411c3aa69 (patch) | |
tree | e0ada13611919f43421191a8517c6186fa80a8b8 | |
parent | a00bb2e633ccf18be6e74c4345b6c7ad6f838c3d (diff) | |
download | mongo-b09900c6482d450cda67787bef1d548411c3aa69.tar.gz |
SERVER-59382: Enforce non-critical facets not entering ActiveFault state
6 files changed, 115 insertions, 31 deletions
diff --git a/jstests/sharding/health_monitor/non_critical_facet.js b/jstests/sharding/health_monitor/non_critical_facet.js new file mode 100644 index 00000000000..073cba2afa1 --- /dev/null +++ b/jstests/sharding/health_monitor/non_critical_facet.js @@ -0,0 +1,48 @@ +/** + * Tests behaviour of non-critical fault facet. + */ +(function() { +'use strict'; +const ACTIVE_FAULT_DURATION_SECS = 1; + +const params = { + setParameter: { + healthMonitoring: tojson({test: "non-critical", ldap: "off", dns: "off"}), + featureFlagHealthMonitoring: true + } +}; + +let st = new ShardingTest({ + mongos: [params], + shards: 1, +}); + +assert.commandWorked( + st.s0.adminCommand({"setParameter": 1, activeFaultDurationSecs: ACTIVE_FAULT_DURATION_SECS})); + +let result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; +assert.eq(result.state, "Ok"); + +// Failpoint returns fault. +assert.commandWorked(st.s0.adminCommand({ + "configureFailPoint": 'testHealthObserver', + "data": {"code": "InternalError", "msg": "test msg"}, + "mode": "alwaysOn" +})); + +assert.soon(() => { + result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; + return result.state == "TransientFault"; +}); + +// Sleep for twice as long as active fault duration (in Millis). +sleep(ACTIVE_FAULT_DURATION_SECS * 2000); + +// Still in transient fault. 
+result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; +assert.eq(result.state, "TransientFault"); +assert( + result.faultInformation.facets.kTestObserver.description.includes("InternalError: test msg")); + +st.stop(); +})(); diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp index 10c43033689..9b52ee34247 100644 --- a/src/mongo/db/process_health/fault_manager.cpp +++ b/src/mongo/db/process_health/fault_manager.cpp @@ -205,12 +205,12 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa } updateWithCheckStatus(HealthCheckStatus(status)); - auto optionalActiveFault = getFaultFacetsContainer(); - if (optionalActiveFault) { - optionalActiveFault->garbageCollectResolvedFacets(); + auto optionalFault = getFaultFacetsContainer(); + if (optionalFault) { + optionalFault->garbageCollectResolvedFacets(); } - if (optionalActiveFault && hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) { + if (optionalFault) { setTransientFaultDeadline( FaultState::kStartupCheck, FaultState::kStartupCheck, boost::none); } @@ -303,8 +303,10 @@ void FaultManager::logCurrentState(FaultState, FaultState newState, const Option } void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) { - _transientFaultDeadline = std::make_unique<TransientFaultDeadline>( - this, _taskExecutor, _config->getActiveFaultDuration()); + if (hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) { + _transientFaultDeadline = std::make_unique<TransientFaultDeadline>( + this, _taskExecutor, _config->getActiveFaultDuration()); + } } void FaultManager::clearTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) { diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h index 92834acafad..8542dbf9ca5 100644 --- a/src/mongo/db/process_health/fault_manager_config.h +++ 
b/src/mongo/db/process_health/fault_manager_config.h @@ -113,8 +113,10 @@ public: _facetToIntensityMapForTest.insert({type, intensity}); } + // If the server persists in TransientFault for more than this duration + // it will move to the ActiveFault state and terminate. Milliseconds getActiveFaultDuration() const { - return _activeFaultDuration; + return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load())); } Milliseconds getPeriodicHealthCheckInterval() const { @@ -140,15 +142,6 @@ public: _periodicChecksDisabledForTests = true; } - void setActiveFaultDurationForTests(Milliseconds duration) { - _activeFaultDuration = duration; - } - -protected: - // If the server persists in TransientFault for more than this duration - // it will move to the ActiveFault state and terminate. - static inline const auto kActiveFaultDuration = Seconds(120); - private: static HealthMonitoringIntensitiesServerParameter* getHealthObserverIntensities() { return ServerParameterSet::getGlobal()->get<HealthMonitoringIntensitiesServerParameter>( @@ -156,7 +149,6 @@ private: } bool _periodicChecksDisabledForTests = false; - Milliseconds _activeFaultDuration = kActiveFaultDuration; stdx::unordered_map<FaultFacetType, HealthObserverIntensityEnum> _facetToIntensityMapForTest; }; diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h index 4296add1ee2..807a771ea06 100644 --- a/src/mongo/db/process_health/fault_manager_test_suite.h +++ b/src/mongo/db/process_health/fault_manager_test_suite.h @@ -126,6 +126,7 @@ class FaultManagerTest : public unittest::Test { public: void setUp() override { feature_flags::gFeatureFlagHealthMonitoring = true; + mongo::gActiveFaultDurationSecs.store(kActiveFaultDurationSecs); HealthObserverRegistration::resetObserverFactoriesForTest(); createServiceContextIfNeeded(); @@ -236,6 +237,9 @@ public: static inline const Seconds kWaitTimeout{30}; static inline const Milliseconds kSleepTime{1}; + 
+ static inline const int kActiveFaultDurationSecs = 1; + void assertSoon(std::function<bool()> predicate, Milliseconds timeout = kWaitTimeout) { Timer t; while (t.elapsed() < timeout) { diff --git a/src/mongo/db/process_health/fault_state_machine_test.cpp b/src/mongo/db/process_health/fault_state_machine_test.cpp index 6ee69968288..0cede3b5f1a 100644 --- a/src/mongo/db/process_health/fault_state_machine_test.cpp +++ b/src/mongo/db/process_health/fault_state_machine_test.cpp @@ -214,10 +214,6 @@ TEST_F(FaultManagerTest, OneFacetIsResolved) { DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransientFault, "Fatal") { feature_flags::gFeatureFlagHealthMonitoring = true; auto faultFacetType = FaultFacetType::kMock1; - auto config = test::getConfigWithDisabledPeriodicChecks(); - auto activeFaultDuration = Milliseconds(100); - config->setActiveFaultDurationForTests(activeFaultDuration); - resetManager(std::move(config)); registerMockHealthObserver(faultFacetType, [] { return 1.1; }); auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); @@ -227,47 +223,80 @@ DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransient manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); ASSERT(manager().getFaultState() == FaultState::kTransientFault); - advanceTime(activeFaultDuration); + advanceTime(Seconds(kActiveFaultDurationSecs)); waitForTransitionIntoState(FaultState::kActiveFault); } -DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") { +TEST_F(FaultManagerTest, + NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromTransientFault) { feature_flags::gFeatureFlagHealthMonitoring = true; auto faultFacetType = FaultFacetType::kMock1; auto config = test::getConfigWithDisabledPeriodicChecks(); - auto activeFaultDuration = Milliseconds(100); - config->setActiveFaultDurationForTests(activeFaultDuration); + config->setIntensityForType(faultFacetType, 
HealthObserverIntensityEnum::kNonCritical); resetManager(std::move(config)); registerMockHealthObserver(faultFacetType, [] { return 1.1; }); auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); + manager().acceptTest(HealthCheckStatus(faultFacetType)); + ASSERT(manager().getFaultState() == FaultState::kOk); + + manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); + ASSERT(manager().getFaultState() == FaultState::kTransientFault); + + advanceTime(Seconds(kActiveFaultDurationSecs)); + // Should be enough time to move to Active fault if we were going to crash. + sleepFor(Seconds(1)); + ASSERT(manager().getFaultState() == FaultState::kTransientFault); +} + +DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") { + feature_flags::gFeatureFlagHealthMonitoring = true; + auto faultFacetType = FaultFacetType::kMock1; + + registerMockHealthObserver(faultFacetType, [] { return 1.1; }); + auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); ASSERT(manager().getFaultState() == FaultState::kStartupCheck); - advanceTime(activeFaultDuration); + advanceTime(Seconds(kActiveFaultDurationSecs)); waitForTransitionIntoState(FaultState::kActiveFault); } -TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) { +TEST_F(FaultManagerTest, + NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromStartupCheck) { feature_flags::gFeatureFlagHealthMonitoring = true; auto faultFacetType = FaultFacetType::kMock1; auto config = test::getConfigWithDisabledPeriodicChecks(); - auto activeFaultDuration = Milliseconds(100); - config->setActiveFaultDurationForTests(activeFaultDuration); + config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical); resetManager(std::move(config)); registerMockHealthObserver(faultFacetType, [] { return 1.1; }); auto initialHealthCheckFuture = 
manager().startPeriodicHealthChecks(); + manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); + ASSERT(manager().getFaultState() == FaultState::kStartupCheck); + + advanceTime(Seconds(kActiveFaultDurationSecs) * 10); + // Should be enough time to move to Active fault if we were going to crash. + sleepFor(Seconds(1)); + ASSERT(manager().getFaultState() == FaultState::kStartupCheck); +} + +TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) { + feature_flags::gFeatureFlagHealthMonitoring = true; + auto faultFacetType = FaultFacetType::kMock1; + + registerMockHealthObserver(faultFacetType, [] { return 1.1; }); + auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); manager().acceptTest(HealthCheckStatus(faultFacetType)); ASSERT(manager().getFaultState() == FaultState::kOk); manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error")); ASSERT(manager().getFaultState() == FaultState::kTransientFault); - advanceTime(activeFaultDuration / 2); + advanceTime(Seconds(kActiveFaultDurationSecs / 2)); manager().acceptTest(HealthCheckStatus(faultFacetType)); - advanceTime(activeFaultDuration); + advanceTime(Seconds(kActiveFaultDurationSecs)); ASSERT(manager().getFaultState() == FaultState::kOk); } diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl index 42adb48c2d3..779c9370d90 100644 --- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl +++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl @@ -68,3 +68,12 @@ server_parameters: name: "HealthMonitoringIntensitiesServerParameter" data: "synchronized_value<HealthObserverIntensities>" override_set: true + activeFaultDurationSecs: + description: "A server parameter for specifying the duration after which we transition to active fault." 
+ set_at: [startup, runtime] + cpp_vartype: AtomicWord<int> + cpp_varname: gActiveFaultDurationSecs + default: + expr: 120 + validator: + gte: 0