summary | refs | log | tree | commit | diff
path: root/src/mongo/db/process_health
diff options
context:
space:
mode:
author Kshitij Gupta <kshitij.gupta@mongodb.com> 2021-12-13 21:03:13 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-13 21:30:20 +0000
commit 907d0bff8a62b6a2c7360ed96b2d4a780bfde936 (patch)
tree a4066b6126ba0822baf1512cc94cd70a8376fcc8 /src/mongo/db/process_health
parent 09d330def86f4467559cebbf0e91ecf038c296e6 (diff)
download mongo-907d0bff8a62b6a2c7360ed96b2d4a780bfde936.tar.gz
SERVER-59382: Enforce non-critical facets not entering ActiveFault state
Diffstat (limited to 'src/mongo/db/process_health')
-rw-r--r-- src/mongo/db/process_health/fault_manager.cpp 14
-rw-r--r-- src/mongo/db/process_health/fault_manager_config.h 14
-rw-r--r-- src/mongo/db/process_health/fault_manager_test_suite.h 6
-rw-r--r-- src/mongo/db/process_health/fault_state_machine_test.cpp 57
-rw-r--r-- src/mongo/db/process_health/health_monitoring_server_parameters.idl 9
5 files changed, 69 insertions, 31 deletions
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index 0d0618f3da9..c20d06b1e9d 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -200,12 +200,12 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
}
updateWithCheckStatus(HealthCheckStatus(status));
- auto optionalActiveFault = getFaultFacetsContainer();
- if (optionalActiveFault) {
- optionalActiveFault->garbageCollectResolvedFacets();
+ auto optionalFault = getFaultFacetsContainer();
+ if (optionalFault) {
+ optionalFault->garbageCollectResolvedFacets();
}
- if (optionalActiveFault && hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) {
+ if (optionalFault) {
setTransientFaultDeadline(
FaultState::kStartupCheck, FaultState::kStartupCheck, boost::none);
}
@@ -298,8 +298,10 @@ void FaultManager::logCurrentState(FaultState, FaultState newState, const Option
}
void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) {
- _transientFaultDeadline = std::make_unique<TransientFaultDeadline>(
- this, _taskExecutor, _config->getActiveFaultDuration());
+ if (hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) {
+ _transientFaultDeadline = std::make_unique<TransientFaultDeadline>(
+ this, _taskExecutor, _config->getActiveFaultDuration());
+ }
}
void FaultManager::clearTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) {
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 92834acafad..8542dbf9ca5 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -113,8 +113,10 @@ public:
_facetToIntensityMapForTest.insert({type, intensity});
}
+ // If the server persists in TransientFault for more than this duration
+ // it will move to the ActiveFault state and terminate.
Milliseconds getActiveFaultDuration() const {
- return _activeFaultDuration;
+ return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load()));
}
Milliseconds getPeriodicHealthCheckInterval() const {
@@ -140,15 +142,6 @@ public:
_periodicChecksDisabledForTests = true;
}
- void setActiveFaultDurationForTests(Milliseconds duration) {
- _activeFaultDuration = duration;
- }
-
-protected:
- // If the server persists in TransientFault for more than this duration
- // it will move to the ActiveFault state and terminate.
- static inline const auto kActiveFaultDuration = Seconds(120);
-
private:
static HealthMonitoringIntensitiesServerParameter* getHealthObserverIntensities() {
return ServerParameterSet::getGlobal()->get<HealthMonitoringIntensitiesServerParameter>(
@@ -156,7 +149,6 @@ private:
}
bool _periodicChecksDisabledForTests = false;
- Milliseconds _activeFaultDuration = kActiveFaultDuration;
stdx::unordered_map<FaultFacetType, HealthObserverIntensityEnum> _facetToIntensityMapForTest;
};
diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h
index 40667e394ce..581d4c5a651 100644
--- a/src/mongo/db/process_health/fault_manager_test_suite.h
+++ b/src/mongo/db/process_health/fault_manager_test_suite.h
@@ -231,6 +231,12 @@ public:
static inline const Seconds kWaitTimeout{30};
static inline const Milliseconds kSleepTime{1};
+
+ static inline const int kActiveFaultDurationSecs = 1;
+
+ RAIIServerParameterControllerForTest serverParamController{"activeFaultDurationSecs",
+ kActiveFaultDurationSecs};
+
void assertSoon(std::function<bool()> predicate, Milliseconds timeout = kWaitTimeout) {
Timer t;
while (t.elapsed() < timeout) {
diff --git a/src/mongo/db/process_health/fault_state_machine_test.cpp b/src/mongo/db/process_health/fault_state_machine_test.cpp
index c1d3c5efc59..8a4b639d576 100644
--- a/src/mongo/db/process_health/fault_state_machine_test.cpp
+++ b/src/mongo/db/process_health/fault_state_machine_test.cpp
@@ -214,10 +214,6 @@ TEST_F(FaultManagerTest, OneFacetIsResolved) {
DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransientFault, "Fatal") {
RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true};
auto faultFacetType = FaultFacetType::kMock1;
- auto config = test::getConfigWithDisabledPeriodicChecks();
- auto activeFaultDuration = Milliseconds(100);
- config->setActiveFaultDurationForTests(activeFaultDuration);
- resetManager(std::move(config));
registerMockHealthObserver(faultFacetType, [] { return 1.1; });
auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
@@ -227,47 +223,80 @@ DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransient
manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
ASSERT(manager().getFaultState() == FaultState::kTransientFault);
- advanceTime(activeFaultDuration);
+ advanceTime(Seconds(kActiveFaultDurationSecs));
waitForTransitionIntoState(FaultState::kActiveFault);
}
-DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") {
+TEST_F(FaultManagerTest,
+ NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromTransientFault) {
RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true};
auto faultFacetType = FaultFacetType::kMock1;
auto config = test::getConfigWithDisabledPeriodicChecks();
- auto activeFaultDuration = Milliseconds(100);
- config->setActiveFaultDurationForTests(activeFaultDuration);
+ config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical);
resetManager(std::move(config));
registerMockHealthObserver(faultFacetType, [] { return 1.1; });
auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
+ manager().acceptTest(HealthCheckStatus(faultFacetType));
+ ASSERT(manager().getFaultState() == FaultState::kOk);
+
+ manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
+ ASSERT(manager().getFaultState() == FaultState::kTransientFault);
+
+ advanceTime(Seconds(kActiveFaultDurationSecs));
+ // Should be enough time to move to Active fault if we were going to crash.
+ sleepFor(Seconds(1));
+ ASSERT(manager().getFaultState() == FaultState::kTransientFault);
+}
+
+DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") {
+ RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true};
+ auto faultFacetType = FaultFacetType::kMock1;
+
+ registerMockHealthObserver(faultFacetType, [] { return 1.1; });
+ auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
ASSERT(manager().getFaultState() == FaultState::kStartupCheck);
- advanceTime(activeFaultDuration);
+ advanceTime(Seconds(kActiveFaultDurationSecs));
waitForTransitionIntoState(FaultState::kActiveFault);
}
-TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) {
+TEST_F(FaultManagerTest,
+ NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromStartupCheck) {
RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true};
auto faultFacetType = FaultFacetType::kMock1;
auto config = test::getConfigWithDisabledPeriodicChecks();
- auto activeFaultDuration = Milliseconds(100);
- config->setActiveFaultDurationForTests(activeFaultDuration);
+ config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical);
resetManager(std::move(config));
registerMockHealthObserver(faultFacetType, [] { return 1.1; });
auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
+ manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
+ ASSERT(manager().getFaultState() == FaultState::kStartupCheck);
+
+ advanceTime(Seconds(kActiveFaultDurationSecs) * 10);
+ // Should be enough time to move to Active fault if we were going to crash.
+ sleepFor(Seconds(1));
+ ASSERT(manager().getFaultState() == FaultState::kStartupCheck);
+}
+
+TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) {
+ RAIIServerParameterControllerForTest _controller{"featureFlagHealthMonitoring", true};
+ auto faultFacetType = FaultFacetType::kMock1;
+
+ registerMockHealthObserver(faultFacetType, [] { return 1.1; });
+ auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
manager().acceptTest(HealthCheckStatus(faultFacetType));
ASSERT(manager().getFaultState() == FaultState::kOk);
manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
ASSERT(manager().getFaultState() == FaultState::kTransientFault);
- advanceTime(activeFaultDuration / 2);
+ advanceTime(Seconds(kActiveFaultDurationSecs / 2));
manager().acceptTest(HealthCheckStatus(faultFacetType));
- advanceTime(activeFaultDuration);
+ advanceTime(Seconds(kActiveFaultDurationSecs));
ASSERT(manager().getFaultState() == FaultState::kOk);
}
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
index 42adb48c2d3..779c9370d90 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
@@ -68,3 +68,12 @@ server_parameters:
name: "HealthMonitoringIntensitiesServerParameter"
data: "synchronized_value<HealthObserverIntensities>"
override_set: true
+ activeFaultDurationSecs:
+ description: "A server parameter for specifying the duration after which we transition to active fault."
+ set_at: [startup, runtime]
+ cpp_vartype: AtomicWord<int>
+ cpp_varname: gActiveFaultDurationSecs
+ default:
+ expr: 120
+ validator:
+ gte: 0