summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kshitij Gupta <kshitij.gupta@mongodb.com> 2021-12-13 21:03:13 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-28 23:18:25 +0000
commit: b09900c6482d450cda67787bef1d548411c3aa69 (patch)
tree: e0ada13611919f43421191a8517c6186fa80a8b8
parent: a00bb2e633ccf18be6e74c4345b6c7ad6f838c3d (diff)
download: mongo-b09900c6482d450cda67787bef1d548411c3aa69.tar.gz
SERVER-59382: Enforce non-critical facets not entering ActiveFault state
-rw-r--r-- jstests/sharding/health_monitor/non_critical_facet.js 48
-rw-r--r-- src/mongo/db/process_health/fault_manager.cpp 14
-rw-r--r-- src/mongo/db/process_health/fault_manager_config.h 14
-rw-r--r-- src/mongo/db/process_health/fault_manager_test_suite.h 4
-rw-r--r-- src/mongo/db/process_health/fault_state_machine_test.cpp 57
-rw-r--r-- src/mongo/db/process_health/health_monitoring_server_parameters.idl 9
6 files changed, 115 insertions, 31 deletions
diff --git a/jstests/sharding/health_monitor/non_critical_facet.js b/jstests/sharding/health_monitor/non_critical_facet.js
new file mode 100644
index 00000000000..073cba2afa1
--- /dev/null
+++ b/jstests/sharding/health_monitor/non_critical_facet.js
@@ -0,0 +1,48 @@
+/**
+ * Tests behaviour of non-critical fault facet.
+ */
+(function() {
+'use strict';
+const ACTIVE_FAULT_DURATION_SECS = 1;
+
+const params = {
+ setParameter: {
+ healthMonitoring: tojson({test: "non-critical", ldap: "off", dns: "off"}),
+ featureFlagHealthMonitoring: true
+ }
+};
+
+let st = new ShardingTest({
+ mongos: [params],
+ shards: 1,
+});
+
+assert.commandWorked(
+ st.s0.adminCommand({"setParameter": 1, activeFaultDurationSecs: ACTIVE_FAULT_DURATION_SECS}));
+
+let result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+assert.eq(result.state, "Ok");
+
+// Failpoint returns fault.
+assert.commandWorked(st.s0.adminCommand({
+ "configureFailPoint": 'testHealthObserver',
+ "data": {"code": "InternalError", "msg": "test msg"},
+ "mode": "alwaysOn"
+}));
+
+assert.soon(() => {
+ result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+ return result.state == "TransientFault";
+});
+
+// Sleep for twice as long as active fault duration (in Millis).
+sleep(ACTIVE_FAULT_DURATION_SECS * 2000);
+
+// Still in transient fault.
+result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+assert.eq(result.state, "TransientFault");
+assert(
+ result.faultInformation.facets.kTestObserver.description.includes("InternalError: test msg"));
+
+st.stop();
+})();
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index 10c43033689..9b52ee34247 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -205,12 +205,12 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
}
updateWithCheckStatus(HealthCheckStatus(status));
- auto optionalActiveFault = getFaultFacetsContainer();
- if (optionalActiveFault) {
- optionalActiveFault->garbageCollectResolvedFacets();
+ auto optionalFault = getFaultFacetsContainer();
+ if (optionalFault) {
+ optionalFault->garbageCollectResolvedFacets();
}
- if (optionalActiveFault && hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) {
+ if (optionalFault) {
setTransientFaultDeadline(
FaultState::kStartupCheck, FaultState::kStartupCheck, boost::none);
}
@@ -303,8 +303,10 @@ void FaultManager::logCurrentState(FaultState, FaultState newState, const Option
}
void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) {
- _transientFaultDeadline = std::make_unique<TransientFaultDeadline>(
- this, _taskExecutor, _config->getActiveFaultDuration());
+ if (hasCriticalFacet(_fault.get()) && !_transientFaultDeadline) {
+ _transientFaultDeadline = std::make_unique<TransientFaultDeadline>(
+ this, _taskExecutor, _config->getActiveFaultDuration());
+ }
}
void FaultManager::clearTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) {
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 92834acafad..8542dbf9ca5 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -113,8 +113,10 @@ public:
_facetToIntensityMapForTest.insert({type, intensity});
}
+ // If the server persists in TransientFault for more than this duration
+ // it will move to the ActiveFault state and terminate.
Milliseconds getActiveFaultDuration() const {
- return _activeFaultDuration;
+ return Milliseconds(Seconds(mongo::gActiveFaultDurationSecs.load()));
}
Milliseconds getPeriodicHealthCheckInterval() const {
@@ -140,15 +142,6 @@ public:
_periodicChecksDisabledForTests = true;
}
- void setActiveFaultDurationForTests(Milliseconds duration) {
- _activeFaultDuration = duration;
- }
-
-protected:
- // If the server persists in TransientFault for more than this duration
- // it will move to the ActiveFault state and terminate.
- static inline const auto kActiveFaultDuration = Seconds(120);
-
private:
static HealthMonitoringIntensitiesServerParameter* getHealthObserverIntensities() {
return ServerParameterSet::getGlobal()->get<HealthMonitoringIntensitiesServerParameter>(
@@ -156,7 +149,6 @@ private:
}
bool _periodicChecksDisabledForTests = false;
- Milliseconds _activeFaultDuration = kActiveFaultDuration;
stdx::unordered_map<FaultFacetType, HealthObserverIntensityEnum> _facetToIntensityMapForTest;
};
diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h
index 4296add1ee2..807a771ea06 100644
--- a/src/mongo/db/process_health/fault_manager_test_suite.h
+++ b/src/mongo/db/process_health/fault_manager_test_suite.h
@@ -126,6 +126,7 @@ class FaultManagerTest : public unittest::Test {
public:
void setUp() override {
feature_flags::gFeatureFlagHealthMonitoring = true;
+ mongo::gActiveFaultDurationSecs.store(kActiveFaultDurationSecs);
HealthObserverRegistration::resetObserverFactoriesForTest();
createServiceContextIfNeeded();
@@ -236,6 +237,9 @@ public:
static inline const Seconds kWaitTimeout{30};
static inline const Milliseconds kSleepTime{1};
+
+ static inline const int kActiveFaultDurationSecs = 1;
+
void assertSoon(std::function<bool()> predicate, Milliseconds timeout = kWaitTimeout) {
Timer t;
while (t.elapsed() < timeout) {
diff --git a/src/mongo/db/process_health/fault_state_machine_test.cpp b/src/mongo/db/process_health/fault_state_machine_test.cpp
index 6ee69968288..0cede3b5f1a 100644
--- a/src/mongo/db/process_health/fault_state_machine_test.cpp
+++ b/src/mongo/db/process_health/fault_state_machine_test.cpp
@@ -214,10 +214,6 @@ TEST_F(FaultManagerTest, OneFacetIsResolved) {
DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransientFault, "Fatal") {
feature_flags::gFeatureFlagHealthMonitoring = true;
auto faultFacetType = FaultFacetType::kMock1;
- auto config = test::getConfigWithDisabledPeriodicChecks();
- auto activeFaultDuration = Milliseconds(100);
- config->setActiveFaultDurationForTests(activeFaultDuration);
- resetManager(std::move(config));
registerMockHealthObserver(faultFacetType, [] { return 1.1; });
auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
@@ -227,47 +223,80 @@ DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromTransient
manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
ASSERT(manager().getFaultState() == FaultState::kTransientFault);
- advanceTime(activeFaultDuration);
+ advanceTime(Seconds(kActiveFaultDurationSecs));
waitForTransitionIntoState(FaultState::kActiveFault);
}
-DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") {
+TEST_F(FaultManagerTest,
+ NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromTransientFault) {
feature_flags::gFeatureFlagHealthMonitoring = true;
auto faultFacetType = FaultFacetType::kMock1;
auto config = test::getConfigWithDisabledPeriodicChecks();
- auto activeFaultDuration = Milliseconds(100);
- config->setActiveFaultDurationForTests(activeFaultDuration);
+ config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical);
resetManager(std::move(config));
registerMockHealthObserver(faultFacetType, [] { return 1.1; });
auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
+ manager().acceptTest(HealthCheckStatus(faultFacetType));
+ ASSERT(manager().getFaultState() == FaultState::kOk);
+
+ manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
+ ASSERT(manager().getFaultState() == FaultState::kTransientFault);
+
+ advanceTime(Seconds(kActiveFaultDurationSecs));
+ // Should be enough time to move to Active fault if we were going to crash.
+ sleepFor(Seconds(1));
+ ASSERT(manager().getFaultState() == FaultState::kTransientFault);
+}
+
+DEATH_TEST_F(FaultManagerTest, TransitionsToActiveFaultAfterTimeoutFromStartupCheck, "Fatal") {
+ feature_flags::gFeatureFlagHealthMonitoring = true;
+ auto faultFacetType = FaultFacetType::kMock1;
+
+ registerMockHealthObserver(faultFacetType, [] { return 1.1; });
+ auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
ASSERT(manager().getFaultState() == FaultState::kStartupCheck);
- advanceTime(activeFaultDuration);
+ advanceTime(Seconds(kActiveFaultDurationSecs));
waitForTransitionIntoState(FaultState::kActiveFault);
}
-TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) {
+TEST_F(FaultManagerTest,
+ NonCriticalFacetDoesNotTransitionToActiveFaultAfterTimeoutFromStartupCheck) {
feature_flags::gFeatureFlagHealthMonitoring = true;
auto faultFacetType = FaultFacetType::kMock1;
auto config = test::getConfigWithDisabledPeriodicChecks();
- auto activeFaultDuration = Milliseconds(100);
- config->setActiveFaultDurationForTests(activeFaultDuration);
+ config->setIntensityForType(faultFacetType, HealthObserverIntensityEnum::kNonCritical);
resetManager(std::move(config));
registerMockHealthObserver(faultFacetType, [] { return 1.1; });
auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
+ manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
+ ASSERT(manager().getFaultState() == FaultState::kStartupCheck);
+
+ advanceTime(Seconds(kActiveFaultDurationSecs) * 10);
+ // Should be enough time to move to Active fault if we were going to crash.
+ sleepFor(Seconds(1));
+ ASSERT(manager().getFaultState() == FaultState::kStartupCheck);
+}
+
+TEST_F(FaultManagerTest, DoesNotTransitionToActiveFaultIfResolved) {
+ feature_flags::gFeatureFlagHealthMonitoring = true;
+ auto faultFacetType = FaultFacetType::kMock1;
+
+ registerMockHealthObserver(faultFacetType, [] { return 1.1; });
+ auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
manager().acceptTest(HealthCheckStatus(faultFacetType));
ASSERT(manager().getFaultState() == FaultState::kOk);
manager().acceptTest(HealthCheckStatus(faultFacetType, 1.0, "error"));
ASSERT(manager().getFaultState() == FaultState::kTransientFault);
- advanceTime(activeFaultDuration / 2);
+ advanceTime(Seconds(kActiveFaultDurationSecs / 2));
manager().acceptTest(HealthCheckStatus(faultFacetType));
- advanceTime(activeFaultDuration);
+ advanceTime(Seconds(kActiveFaultDurationSecs));
ASSERT(manager().getFaultState() == FaultState::kOk);
}
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
index 42adb48c2d3..779c9370d90 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
@@ -68,3 +68,12 @@ server_parameters:
name: "HealthMonitoringIntensitiesServerParameter"
data: "synchronized_value<HealthObserverIntensities>"
override_set: true
+ activeFaultDurationSecs:
+ description: "A server parameter for specifying the duration after which we transition to active fault."
+ set_at: [startup, runtime]
+ cpp_vartype: AtomicWord<int>
+ cpp_varname: gActiveFaultDurationSecs
+ default:
+ expr: 120
+ validator:
+ gte: 0