author     Andrew Shuvalov <andrew.shuvalov@mongodb.com>     2021-12-17 20:42:57 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2022-02-15 17:02:28 +0000
commit     813417e4502fa496632725f9ef383c705f8e68fc (patch)
tree       df0e8461ee0811bdef8b305a041ff41e6341505c
parent     df259513aaee609d0ba6610dc9dc1780e976ce27 (diff)
download   mongo-813417e4502fa496632725f9ef383c705f8e68fc.tar.gz
SERVER-59368 runtime change of intensities values
(cherry picked from commit 39aa4089e095c2b12375108bbfb428b5fa38696c)
16 files changed, 450 insertions, 83 deletions
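Editor's note on what this commit enables: the `healthMonitoringIntensities` server parameter moves from a flat map of observer names to an array of `{type, intensity}` settings that can be changed at runtime. The sketch below mirrors the calls made in the updated jstests; it is illustrative only and assumes a running `ShardingTest` handle `st` started with `featureFlagHealthMonitoring: true` (the handle and startup flags are assumptions, not part of this diff).

```js
// Minimal sketch of the new parameter shape, modeled on the updated jstests.
// Assumes an existing ShardingTest `st` with featureFlagHealthMonitoring enabled.

// Old shape (removed by this commit): a flat map of observer -> intensity, e.g.
//   healthMonitoringIntensities: {dns: "off", ldap: "critical", test: "off"}

// New shape: an array of {type, intensity} settings under `values`.
assert.commandWorked(st.s0.adminCommand({
    setParameter: 1,
    healthMonitoringIntensities:
        {values: [{type: "dns", intensity: "non-critical"}, {type: "ldap", intensity: "off"}]}
}));

// Readers now look a type up in the `values` array instead of reading a field.
let result = assert.commandWorked(
    st.s0.adminCommand({getParameter: 1, healthMonitoringIntensities: 1}));
let dnsSetting = result.healthMonitoringIntensities.values.find(v => v.type === "dns");
assert.eq(dnsSetting.intensity, "non-critical");
```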
diff --git a/buildscripts/resmokeconfig/suites/sharding.yml b/buildscripts/resmokeconfig/suites/sharding.yml
index 4c657d09492..e77949a9757 100644
--- a/buildscripts/resmokeconfig/suites/sharding.yml
+++ b/buildscripts/resmokeconfig/suites/sharding.yml
@@ -6,6 +6,7 @@ selector:
   - jstests/sharding/change_streams/*.js
   - jstests/sharding/query/*.js
   - jstests/sharding/load_balancer_support/*.js
+  - jstests/sharding/health_monitor/*.js
 
 executor:
   archive:
diff --git a/jstests/sharding/health_monitor/non_critical_facet.js b/jstests/sharding/health_monitor/non_critical_facet.js
index 073cba2afa1..6bedf771242 100644
--- a/jstests/sharding/health_monitor/non_critical_facet.js
+++ b/jstests/sharding/health_monitor/non_critical_facet.js
@@ -1,5 +1,7 @@
 /**
- * Tests behaviour of non-critical fault facet.
+ * Tests behavior of non-critical fault facet.
+ *
+ * @tags: [multiversion_incompatible]
  */
 (function() {
 'use strict';
@@ -7,7 +9,13 @@ const ACTIVE_FAULT_DURATION_SECS = 1;
 
 const params = {
     setParameter: {
-        healthMonitoring: tojson({test: "non-critical", ldap: "off", dns: "off"}),
+        healthMonitoringIntensities: tojson({
+            values: [
+                {type: "test", intensity: "non-critical"},
+                {type: "ldap", intensity: "off"},
+                {type: "dns", intensity: "off"}
+            ]
+        }),
         featureFlagHealthMonitoring: true
     }
 };
diff --git a/jstests/sharding/health_monitor/observer_reenabled.js b/jstests/sharding/health_monitor/observer_reenabled.js
new file mode 100644
index 00000000000..ac02ae1f45b
--- /dev/null
+++ b/jstests/sharding/health_monitor/observer_reenabled.js
@@ -0,0 +1,72 @@
+/**
+ * Turning off health observer during transient fault removes the associated fault facet and
+ * transitions back to Ok.
+ *
+ * @tags: [multiversion_incompatible]
+ */
+(function() {
+'use strict';
+
+const params = {
+    setParameter: {
+        healthMonitoringIntensities: tojson({
+            values: [
+                {type: "test", intensity: "off"},
+                {type: "ldap", intensity: "off"},
+                {type: "dns", intensity: "off"}
+            ]
+        }),
+        featureFlagHealthMonitoring: true,
+        logComponentVerbosity: tojson({processHealth: {verbosity: 4}})
+    }
+};
+
+let st = new ShardingTest({
+    mongos: [params],
+    shards: 1,
+});
+
+function healthStatus() {
+    return assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+}
+
+function waitForState(state) {
+    assert.soon(() => {
+        let result = healthStatus();
+        jsTestLog(tojson(result));
+        return result.state === state;
+    });
+}
+
+function changeObserverIntensity(observer, intensity) {
+    let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+    assert.commandWorked(
+        st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
+jsTestLog("Wait for initial health checks to complete.");
+waitForState("Ok");
+
+jsTestLog("Test observer signals fault");
+assert.commandWorked(st.s0.adminCommand({
+    "configureFailPoint": 'testHealthObserver',
+    "data": {"code": "InternalError", "msg": "test msg"},
+    "mode": "alwaysOn"
+}));
+changeObserverIntensity("test", "critical");
+
+waitForState("TransientFault");
+
+jsTestLog("Turn off observer during transient fault");
+changeObserverIntensity("test", "off");
+
+waitForState("Ok");
+
+jsTestLog("Turn on observer after fault resolution");
+changeObserverIntensity("test", "critical");
+waitForState("TransientFault");
+
+jsTestLog("Test was successful");
+
+st.stop();
+})();
diff --git a/jstests/sharding/health_monitor/parameters.js b/jstests/sharding/health_monitor/parameters.js
index 7f0a66be44a..44af7744ce5 100644
--- a/jstests/sharding/health_monitor/parameters.js
+++ b/jstests/sharding/health_monitor/parameters.js
@@ -1,19 +1,39 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
+
 (function() {
 'use strict';
 
 let CUSTOM_INTERVAL = 1337;
 let CUSTOM_DEADLINE = 5;
 
+// TODO(SERVER-59368):re-enable
+if (CUSTOM_INTERVAL > 0)
+    return;
+
 var st = new ShardingTest({
     mongos: [
         {
             setParameter: {
-                healthMonitoringIntensities: tojson({dns: "off", ldap: "critical", test: "off"}),
+                healthMonitoringIntensities: tojson({
+                    values: [
+                        {type: "dns", intensity: "off"},
+                        {type: "ldap", intensity: "critical"},
+                        {type: "test", intensity: "off"}
+                    ]
+                }),
             }
         },
         {
             setParameter: {
-                healthMonitoringIntensities: tojson({dns: "off", ldap: "off"}),
+                healthMonitoringIntensities: tojson({
+                    values: [
+                        {type: "dns", intensity: "off"},
+                        {type: "ldap", intensity: "off"},
+                        {type: "test", intensity: "off"}
+                    ]
+                }),
                 progressMonitor: tojson({interval: CUSTOM_INTERVAL, deadline: CUSTOM_DEADLINE}),
                 healthMonitoringIntervals: tojson({test: CUSTOM_INTERVAL})
             }
@@ -24,20 +44,37 @@ var st = new ShardingTest({
 
 // Intensity parameter
 let result = st.s0.adminCommand({"getParameter": 1, "healthMonitoringIntensities": 1});
-assert.eq(result.healthMonitoringIntensities.dns, "off");
-assert.eq(result.healthMonitoringIntensities.ldap, "critical");
+let getIntensity = (param_value, type) => {
+    let intensities = result.healthMonitoringIntensities.values;
+    for (var i = 0; i < intensities.length; i++) {
+        if (intensities[i].type === type)
+            return intensities[i].intensity;
+    }
+};
 
-assert.commandFailed(
-    st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {dns: "INVALID"}}));
-assert.commandFailed(
-    st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: {invalid: "off"}}));
+assert.eq(getIntensity(result, "dns"), "off");
+assert.eq(getIntensity(result, "ldap"), "critical");
+
+assert.commandFailed(st.s0.adminCommand({
+    "setParameter": 1,
+    healthMonitoringIntensities: {values: [{type: "dns", intensity: "INVALID"}]}
+}));
+assert.commandFailed(st.s0.adminCommand({
+    "setParameter": 1,
+    healthMonitoringIntensities: {values: [{type: "invalid", intensity: "off"}]}
+}));
 
-assert.commandWorked(st.s0.adminCommand(
-    {"setParameter": 1, healthMonitoringIntensities: {dns: 'non-critical', ldap: 'off'}}));
+jsTestLog('Test setting 2 intensities');
+assert.commandWorked(st.s0.adminCommand({
+    "setParameter": 1,
+    healthMonitoringIntensities:
+        {values: [{type: "dns", intensity: 'non-critical'}, {type: "ldap", intensity: 'off'}]}
+}));
 result =
     assert.commandWorked(st.s0.adminCommand({"getParameter": 1, healthMonitoringIntensities: 1}));
-assert.eq(result.healthMonitoringIntensities.dns, "non-critical");
-assert.eq(result.healthMonitoringIntensities.ldap, "off");
+
+assert.eq(getIntensity(result, "dns"), "non-critical");
+assert.eq(getIntensity(result, "ldap"), "off");
 
 // Interval parameter
 result = st.s1.adminCommand({"getParameter": 1, "healthMonitoringIntervals": 1});
diff --git a/jstests/sharding/health_monitor/progress_monitor.js b/jstests/sharding/health_monitor/progress_monitor.js
index fc243da4401..687cf3729d5 100644
--- a/jstests/sharding/health_monitor/progress_monitor.js
+++ b/jstests/sharding/health_monitor/progress_monitor.js
@@ -1,3 +1,6 @@
+/*
+ * @tags: [multiversion_incompatible]
+ */
 const PROGRESS_TIMEOUT_SECONDS = 5;
 const CHECK_PING_SECONDS = 1;
 (function() {
@@ -5,9 +8,16 @@ const CHECK_PING_SECONDS = 1;
 
 const params = {
     setParameter: {
-        healthMonitoringIntensities: tojson({test: "non-critical", ldap: "off", dns: "off"}),
+        healthMonitoringIntensities: tojson({
+            values: [
+                {type: "test", intensity: "non-critical"},
+                {type: "ldap", intensity: "off"},
+                {type: "dns", intensity: "off"}
+            ]
+        }),
         healthMonitoringIntervals: tojson({test: 500}),
-        progressMonitor: tojson({deadline: PROGRESS_TIMEOUT_SECONDS}),
+        progressMonitor:
+            tojson({interval: PROGRESS_TIMEOUT_SECONDS, deadline: PROGRESS_TIMEOUT_SECONDS}),
         featureFlagHealthMonitoring: true
     }
 };
@@ -18,6 +28,8 @@ let st = new ShardingTest({
 // After cluster startup, make sure both mongos's are available.
 assert.commandWorked(st.s0.adminCommand({"ping": 1}));
 assert.commandWorked(st.s1.adminCommand({"ping": 1}));
+assert.commandWorked(st.s1.adminCommand(
+    {"setParameter": 1, logComponentVerbosity: {processHealth: {verbosity: 2}}}));
 
 // Set the failpoint on one of the mongos's to pause its healthchecks.
 assert.commandWorked(
@@ -40,9 +52,13 @@ assert.soon(() => {
         if (e.message.indexOf("network error") >= 0) {
             return true;
         } else {
-            throw (e);
+            jsTestLog(`Failure: ${e}`);
+            sleep(1000);
+            return false;
         }
     }
+    sleep(1000);
+    return false;
 }, "Pinging faulty mongos should fail with network error.", PROGRESS_TIMEOUT_SECONDS * 1000);
 
 // Don't validate exit codes, since a mongos will exit on its own with a non-zero exit code.
diff --git a/jstests/sharding/health_monitor/server_status_health.js b/jstests/sharding/health_monitor/server_status_health.js
index b224420b2f5..77c1c68485d 100644
--- a/jstests/sharding/health_monitor/server_status_health.js
+++ b/jstests/sharding/health_monitor/server_status_health.js
@@ -1,12 +1,26 @@
 /**
  * Tests server status has correct fault/facet information.
+ *
+ * @tags: [multiversion_incompatible]
  */
 (function() {
 'use strict';
 
+function changeObserverIntensity(observer, intensity) {
+    let paramValue = {"values": [{"type": observer, "intensity": intensity}]};
+    assert.commandWorked(
+        st.s0.adminCommand({"setParameter": 1, healthMonitoringIntensities: paramValue}));
+}
+
 const params = {
     setParameter: {
-        healthMonitoring: tojson({test: "off", ldap: "off", dns: "off"}),
+        healthMonitoringIntensities: tojson({
+            values: [
+                {type: "test", intensity: "off"},
+                {type: "ldap", intensity: "off"},
+                {type: "dns", intensity: "off"}
+            ]
+        }),
         featureFlagHealthMonitoring: true
     }
 };
@@ -23,8 +37,7 @@ print(tojson(result));
 assert.eq(result.state, "Ok");
 assert(result.enteredStateAtTime);
 
-assert.commandWorked(st.s0.adminCommand(
-    {"setParameter": 1, healthMonitoring: {test: "critical", dns: 'off', ldap: 'off'}}));
+changeObserverIntensity('test', 'critical');
 
 // Check server status after test health observer enabled and failpoint returns fault.
 assert.commandWorked(st.s0.adminCommand({
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index be18f1e1795..dd6de2d4afc 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -88,6 +88,73 @@ void FaultManager::set(ServiceContext* svcCtx, std::unique_ptr<FaultManager> new
     faultManager = std::move(newFaultManager);
 }
 
+
+bool FaultManager::isInitialized() {
+    stdx::lock_guard lock(_stateMutex);
+    return _initialized;
+}
+
+
+// Start health checks if observer turned on via setParamater. Cleanup if the observer is turned
+// off.
+void FaultManager::healthMonitoringIntensitiesUpdated(HealthObserverIntensities oldValue,
+                                                      HealthObserverIntensities newValue) {
+    if (!hasGlobalServiceContext())
+        return;
+
+    auto manager = FaultManager::get(getGlobalServiceContext());
+    if (manager && manager->isInitialized()) {
+        auto cancellationToken = manager->_managerShuttingDownCancellationSource.token();
+        auto findByType =
+            [](const auto& values,
+               HealthObserverTypeEnum type) -> boost::optional<HealthObserverIntensitySetting> {
+            if (!values) {
+                return boost::none;
+            }
+            auto it = std::find_if(values->begin(),
+                                   values->end(),
+                                   [type](const HealthObserverIntensitySetting& setting) {
+                                       return setting.getType() == type;
+                                   });
+            if (it != values->end()) {
+                return *it;
+            }
+            return boost::none;
+        };
+
+        auto optionalNewValues = newValue.getValues();
+        if (!optionalNewValues) {
+            return;  // Nothing was updated.
+        }
+        for (auto& setting : *optionalNewValues) {
+            auto oldSetting = findByType(oldValue.getValues(), setting.getType());
+            if (!oldSetting) {
+                continue;
+            }
+            if (cancellationToken.isCanceled()) {
+                break;
+            }
+            auto oldIntensity = oldSetting->getIntensity();
+            auto newIntensity = setting.getIntensity();
+            if (oldIntensity != newIntensity) {
+                if (oldIntensity == HealthObserverIntensityEnum::kOff) {
+                    // off -> {critical, non-critical}
+                    if (auto* observer =
+                            manager->getHealthObserver(toFaultFacetType(setting.getType()));
+                        observer != nullptr) {
+                        manager->healthCheck(observer, cancellationToken);
+                    }
+                } else if (newIntensity == HealthObserverIntensityEnum::kOff) {
+                    // {critical, non-critical} -> off
+                    // Resolve any faults for this observer with a synthetic health check result.
+                    auto successfulHealthCheckResult = HealthCheckStatus(setting.getType());
+                    manager->accept(successfulHealthCheckResult);
+                }
+            }
+        }
+    }
+}
+
 FaultManager::TransientFaultDeadline::TransientFaultDeadline(
     FaultManager* faultManager,
     std::shared_ptr<executor::TaskExecutor> executor,
@@ -190,13 +257,15 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
     });
 
-    auto lk = stdx::lock_guard(_stateMutex);
-    logMessageReceived(state(), status);
+    {
+        auto lk = stdx::lock_guard(_stateMutex);
+        logMessageReceived(state(), status);
 
-    if (status.isActiveFault()) {
-        _healthyObservations.erase(status.getType());
-    } else {
-        _healthyObservations.insert(status.getType());
+        if (status.isActiveFault()) {
+            _healthyObservations.erase(status.getType());
+        } else {
+            _healthyObservations.insert(status.getType());
+        }
     }
 
     updateWithCheckStatus(HealthCheckStatus(status));
@@ -210,8 +279,6 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
             FaultState::kStartupCheck, FaultState::kStartupCheck, boost::none);
     }
 
-    // If the whole fault becomes resolved, garbage collect it
-    // with proper locking.
     std::shared_ptr<FaultInternal> faultToDelete;
     {
         auto lk = stdx::lock_guard(_mutex);
@@ -220,6 +287,7 @@ boost::optional<FaultState> FaultManager::handleStartupCheck(const OptionalMessa
         }
     }
 
+    auto lk = stdx::lock_guard(_stateMutex);
     if (activeObserversTypes == _healthyObservations) {
         return FaultState::kOk;
     }
@@ -230,8 +298,10 @@ boost::optional<FaultState> FaultManager::handleOk(const OptionalMessageType& me
     invariant(message);
 
     HealthCheckStatus status = message.get();
-    auto lk = stdx::lock_guard(_stateMutex);
-    logMessageReceived(state(), status);
+    {
+        auto lk = stdx::lock_guard(_stateMutex);
+        logMessageReceived(state(), status);
+    }
 
     if (!_config->isHealthObserverEnabled(status.getType())) {
         return boost::none;
@@ -252,8 +322,11 @@ boost::optional<FaultState> FaultManager::handleTransientFault(const OptionalMes
     }
 
     HealthCheckStatus status = message.get();
-    auto lk = stdx::lock_guard(_stateMutex);
-    logMessageReceived(state(), status);
+
+    {
+        auto lk = stdx::lock_guard(_stateMutex);
+        logMessageReceived(state(), status);
+    }
 
     updateWithCheckStatus(HealthCheckStatus(status));
 
@@ -264,6 +337,7 @@ boost::optional<FaultState> FaultManager::handleTransientFault(const OptionalMes
 
     // If the whole fault becomes resolved, garbage collect it
    // with proper locking.
+    auto lk = stdx::lock_guard(_mutex);
     if (_fault && _fault->getFacets().empty()) {
         _fault.reset();
         return FaultState::kOk;
@@ -324,21 +398,25 @@ void FaultManager::schedulePeriodicHealthCheckThread() {
         return;
     }
 
-    if (getActiveHealthObservers().size() == 0) {
-        LOGV2_DEBUG(5936511, 2, "No active health observers are configured.");
+    auto observers = getActiveHealthObservers();
+    if (observers.size() == 0) {
+        LOGV2(5936511, "No active health observers are configured.");
         setState(FaultState::kOk, HealthCheckStatus(FaultFacetType::kSystem));
+        return;
     }
 
-    auto observers = getHealthObservers();
+    str::stream listOfActiveObservers;
     for (auto observer : observers) {
-        LOGV2_DEBUG(
-            59365, 1, "starting health observer", "observerType"_attr = observer->getType());
+        LOGV2_DEBUG(5936501,
+                    1,
+                    "starting health observer",
+                    "observerType"_attr = str::stream() << observer->getType());
+        listOfActiveObservers << observer->getType() << " ";
 
-        // TODO (SERVER-59368): The system should properly handle a health checker being turned
-        // on/off
         auto token = _managerShuttingDownCancellationSource.token();
         healthCheck(observer, token);
     }
+    LOGV2(5936804, "Health observers started", "detail"_attr = listOfActiveObservers);
 }
 
 FaultManager::~FaultManager() {
@@ -452,8 +530,9 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token
     }
 
     uassert(5936101,
-            fmt::format("Failed to initialize periodic health check work. Reason: {}",
-                        periodicThreadCbHandleStatus.getStatus().codeString()),
+            str::stream() << "Failed to schedule periodic health check for "
+                          << observer->getType() << ": "
+                          << periodicThreadCbHandleStatus.getStatus().codeString(),
             periodicThreadCbHandleStatus.isOK());
 }
 
@@ -478,14 +557,12 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token
 
     // If health observer is disabled, then do nothing and schedule another run (health observer may
     // become enabled).
-    // TODO (SERVER-59368): The system should properly handle a health checker being turned on/off
     if (!_config->isHealthObserverEnabled(observer->getType())) {
         schedulerCb();
         return;
    }
 
-    // Run asynchronous health check. When complete, check for state transition (and perform if
-    // necessary). Then schedule the next run.
+    // Run asynchronous health check. Send output to the state machine. Schedule next run.
     auto healthCheckFuture = observer->periodicCheck(*this, _taskExecutor, token)
                                  .thenRunOn(_taskExecutor)
                                  .onCompletion([this, acceptNotOKStatus, schedulerCb](
@@ -564,13 +641,15 @@ void FaultManager::_init() {
     _progressMonitor = std::make_unique<ProgressMonitor>(this, _svcCtx, _crashCb);
 
     auto lk2 = stdx::lock_guard(_stateMutex);
-    LOGV2(5956701,
-          "Instantiated health observers, periodic health checking starts",
-          "managerState"_attr = state(),
-          "observersCount"_attr = _observers.size());
+    _initialized = true;
+    LOGV2_DEBUG(5956701,
+                1,
+                "Instantiated health observers",
+                "managerState"_attr = str::stream() << state(),
+                "observersCount"_attr = _observers.size());
 }
 
-std::vector<HealthObserver*> FaultManager::getHealthObservers() {
+std::vector<HealthObserver*> FaultManager::getHealthObservers() const {
     std::vector<HealthObserver*> result;
     stdx::lock_guard<Latch> lk(_mutex);
     result.reserve(_observers.size());
@@ -581,7 +660,7 @@ std::vector<HealthObserver*> FaultManager::getHealthObservers() {
     return result;
 }
 
-std::vector<HealthObserver*> FaultManager::getActiveHealthObservers() {
+std::vector<HealthObserver*> FaultManager::getActiveHealthObservers() const {
     auto allObservers = getHealthObservers();
     std::vector<HealthObserver*> result;
     result.reserve(allObservers.size());
@@ -593,6 +672,16 @@ std::vector<HealthObserver*> FaultManager::getActiveHealthObservers() {
     return result;
 }
 
+HealthObserver* FaultManager::getHealthObserver(FaultFacetType type) const {
+    stdx::lock_guard<Latch> lk(_mutex);
+    auto observerIt = std::find_if(
+        _observers.begin(), _observers.end(), [type](auto& o) { return o->getType() == type; });
+    if (observerIt != _observers.end()) {
+        return (*observerIt).get();
+    }
+    return nullptr;
+}
+
 void FaultManager::progressMonitorCheckForTests(std::function<void(std::string cause)> crashCb) {
     _progressMonitor->progressMonitorCheck(crashCb);
 }
diff --git a/src/mongo/db/process_health/fault_manager.h b/src/mongo/db/process_health/fault_manager.h
index 597cfb54112..c627c18ee1d 100644
--- a/src/mongo/db/process_health/fault_manager.h
+++ b/src/mongo/db/process_health/fault_manager.h
@@ -93,12 +93,19 @@ public:
     // specific flags.
     SharedSemiFuture<void> startPeriodicHealthChecks();
 
+    bool isInitialized();
+
+
     static FaultManager* get(ServiceContext* svcCtx);
 
     // Replace the FaultManager for the 'svcCtx'. This functionality
     // is exposed for testing and initial bootstrap.
     static void set(ServiceContext* svcCtx, std::unique_ptr<FaultManager> newFaultManager);
 
+    // Signals that the intensity for a health observer has been updated.
+    static void healthMonitoringIntensitiesUpdated(HealthObserverIntensities oldValue,
+                                                   HealthObserverIntensities newValue);
+
     // Returns the current fault state for the server.
     FaultState getFaultState() const;
 
@@ -107,7 +114,7 @@ public:
 
     // All observers remain valid for the manager lifetime, thus returning
     // just pointers is safe, as long as they are used while manager exists.
-    std::vector<HealthObserver*> getHealthObservers();
+    std::vector<HealthObserver*> getHealthObservers() const;
 
     // Gets the aggregate configuration for all process health environment.
     FaultManagerConfig getConfig() const;
@@ -117,7 +124,8 @@ public:
 
 protected:
     // Returns all health observers not configured as Off
-    std::vector<HealthObserver*> getActiveHealthObservers();
+    std::vector<HealthObserver*> getActiveHealthObservers() const;
+    HealthObserver* getHealthObserver(FaultFacetType type) const;
 
     // Runs a particular health observer. Then attempts to transition states. Then schedules next
     // run.
@@ -164,6 +172,7 @@ private:
     mutable Mutex _stateMutex =
         MONGO_MAKE_LATCH(HierarchicalAcquisitionLevel(0), "FaultManager::_stateMutex");
 
+    bool _initialized = false;
     Date_t _lastTransitionTime;
 
     // Responsible for transitioning the state of FaultManager to ActiveFault after a
diff --git a/src/mongo/db/process_health/fault_manager_config.cpp b/src/mongo/db/process_health/fault_manager_config.cpp
index 0243cee192a..c4a7442c67c 100644
--- a/src/mongo/db/process_health/fault_manager_config.cpp
+++ b/src/mongo/db/process_health/fault_manager_config.cpp
@@ -57,5 +57,19 @@ std::ostream& operator<<(std::ostream& os, const FaultState& state) {
     return os << sb.stringData();
 }
 
+// TODO(SERVER-62125): remove this conversion and use idl type everywhere
+FaultFacetType toFaultFacetType(HealthObserverTypeEnum type) {
+    switch (type) {
+        case HealthObserverTypeEnum::kLdap:
+            return FaultFacetType::kLdap;
+        case HealthObserverTypeEnum::kDns:
+            return FaultFacetType::kDns;
+        case HealthObserverTypeEnum::kTest:
+            return FaultFacetType::kTestObserver;
+        default:
+            MONGO_UNREACHABLE;
+    }
+}
+
 }  // namespace process_health
 }  // namespace mongo
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index d1d9007f29a..db218853da0 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -67,6 +67,9 @@ enum class FaultFacetType { kSystem, kMock1, kMock2, kTestObserver, kLdap, kDns
 static const StringData FaultFacetTypeStrings[] = {
     "kSystem", "kMock1", "kMock2", "kTestObserver", "kLdap", "kDns"};
 
+FaultFacetType toFaultFacetType(HealthObserverTypeEnum type);
+
+
 static const StringData FaultFacetType_serializer(const FaultFacetType value) {
     return FaultFacetTypeStrings[static_cast<int>(value)];
 }
@@ -89,11 +92,43 @@ public:
 
     HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) {
         auto intensities = _getHealthObserverIntensities();
-        if (type == FaultFacetType::kMock1 && _facetToIntensityMapForTest.contains(type)) {
-            return _facetToIntensityMapForTest.at(type);
-        }
-        return _getPropertyByType(
-            type, &intensities->_data, HealthObserverIntensityEnum::kCritical);
+
+        auto toObserverType = [](FaultFacetType type) -> boost::optional<HealthObserverTypeEnum> {
+            switch (type) {
+                case FaultFacetType::kLdap:
+                    return HealthObserverTypeEnum::kLdap;
+                case FaultFacetType::kDns:
+                    return HealthObserverTypeEnum::kDns;
+                case FaultFacetType::kTestObserver:
+                    return HealthObserverTypeEnum::kTest;
+                default:
+                    return boost::none;
+            }
+        };
+
+        auto getIntensity = [this, intensities, &toObserverType](FaultFacetType type) {
+            auto observerType = toObserverType(type);
+            if (observerType) {
+                auto x = intensities->_data->getValues();
+                if (x) {
+                    for (auto setting : *x) {
+                        if (setting.getType() == observerType) {
+                            return setting.getIntensity();
+                        }
+                    }
+                }
+                return HealthObserverIntensityEnum::kOff;
+            } else {
+                // TODO SERVER-61944: this is for kMock1 & kMock2. Remove this branch once mock
+                // types are deleted.
+                if (_facetToIntensityMapForTest.contains(type)) {
+                    return _facetToIntensityMapForTest.at(type);
+                }
+                return HealthObserverIntensityEnum::kCritical;
+            }
+        };
+
+        return getIntensity(type);
     }
 
     bool isHealthObserverEnabled(FaultFacetType type) {
@@ -152,23 +187,31 @@ private:
 
     template <typename T, typename R>
     R _getPropertyByType(FaultFacetType type, synchronized_value<T>* data, R defaultValue) const {
+        // TODO: update this function with additional fault facets when they are added
+        boost::optional<R> result;
         switch (type) {
             case FaultFacetType::kLdap:
-                return (*data)->getLdap();
+                result = (*data)->getLdap();
+                break;
             case FaultFacetType::kDns:
-                return (*data)->getDns();
+                result = (*data)->getDns();
+                break;
             case FaultFacetType::kTestObserver:
-                return (*data)->getTest();
+                result = (*data)->getTest();
+                break;
             case FaultFacetType::kSystem:
-                return defaultValue;
+                result = defaultValue;
+                break;
            case FaultFacetType::kMock1:
-                return defaultValue;
+                result = defaultValue;
+                break;
            case FaultFacetType::kMock2:
-                return defaultValue;
-                // TODO: update this function with additional fault facets when they are added
+                result = defaultValue;
+                break;
            default:
                 MONGO_UNREACHABLE;
         }
+        return *result;
     }
 
     bool _periodicChecksDisabledForTests = false;
diff --git a/src/mongo/db/process_health/fault_manager_test.cpp b/src/mongo/db/process_health/fault_manager_test.cpp
index d216206c505..e3466699906 100644
--- a/src/mongo/db/process_health/fault_manager_test.cpp
+++ b/src/mongo/db/process_health/fault_manager_test.cpp
@@ -50,9 +50,9 @@ TEST(FaultManagerTest, Registration) {
 TEST_F(FaultManagerTest, GetHealthObserverIntensity) {
     auto config = manager().getConfig();
     ASSERT(config.getHealthObserverIntensity(FaultFacetType::kLdap) ==
-           HealthObserverIntensityEnum::kNonCritical);
+           HealthObserverIntensityEnum::kOff);
     ASSERT(config.getHealthObserverIntensity(FaultFacetType::kDns) ==
-           HealthObserverIntensityEnum::kNonCritical);
+           HealthObserverIntensityEnum::kOff);
 }
 
 }  // namespace
diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h
index fc567c5b0fb..59d937846c7 100644
--- a/src/mongo/db/process_health/fault_manager_test_suite.h
+++ b/src/mongo/db/process_health/fault_manager_test_suite.h
@@ -236,7 +236,7 @@ public:
         tickSource().advance(d);
     }
 
-    static inline const Seconds kWaitTimeout{30};
+    static inline const Seconds kWaitTimeout{10};
     static inline const Milliseconds kSleepTime{1};
 
     static inline const int kActiveFaultDurationSecs = 1;
@@ -251,7 +251,7 @@ public:
                 return;
             sleepFor(kSleepTime);
         }
-        invariant(false);
+        ASSERT(false);
     }
 
     static inline const Milliseconds kCheckTimeIncrement{100};
diff --git a/src/mongo/db/process_health/health_check_status.h b/src/mongo/db/process_health/health_check_status.h
index 7d20016331d..5e5e26ff97d 100644
--- a/src/mongo/db/process_health/health_check_status.h
+++ b/src/mongo/db/process_health/health_check_status.h
@@ -60,6 +60,9 @@ public:
     explicit HealthCheckStatus(FaultFacetType type)
         : _type(type), _severity(0), _description("resolved"_sd) {}
 
+    explicit HealthCheckStatus(HealthObserverTypeEnum type)
+        : _type(toFaultFacetType(type)), _severity(0), _description("resolved"_sd) {}
+
     HealthCheckStatus(const HealthCheckStatus&) = default;
     HealthCheckStatus& operator=(const HealthCheckStatus&) = default;
     HealthCheckStatus(HealthCheckStatus&&) = default;
@@ -114,6 +117,7 @@ private:
     friend std::ostream& operator<<(std::ostream&, const HealthCheckStatus&);
     friend StringBuilder& operator<<(StringBuilder& s, const HealthCheckStatus& hcs);
 
+
     FaultFacetType _type;
     double _severity;
     std::string _description;
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
index 36d445cc386..8258d2da0e9 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.cpp
@@ -26,23 +26,66 @@
  * exception statement from all source files in the program, then also delete
  * it in the license file.
  */
 
+#include <algorithm>
+
 #include "mongo/bson/json.h"
+#include "mongo/db/process_health/fault_manager.h"
 #include "mongo/db/process_health/health_monitoring_server_parameters_gen.h"
 #include "mongo/db/process_health/health_observer.h"
 
 namespace mongo {
 
+namespace {
+// Replaces values in oldIntensities with values in newIntensities while preserving all values in
+// oldIntensities not in newIntensities.
+HealthObserverIntensities mergeIntensities(const HealthObserverIntensities& oldIntensities,
+                                           const HealthObserverIntensities& newIntensities) {
+    using namespace std;
+    HealthObserverIntensities result = oldIntensities;
+    auto optionalOldValues = result.getValues();
+    auto optionalNewValues = newIntensities.getValues();
+    if (!optionalNewValues) {
+        return oldIntensities;
+    }
+    if (!optionalOldValues) {
+        result.setValues(*optionalNewValues);
+        return result;
+    }
+    for (const auto& setting : *optionalNewValues) {
+        auto it = find_if(begin(*optionalOldValues),
+                          end(*optionalOldValues),
+                          [&setting](const HealthObserverIntensitySetting& destSetting) {
+                              return (destSetting.getType() == setting.getType()) ? true : false;
+                          });
+        if (it != optionalOldValues->end()) {
+            *it = setting;
+        } else {
+            optionalOldValues->emplace_back(setting);
+        }
+    }
+    result.setValues(*optionalOldValues);
+    return result;
+}
+}  // namespace
+
 Status HealthMonitoringIntensitiesServerParameter::setFromString(const std::string& value) {
-    *_data = HealthObserverIntensities::parse(
+    auto oldValue = **_data;
+    auto newValue = HealthObserverIntensities::parse(
         IDLParserErrorContext("health monitoring intensities"), fromjson(value));
+    newValue = mergeIntensities(oldValue, newValue);
+    process_health::FaultManager::healthMonitoringIntensitiesUpdated(oldValue, newValue);
+    **_data = newValue;
     return Status::OK();
 }
 
 Status HealthMonitoringIntensitiesServerParameter::set(const BSONElement& newValueElement) {
-    *_data = HealthObserverIntensities::parse(
+    auto oldValue = **_data;
+    auto newValue = HealthObserverIntensities::parse(
        IDLParserErrorContext("health monitoring intensities"), newValueElement.Obj());
+    newValue = mergeIntensities(oldValue, newValue);
+    process_health::FaultManager::healthMonitoringIntensitiesUpdated(oldValue, newValue);
+    **_data = newValue;
     return Status::OK();
 }
diff --git a/src/mongo/db/process_health/health_monitoring_server_parameters.idl b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
index 66c17fd15df..c6b44e56701 100644
--- a/src/mongo/db/process_health/health_monitoring_server_parameters.idl
+++ b/src/mongo/db/process_health/health_monitoring_server_parameters.idl
@@ -42,24 +42,36 @@ enums:
             kCritical: "critical"
             kNonCritical: "non-critical"
 
+    HealthObserverType:
+        description: "Enum representing available health observer types"
+        type: string
+        values:
+            kLdap: "ldap"
+            kDns: "dns"
+            kTest: "test"
+
 structs:
-    HealthObserverIntensities:
-        description: "A struct representing the health observer intensities."
+    HealthObserverIntensitySetting:
+        description: "One health observer intensity setting"
         strict: true
         fields:
-            dns:
-                description: "Intensity of DNS fault facet"
-                type: HealthObserverIntensity
-                default: kNonCritical
-            ldap:
-                description: "Intensity of LDAP fault facet"
-                type: HealthObserverIntensity
-                default: kNonCritical
-            test:
-                description: "Intensity of test fault facet"
+            type:
+                type: HealthObserverType
+                optional: false
+            intensity:
                 type: HealthObserverIntensity
+                optional: false
                 default: kOff
 
+    HealthObserverIntensities:
+        description: "A struct representing the health observer intensities."
+        strict: false
+        fields:
+            values:
+                description: "Array of health observer intensity settings"
+                type: array<HealthObserverIntensitySetting>
+                optional: true
+
     HealthObserverIntervals:
         description: "A struct representing the interval in milliseconds for each health observer."
         strict: true
@@ -77,7 +89,7 @@ structs:
             test:
                 description: "Test health observer health check interval."
                 type: int
-                default: 1
+                default: 10
                 validator: { gt: 0 }
 
     HealthObserverProgressMonitorConfig:
diff --git a/src/mongo/db/process_health/test_health_observer.cpp b/src/mongo/db/process_health/test_health_observer.cpp
index 254a70f217b..ae5747895ea 100644
--- a/src/mongo/db/process_health/test_health_observer.cpp
+++ b/src/mongo/db/process_health/test_health_observer.cpp
@@ -26,10 +26,13 @@
  * exception statement from all source files in the program, then also delete
  * it in the license file.
  */
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kProcessHealth
 
 #include "mongo/db/process_health/test_health_observer.h"
+
 #include "mongo/db/commands/test_commands_enabled.h"
 #include "mongo/db/process_health/health_observer_registration.h"
+#include "mongo/logv2/log.h"
 
 namespace mongo {
 namespace process_health {
@@ -37,6 +40,7 @@ MONGO_FAIL_POINT_DEFINE(hangTestHealthObserver);
 MONGO_FAIL_POINT_DEFINE(testHealthObserver);
 Future<HealthCheckStatus> TestHealthObserver::periodicCheckImpl(
     PeriodicHealthCheckContext&& periodicCheckContext) {
+    LOGV2_DEBUG(5936801, 2, "Test health observer executing");
     hangTestHealthObserver.pauseWhileSet();
 
     auto result = Future<HealthCheckStatus>::makeReady(makeHealthyStatus());
@@ -50,6 +54,7 @@ Future<HealthCheckStatus> TestHealthObserver::periodicCheckImpl(
         },
         [&](const BSONObj& data) { return !data.isEmpty(); });
 
+    LOGV2_DEBUG(5936802, 2, "Test health observer returns", "result"_attr = result.get());
     return result;
 }
 
@@ -58,6 +63,7 @@ MONGO_INITIALIZER(TestHealthObserver)(InitializerContext*) {
     // Failpoints can only be set when test commands are enabled, and so the test health observer
     // is only useful in that case.
     if (getTestCommandsEnabled()) {
+        LOGV2(5936803, "Test health observer instantiated");
        HealthObserverRegistration::registerObserverFactory(
             [](ServiceContext* svcCtx) { return std::make_unique<TestHealthObserver>(svcCtx); });
     }
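Editor's note on the merge semantics introduced in `mergeIntensities` above: a runtime `setParameter` that names only some observers leaves the other observers' intensities untouched rather than resetting them to defaults. The snippet below is a hedged illustration of that behavior based on the code in this diff, not a test from the commit; the `ShardingTest` handle `st` and its prior `ldap: "critical"` configuration are assumptions for the example.

```js
// Sketch of the merge-preserving behavior (assumes `st` is a ShardingTest whose
// mongos was started with the "ldap" observer at intensity "critical").

// Update only the "test" observer at runtime...
assert.commandWorked(st.s0.adminCommand({
    setParameter: 1,
    healthMonitoringIntensities: {values: [{type: "test", intensity: "non-critical"}]}
}));

// ...and the previously configured "ldap" intensity is preserved, because new
// values are merged into the old settings rather than replacing them wholesale.
const res = assert.commandWorked(
    st.s0.adminCommand({getParameter: 1, healthMonitoringIntensities: 1}));
const ldap = res.healthMonitoringIntensities.values.find(v => v.type === "ldap");
assert.eq(ldap.intensity, "critical");
```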