diff options
-rw-r--r-- | src/mongo/db/process_health/fault_manager.cpp | 39 | ||||
-rw-r--r-- | src/mongo/db/process_health/fault_manager_config.h | 8 | ||||
-rw-r--r-- | src/mongo/db/process_health/health_observer.h | 16 | ||||
-rw-r--r-- | src/mongo/db/process_health/health_observer_base.cpp | 7 | ||||
-rw-r--r-- | src/mongo/db/process_health/health_observer_base.h | 17 |
5 files changed, 63 insertions, 24 deletions
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp index a9266e9d3b6..8458c640048 100644 --- a/src/mongo/db/process_health/fault_manager.cpp +++ b/src/mongo/db/process_health/fault_manager.cpp @@ -33,6 +33,8 @@ #include "mongo/db/process_health/fault_manager.h" +#include <algorithm> + #include "mongo/db/process_health/fault_facet_impl.h" #include "mongo/db/process_health/fault_impl.h" #include "mongo/db/process_health/fault_manager_config.h" @@ -271,7 +273,7 @@ boost::optional<FaultState> FaultManager::handleTransientFault(const OptionalMes } boost::optional<FaultState> FaultManager::handleActiveFault(const OptionalMessageType& message) { - LOGV2_FATAL(5936509, "Fault manager received active fault"); + LOGV2_FATAL(5936509, "Halting Process due to ongoing fault", "fault"_attr = *_fault); return boost::none; } @@ -285,7 +287,14 @@ void FaultManager::logMessageReceived(FaultState state, const HealthCheckStatus& } void FaultManager::logCurrentState(FaultState, FaultState newState, const OptionalMessageType&) { - LOGV2(5936503, "Fault manager changed state ", "state"_attr = (str::stream() << newState)); + if (_fault) { + LOGV2(5939703, + "Fault manager changed state ", + "state"_attr = (str::stream() << newState), + "fault"_attr = *_fault); + } else { + LOGV2(5936503, "Fault manager changed state ", "state"_attr = (str::stream() << newState)); + } } void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) { @@ -410,11 +419,27 @@ FaultFacetsContainerPtr FaultManager::getOrCreateFaultFacetsContainer() { void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token) { auto schedulerCb = [this, observer, token] { - auto periodicThreadCbHandleStatus = this->_taskExecutor->scheduleWorkAt( - _taskExecutor->now() + this->_config->kPeriodicHealthCheckInterval, + auto scheduledTime = _taskExecutor->now() + _config->kPeriodicHealthCheckInterval + + std::min(observer->healthCheckJitter(), + FaultManagerConfig::kPeriodicHealthCheckMaxJitter); + LOGV2_DEBUG(5939701, + 2, + "Schedule next health check", + "observerType"_attr = str::stream() << observer->getType(), + "scheduledTime"_attr = scheduledTime); + + auto periodicThreadCbHandleStatus = _taskExecutor->scheduleWorkAt( + scheduledTime, [this, observer, token](const mongo::executor::TaskExecutor::CallbackArgs& cbData) { if (!cbData.status.isOK()) { - return; + LOGV2_DEBUG(5939702, + 1, + "Fault manager received an error", + "status"_attr = cbData.status); + if (ErrorCodes::isA<ErrorCategory::CancellationError>(cbData.status.code())) { + return; + } + // continue health checking otherwise } healthCheck(observer, token); }); @@ -439,7 +464,7 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token auto healthCheckStatus = HealthCheckStatus(observer->getType(), 1.0, s.reason()); LOGV2_ERROR( 6007901, "Unexpected failure during health check", "status"_attr = healthCheckStatus); - this->accept(healthCheckStatus); + accept(healthCheckStatus); return healthCheckStatus; }; @@ -468,7 +493,7 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token return acceptNotOKStatus(status.getStatus()); } - this->accept(status.getValue()); + accept(status.getValue()); return status.getValue(); }); auto futurePtr = diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h index 830fb97ae46..ce17c74b7a8 100644 --- a/src/mongo/db/process_health/fault_manager_config.h +++ b/src/mongo/db/process_health/fault_manager_config.h @@ -68,7 +68,13 @@ enum class FaultFacetType { kSystem, kMock1, kMock2, kLdap, kDns }; class FaultManagerConfig { public: - static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(50)}; + /* Default value of time between health checks + * TODO SERVER-61947 make this a property of health observers + */ + static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(1000)}; + + /* Maximum possible jitter added to the time between health checks */ + static auto inline constexpr kPeriodicHealthCheckMaxJitter{Milliseconds{100}}; HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) { auto intensities = getHealthObserverIntensities(); diff --git a/src/mongo/db/process_health/health_observer.h b/src/mongo/db/process_health/health_observer.h index 9912a624272..881c8f4b6b1 100644 --- a/src/mongo/db/process_health/health_observer.h +++ b/src/mongo/db/process_health/health_observer.h @@ -57,8 +57,6 @@ struct HealthObserverLivenessStats { /** * Interface to conduct periodic health checks. - * Every instance of health observer is wired internally to update the state of the FaultManager - * when a problem is detected. */ class HealthObserver { public: @@ -73,13 +71,8 @@ public: virtual FaultFacetType getType() const = 0; /** - * Triggers health check. - * It should be safe to invoke this method arbitrary often, the implementation - * should prorate the invocations to avoid DoS. - * The implementation may or may not block for the completion of the check, this remains - * unspecified. - * Note: no methods in this class should return any check results, the proper way to - * get result is to check facets in the FaultManager. + * Triggers health check. The implementation should not block to wait for the completion + * of this check. * * @param factory Interface to get or create the factory of facets container. */ @@ -89,6 +82,11 @@ public: CancellationToken token) = 0; virtual HealthObserverLivenessStats getStats() const = 0; + + /** + * Value used to introduce jitter between health check invocations. + */ + virtual Milliseconds healthCheckJitter() const = 0; }; } // namespace process_health diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp index a4faea9fc73..25556c80f95 100644 --- a/src/mongo/db/process_health/health_observer_base.cpp +++ b/src/mongo/db/process_health/health_observer_base.cpp @@ -37,7 +37,8 @@ namespace mongo { namespace process_health { -HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx) : _svcCtx(svcCtx) {} +HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx) + : _svcCtx(svcCtx), _rand(PseudoRandom(SecureRandom().nextInt64())) {} SharedSemiFuture<HealthCheckStatus> HealthObserverBase::periodicCheck( FaultFacetsContainerFactory& factory, @@ -116,5 +117,9 @@ HealthObserverLivenessStats HealthObserverBase::getStatsLocked(WithLock) const { return stats; } +Milliseconds HealthObserverBase::healthCheckJitter() const { + return randDuration(FaultManagerConfig::kPeriodicHealthCheckMaxJitter); +} + } // namespace process_health } // namespace mongo diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h index f415a8e034a..73f4e9346d8 100644 --- a/src/mongo/db/process_health/health_observer_base.h +++ b/src/mongo/db/process_health/health_observer_base.h @@ -57,12 +57,6 @@ public: return _svcCtx; } - /** - * @return Milliseconds the shortest interval it is safe to repeat this check on. - */ - virtual Milliseconds minimalCheckInterval() const { - return Milliseconds(10); - } // Implements the common logic for periodic checks. // Every observer should implement periodicCheckImpl() for specific tests. @@ -75,6 +69,7 @@ public: HealthCheckStatus makeSimpleFailedStatus(double severity, std::vector<Status>&& failures) const; HealthObserverLivenessStats getStats() const override; + Milliseconds healthCheckJitter() const override; // Common params for every health check. struct PeriodicHealthCheckContext { @@ -94,6 +89,14 @@ protected: HealthObserverLivenessStats getStatsLocked(WithLock) const; + template <typename T> + T randDuration(T upperBound) const { + auto upperCount = durationCount<T>(upperBound); + stdx::lock_guard lock(_mutex); + auto resultCount = _rand.nextInt64(upperCount); + return T(resultCount); + } + ServiceContext* const _svcCtx; mutable Mutex _mutex = @@ -107,6 +110,8 @@ protected: Date_t _lastTimeCheckCompleted; int _completedChecksCount = 0; int _completedChecksWithFaultCount = 0; + + mutable PseudoRandom _rand; }; } // namespace process_health |