From c3707e4b508b3e69f3146e068b01dac88c9106c3 Mon Sep 17 00:00:00 2001 From: LaMont Nelson Date: Wed, 8 Dec 2021 01:24:59 +0000 Subject: SERVER-59397 Add jitter when scheduling next health check --- src/mongo/db/process_health/fault_manager.cpp | 39 ++++++++++++++++++---- src/mongo/db/process_health/fault_manager_config.h | 8 ++++- src/mongo/db/process_health/health_observer.h | 16 ++++----- .../db/process_health/health_observer_base.cpp | 7 +++- src/mongo/db/process_health/health_observer_base.h | 17 ++++++---- 5 files changed, 63 insertions(+), 24 deletions(-) (limited to 'src/mongo/db') diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp index 2e994845427..90d9ecf57f2 100644 --- a/src/mongo/db/process_health/fault_manager.cpp +++ b/src/mongo/db/process_health/fault_manager.cpp @@ -33,6 +33,8 @@ #include "mongo/db/process_health/fault_manager.h" +#include + #include "mongo/db/process_health/fault_facet_impl.h" #include "mongo/db/process_health/fault_impl.h" #include "mongo/db/process_health/fault_manager_config.h" @@ -276,7 +278,7 @@ boost::optional FaultManager::handleTransientFault(const OptionalMes } boost::optional FaultManager::handleActiveFault(const OptionalMessageType& message) { - LOGV2_FATAL(5936509, "Fault manager received active fault"); + LOGV2_FATAL(5936509, "Halting Process due to ongoing fault", "fault"_attr = *_fault); return boost::none; } @@ -290,7 +292,14 @@ void FaultManager::logMessageReceived(FaultState state, const HealthCheckStatus& } void FaultManager::logCurrentState(FaultState, FaultState newState, const OptionalMessageType&) { - LOGV2(5936503, "Fault manager changed state ", "state"_attr = (str::stream() << newState)); + if (_fault) { + LOGV2(5939703, + "Fault manager changed state ", + "state"_attr = (str::stream() << newState), + "fault"_attr = *_fault); + } else { + LOGV2(5936503, "Fault manager changed state ", "state"_attr = (str::stream() << newState)); + } } void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) { @@ -412,11 +421,27 @@ FaultFacetsContainerPtr FaultManager::getOrCreateFaultFacetsContainer() { void FaultManager::healthCheck(HealthObserver* observer, std::shared_ptr> token) { auto schedulerCb = [this, observer, token] { - auto periodicThreadCbHandleStatus = this->_taskExecutor->scheduleWorkAt( - _taskExecutor->now() + this->_config->kPeriodicHealthCheckInterval, + auto scheduledTime = _taskExecutor->now() + _config->kPeriodicHealthCheckInterval + + std::min(observer->healthCheckJitter(), + FaultManagerConfig::kPeriodicHealthCheckMaxJitter); + LOGV2_DEBUG(5939701, + 2, + "Schedule next health check", + "observerType"_attr = str::stream() << observer->getType(), + "scheduledTime"_attr = scheduledTime); + + auto periodicThreadCbHandleStatus = _taskExecutor->scheduleWorkAt( + scheduledTime, [this, observer, token](const mongo::executor::TaskExecutor::CallbackArgs& cbData) { if (!cbData.status.isOK()) { - return; + LOGV2_DEBUG(5939702, + 1, + "Fault manager received an error", + "status"_attr = cbData.status); + if (ErrorCodes::isA(cbData.status.code())) { + return; + } + // continue health checking otherwise } healthCheck(observer, token); }); @@ -441,7 +466,7 @@ void FaultManager::healthCheck(HealthObserver* observer, std::shared_ptrgetType(), 1.0, s.reason()); LOGV2_ERROR( 6007901, "Unexpected failure during health check", "status"_attr = healthCheckStatus); - this->accept(healthCheckStatus); + accept(healthCheckStatus); return healthCheckStatus; }; @@ -470,7 +495,7 @@ void FaultManager::healthCheck(HealthObserver* observer, std::shared_ptraccept(status.getValue()); + accept(status.getValue()); return status.getValue(); }); auto futurePtr = diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h index 830fb97ae46..ce17c74b7a8 100644 --- a/src/mongo/db/process_health/fault_manager_config.h +++ b/src/mongo/db/process_health/fault_manager_config.h @@ -68,7 +68,13 @@ enum class FaultFacetType { kSystem, kMock1, kMock2, kLdap, kDns }; class FaultManagerConfig { public: - static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(50)}; + /* Default value of time between health checks + * TODO SERVER-61947 make this a property of health observers + */ + static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(1000)}; + + /* Maximum possible jitter added to the time between health checks */ + static auto inline constexpr kPeriodicHealthCheckMaxJitter{Milliseconds{100}}; HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) { auto intensities = getHealthObserverIntensities(); diff --git a/src/mongo/db/process_health/health_observer.h b/src/mongo/db/process_health/health_observer.h index 2c4a04b5399..482dcbcfa92 100644 --- a/src/mongo/db/process_health/health_observer.h +++ b/src/mongo/db/process_health/health_observer.h @@ -57,8 +57,6 @@ struct HealthObserverLivenessStats { /** * Interface to conduct periodic health checks. - * Every instance of health observer is wired internally to update the state of the FaultManager - * when a problem is detected. */ class HealthObserver { public: @@ -73,13 +71,8 @@ public: virtual FaultFacetType getType() const = 0; /** - * Triggers health check. - * It should be safe to invoke this method arbitrary often, the implementation - * should prorate the invocations to avoid DoS. - * The implementation may or may not block for the completion of the check, this remains - * unspecified. - * Note: no methods in this class should return any check results, the proper way to - * get result is to check facets in the FaultManager. + * Triggers health check. The implementation should not block to wait for the completion + * of this check. * * @param factory Interface to get or create the factory of facets container. */ @@ -89,6 +82,11 @@ public: std::shared_ptr> cancellationToken) = 0; virtual HealthObserverLivenessStats getStats() const = 0; + + /** + * Value used to introduce jitter between health check invocations. + */ + virtual Milliseconds healthCheckJitter() const = 0; }; } // namespace process_health diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp index 9ba3bdf9e9a..32d732d16a7 100644 --- a/src/mongo/db/process_health/health_observer_base.cpp +++ b/src/mongo/db/process_health/health_observer_base.cpp @@ -37,7 +37,8 @@ namespace mongo { namespace process_health { -HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx) : _svcCtx(svcCtx) {} +HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx) + : _svcCtx(svcCtx), _rand(PseudoRandom(SecureRandom().nextInt64())) {} SharedSemiFuture HealthObserverBase::periodicCheck( FaultFacetsContainerFactory& factory, @@ -116,5 +117,9 @@ HealthObserverLivenessStats HealthObserverBase::getStatsLocked(WithLock) const { return stats; } +Milliseconds HealthObserverBase::healthCheckJitter() const { + return randDuration(FaultManagerConfig::kPeriodicHealthCheckMaxJitter); +} + } // namespace process_health } // namespace mongo diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h index 0e8fd424b18..d348edc4902 100644 --- a/src/mongo/db/process_health/health_observer_base.h +++ b/src/mongo/db/process_health/health_observer_base.h @@ -57,12 +57,6 @@ public: return _svcCtx; } - /** - * @return Milliseconds the shortest interval it is safe to repeat this check on. - */ - virtual Milliseconds minimalCheckInterval() const { - return Milliseconds(10); - } // Implements the common logic for periodic checks. // Every observer should implement periodicCheckImpl() for specific tests. @@ -75,6 +69,7 @@ public: HealthCheckStatus makeSimpleFailedStatus(double severity, std::vector&& failures) const; HealthObserverLivenessStats getStats() const override; + Milliseconds healthCheckJitter() const override; // Common params for every health check. struct PeriodicHealthCheckContext { @@ -94,6 +89,14 @@ protected: HealthObserverLivenessStats getStatsLocked(WithLock) const; + template + T randDuration(T upperBound) const { + auto upperCount = durationCount(upperBound); + stdx::lock_guard lock(_mutex); + auto resultCount = _rand.nextInt64(upperCount); + return T(resultCount); + } + ServiceContext* const _svcCtx; mutable Mutex _mutex = @@ -107,6 +110,8 @@ protected: Date_t _lastTimeCheckCompleted; int _completedChecksCount = 0; int _completedChecksWithFaultCount = 0; + + mutable PseudoRandom _rand; }; } // namespace process_health -- cgit v1.2.1