summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/mongo/db/process_health/fault_manager.cpp39
-rw-r--r--src/mongo/db/process_health/fault_manager_config.h8
-rw-r--r--src/mongo/db/process_health/health_observer.h16
-rw-r--r--src/mongo/db/process_health/health_observer_base.cpp7
-rw-r--r--src/mongo/db/process_health/health_observer_base.h17
5 files changed, 63 insertions, 24 deletions
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index a9266e9d3b6..8458c640048 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -33,6 +33,8 @@
#include "mongo/db/process_health/fault_manager.h"
+#include <algorithm>
+
#include "mongo/db/process_health/fault_facet_impl.h"
#include "mongo/db/process_health/fault_impl.h"
#include "mongo/db/process_health/fault_manager_config.h"
@@ -271,7 +273,7 @@ boost::optional<FaultState> FaultManager::handleTransientFault(const OptionalMes
}
boost::optional<FaultState> FaultManager::handleActiveFault(const OptionalMessageType& message) {
- LOGV2_FATAL(5936509, "Fault manager received active fault");
+ LOGV2_FATAL(5936509, "Halting Process due to ongoing fault", "fault"_attr = *_fault);
return boost::none;
}
@@ -285,7 +287,14 @@ void FaultManager::logMessageReceived(FaultState state, const HealthCheckStatus&
}
void FaultManager::logCurrentState(FaultState, FaultState newState, const OptionalMessageType&) {
- LOGV2(5936503, "Fault manager changed state ", "state"_attr = (str::stream() << newState));
+ if (_fault) {
+ LOGV2(5939703,
+ "Fault manager changed state ",
+ "state"_attr = (str::stream() << newState),
+ "fault"_attr = *_fault);
+ } else {
+ LOGV2(5936503, "Fault manager changed state ", "state"_attr = (str::stream() << newState));
+ }
}
void FaultManager::setTransientFaultDeadline(FaultState, FaultState, const OptionalMessageType&) {
@@ -410,11 +419,27 @@ FaultFacetsContainerPtr FaultManager::getOrCreateFaultFacetsContainer() {
void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token) {
auto schedulerCb = [this, observer, token] {
- auto periodicThreadCbHandleStatus = this->_taskExecutor->scheduleWorkAt(
- _taskExecutor->now() + this->_config->kPeriodicHealthCheckInterval,
+ auto scheduledTime = _taskExecutor->now() + _config->kPeriodicHealthCheckInterval +
+ std::min(observer->healthCheckJitter(),
+ FaultManagerConfig::kPeriodicHealthCheckMaxJitter);
+ LOGV2_DEBUG(5939701,
+ 2,
+ "Schedule next health check",
+ "observerType"_attr = str::stream() << observer->getType(),
+ "scheduledTime"_attr = scheduledTime);
+
+ auto periodicThreadCbHandleStatus = _taskExecutor->scheduleWorkAt(
+ scheduledTime,
[this, observer, token](const mongo::executor::TaskExecutor::CallbackArgs& cbData) {
if (!cbData.status.isOK()) {
- return;
+ LOGV2_DEBUG(5939702,
+ 1,
+ "Fault manager received an error",
+ "status"_attr = cbData.status);
+ if (ErrorCodes::isA<ErrorCategory::CancellationError>(cbData.status.code())) {
+ return;
+ }
+ // Any non-cancellation error is tolerated: fall through and schedule the next health check.
}
healthCheck(observer, token);
});
@@ -439,7 +464,7 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token
auto healthCheckStatus = HealthCheckStatus(observer->getType(), 1.0, s.reason());
LOGV2_ERROR(
6007901, "Unexpected failure during health check", "status"_attr = healthCheckStatus);
- this->accept(healthCheckStatus);
+ accept(healthCheckStatus);
return healthCheckStatus;
};
@@ -468,7 +493,7 @@ void FaultManager::healthCheck(HealthObserver* observer, CancellationToken token
return acceptNotOKStatus(status.getStatus());
}
- this->accept(status.getValue());
+ accept(status.getValue());
return status.getValue();
});
auto futurePtr =
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index 830fb97ae46..ce17c74b7a8 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -68,7 +68,13 @@ enum class FaultFacetType { kSystem, kMock1, kMock2, kLdap, kDns };
class FaultManagerConfig {
public:
- static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(50)};
+ /* Default interval between two consecutive periodic health checks.
+ * TODO SERVER-61947: make this a property of health observers.
+ */
+ static auto inline constexpr kPeriodicHealthCheckInterval{Milliseconds(1000)};
+
+ /* Maximum possible jitter added to the time between health checks */
+ static auto inline constexpr kPeriodicHealthCheckMaxJitter{Milliseconds{100}};
HealthObserverIntensityEnum getHealthObserverIntensity(FaultFacetType type) {
auto intensities = getHealthObserverIntensities();
diff --git a/src/mongo/db/process_health/health_observer.h b/src/mongo/db/process_health/health_observer.h
index 9912a624272..881c8f4b6b1 100644
--- a/src/mongo/db/process_health/health_observer.h
+++ b/src/mongo/db/process_health/health_observer.h
@@ -57,8 +57,6 @@ struct HealthObserverLivenessStats {
/**
* Interface to conduct periodic health checks.
- * Every instance of health observer is wired internally to update the state of the FaultManager
- * when a problem is detected.
*/
class HealthObserver {
public:
@@ -73,13 +71,8 @@ public:
virtual FaultFacetType getType() const = 0;
/**
- * Triggers health check.
- * It should be safe to invoke this method arbitrary often, the implementation
- * should prorate the invocations to avoid DoS.
- * The implementation may or may not block for the completion of the check, this remains
- * unspecified.
- * Note: no methods in this class should return any check results, the proper way to
- * get result is to check facets in the FaultManager.
+ * Triggers health check. The implementation should not block to wait for the completion
+ * of this check.
*
* @param factory Interface to get or create the factory of facets container.
*/
@@ -89,6 +82,11 @@ public:
CancellationToken token) = 0;
virtual HealthObserverLivenessStats getStats() const = 0;
+
+ /**
+ * Returns the extra delay to add to the periodic health check interval when this
+ * observer's next check is scheduled, so that invocations are jittered.
+ */
+ virtual Milliseconds healthCheckJitter() const = 0;
};
} // namespace process_health
diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp
index a4faea9fc73..25556c80f95 100644
--- a/src/mongo/db/process_health/health_observer_base.cpp
+++ b/src/mongo/db/process_health/health_observer_base.cpp
@@ -37,7 +37,8 @@
namespace mongo {
namespace process_health {
-HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx) : _svcCtx(svcCtx) {}
+HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx)
+ : _svcCtx(svcCtx), _rand(PseudoRandom(SecureRandom().nextInt64())) {}
SharedSemiFuture<HealthCheckStatus> HealthObserverBase::periodicCheck(
FaultFacetsContainerFactory& factory,
@@ -116,5 +117,9 @@ HealthObserverLivenessStats HealthObserverBase::getStatsLocked(WithLock) const {
return stats;
}
+Milliseconds HealthObserverBase::healthCheckJitter() const {
+ return randDuration(FaultManagerConfig::kPeriodicHealthCheckMaxJitter);
+}
+
} // namespace process_health
} // namespace mongo
diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h
index f415a8e034a..73f4e9346d8 100644
--- a/src/mongo/db/process_health/health_observer_base.h
+++ b/src/mongo/db/process_health/health_observer_base.h
@@ -57,12 +57,6 @@ public:
return _svcCtx;
}
- /**
- * @return Milliseconds the shortest interval it is safe to repeat this check on.
- */
- virtual Milliseconds minimalCheckInterval() const {
- return Milliseconds(10);
- }
// Implements the common logic for periodic checks.
// Every observer should implement periodicCheckImpl() for specific tests.
@@ -75,6 +69,7 @@ public:
HealthCheckStatus makeSimpleFailedStatus(double severity, std::vector<Status>&& failures) const;
HealthObserverLivenessStats getStats() const override;
+ Milliseconds healthCheckJitter() const override;
// Common params for every health check.
struct PeriodicHealthCheckContext {
@@ -94,6 +89,14 @@ protected:
HealthObserverLivenessStats getStatsLocked(WithLock) const;
+    /**
+     * Returns a pseudo-random duration of type T drawn from _rand, bounded by 'upperBound'
+     * (presumably in the half-open range [0, upperBound) -- confirm against PseudoRandom).
+     *
+     * A zero or negative 'upperBound' yields T(0) without consulting the generator:
+     * NOTE(review): nextInt64(max) is assumed to require max > 0 (an empty range is not a
+     * valid bound for a bounded random draw) -- confirm against PseudoRandom.
+     *
+     * Acquires _mutex while drawing from _rand, so it must not be called with _mutex held.
+     */
+    template <typename T>
+    T randDuration(T upperBound) const {
+        const auto upperCount = durationCount<T>(upperBound);
+        if (upperCount <= 0) {
+            return T(0);
+        }
+        stdx::lock_guard lock(_mutex);
+        return T(_rand.nextInt64(upperCount));
+    }
+
ServiceContext* const _svcCtx;
mutable Mutex _mutex =
@@ -107,6 +110,8 @@ protected:
Date_t _lastTimeCheckCompleted;
int _completedChecksCount = 0;
int _completedChecksWithFaultCount = 0;
+
+ mutable PseudoRandom _rand;
};
} // namespace process_health