From 5579b1de0cacdccfbacac795b56b123ed20c9670 Mon Sep 17 00:00:00 2001 From: Kshitij Gupta Date: Tue, 14 Dec 2021 09:50:49 +0000 Subject: SERVER-61930: Individual health observers should return an error if a timeout period elapses when doing a single health check. (cherry picked from commit fa202b3594be0389f1fcd7e93e8f6d11ccb6e933) --- src/mongo/db/process_health/deadline_future.h | 109 +++++++++++++++++++++ .../db/process_health/fault_manager_test_suite.h | 13 ++- src/mongo/db/process_health/health_observer.h | 5 + .../db/process_health/health_observer_base.cpp | 12 ++- src/mongo/db/process_health/health_observer_base.h | 3 +- src/mongo/db/process_health/health_observer_mock.h | 11 ++- .../db/process_health/health_observer_test.cpp | 35 ++++++- src/mongo/db/process_health/test_health_observer.h | 8 +- 8 files changed, 182 insertions(+), 14 deletions(-) create mode 100644 src/mongo/db/process_health/deadline_future.h diff --git a/src/mongo/db/process_health/deadline_future.h b/src/mongo/db/process_health/deadline_future.h new file mode 100644 index 00000000000..ee5768b03c0 --- /dev/null +++ b/src/mongo/db/process_health/deadline_future.h @@ -0,0 +1,109 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ +#pragma once + +#include "mongo/db/process_health/health_check_status.h" +#include "mongo/executor/task_executor.h" +#include "mongo/util/future.h" + +namespace mongo { + +namespace process_health { +/** + * A deadline future wraps an inputFuture and returns an outputFuture where: + * - the outputFuture is set to the result of the inputFuture if the inputFuture finishes within + * timeout. + * - otherwise the outputFuture returns an error. + */ +template +class DeadlineFuture { +public: + DeadlineFuture(std::shared_ptr executor, + Future inputFuture, + Milliseconds timeout) + : _executor(executor) { + _outputFuturePromise = std::make_unique>(); + + auto swCbHandle = _executor->scheduleWorkAt( + _executor->now() + timeout, [this](const executor::TaskExecutor::CallbackArgs& cbData) { + auto lk = stdx::lock_guard(_mutex); + + if (!cbData.status.isOK()) { + return; + } + + if (!get().isReady()) { + _outputFuturePromise->setError( + Status(ErrorCodes::ExceededTimeLimit, "Deadline future timed out")); + } + }); + + { + auto lk = stdx::lock_guard(_mutex); + if (!swCbHandle.isOK() && !get().isReady()) { + _outputFuturePromise->setError(swCbHandle.getStatus()); + return; + } + } + + _timeoutCbHandle = swCbHandle.getValue(); + + _inputFuture = std::move(inputFuture).onCompletion([this](StatusWith status) { + { + auto lk = stdx::lock_guard(_mutex); + _executor->cancel(_timeoutCbHandle.get()); + if (!get().isReady()) { + _outputFuturePromise->setFrom(status); + } + return status; + } + }); + } + + ~DeadlineFuture() { + { + auto lk = stdx::lock_guard(_mutex); + _executor->cancel(_timeoutCbHandle.get()); + } + } + + SharedSemiFuture get() const { + return _outputFuturePromise->getFuture(); + } + +private: + mutable Mutex _mutex = + MONGO_MAKE_LATCH(HierarchicalAcquisitionLevel(4), "DeadlineFuture::_mutex"); + const std::shared_ptr _executor; + Future _inputFuture; + boost::optional _timeoutCbHandle; + std::unique_ptr> _outputFuturePromise; +}; +} // namespace process_health +} // namespace mongo diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h index 807a771ea06..dbba93d2474 100644 --- a/src/mongo/db/process_health/fault_manager_test_suite.h +++ b/src/mongo/db/process_health/fault_manager_test_suite.h @@ -182,13 +182,20 @@ public: } void registerMockHealthObserver(FaultFacetType mockType, - std::function getSeverityCallback) { + std::function getSeverityCallback, + Milliseconds timeout) { HealthObserverRegistration::registerObserverFactory( - [mockType, getSeverityCallback](ServiceContext* svcCtx) { - return std::make_unique(mockType, svcCtx, getSeverityCallback); + [mockType, getSeverityCallback, timeout](ServiceContext* svcCtx) { + return std::make_unique( + mockType, svcCtx, getSeverityCallback, timeout); }); } + void registerMockHealthObserver(FaultFacetType mockType, + std::function getSeverityCallback) { + registerMockHealthObserver(mockType, getSeverityCallback, Milliseconds(Seconds(30))); + } + template void registerHealthObserver() { HealthObserverRegistration::registerObserverFactory( diff --git a/src/mongo/db/process_health/health_observer.h b/src/mongo/db/process_health/health_observer.h index 482dcbcfa92..f6178889faa 100644 --- a/src/mongo/db/process_health/health_observer.h +++ b/src/mongo/db/process_health/health_observer.h @@ -87,6 +87,11 @@ public: * Value used to introduce jitter between health check invocations. */ virtual Milliseconds healthCheckJitter() const = 0; + + /** + * Timeout value enforced on an individual health check. + */ + virtual Milliseconds getObserverTimeout() const = 0; }; } // namespace process_health diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp index 32d732d16a7..c7be7035b16 100644 --- a/src/mongo/db/process_health/health_observer_base.cpp +++ b/src/mongo/db/process_health/health_observer_base.cpp @@ -31,6 +31,7 @@ #include "mongo/db/process_health/health_observer_base.h" +#include "mongo/db/process_health/deadline_future.h" #include "mongo/db/service_context.h" #include "mongo/logv2/log.h" @@ -48,17 +49,17 @@ SharedSemiFuture HealthObserverBase::periodicCheck( { auto lk = stdx::lock_guard(_mutex); if (_currentlyRunningHealthCheck) { - return _periodicCheckPromise->getFuture(); + return _deadlineFuture->get(); } LOGV2_DEBUG(6007902, 2, "Start periodic health check", "observerType"_attr = getType()); const auto now = _svcCtx->getPreciseClockSource()->now(); _lastTimeTheCheckWasRun = now; _currentlyRunningHealthCheck = true; - _periodicCheckPromise = std::make_unique>(); } - _periodicCheckPromise->setFrom( + _deadlineFuture = std::make_unique>( + taskExecutor, periodicCheckImpl({cancellationToken, taskExecutor}) .onCompletion([this](StatusWith status) { if (!status.isOK()) { @@ -77,9 +78,10 @@ SharedSemiFuture HealthObserverBase::periodicCheck( _currentlyRunningHealthCheck = false; _lastTimeCheckCompleted = now; return status; - })); + }), + getObserverTimeout()); - return _periodicCheckPromise->getFuture(); + return _deadlineFuture->get(); } HealthCheckStatus HealthObserverBase::makeHealthyStatus() const { diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h index d348edc4902..82ca9643754 100644 --- a/src/mongo/db/process_health/health_observer_base.h +++ b/src/mongo/db/process_health/health_observer_base.h @@ -30,6 +30,7 @@ #include "mongo/db/process_health/health_observer.h" +#include "mongo/db/process_health/deadline_future.h" #include "mongo/db/service_context.h" namespace mongo { @@ -104,7 +105,7 @@ protected: // Indicates if there any check running to prevent running checks concurrently. bool _currentlyRunningHealthCheck = false; - std::unique_ptr> _periodicCheckPromise; + std::unique_ptr> _deadlineFuture; // Enforces the safety interval. Date_t _lastTimeTheCheckWasRun; Date_t _lastTimeCheckCompleted; diff --git a/src/mongo/db/process_health/health_observer_mock.h b/src/mongo/db/process_health/health_observer_mock.h index 230145bedf2..1929d0fde79 100644 --- a/src/mongo/db/process_health/health_observer_mock.h +++ b/src/mongo/db/process_health/health_observer_mock.h @@ -44,10 +44,12 @@ class HealthObserverMock : public HealthObserverBase { public: HealthObserverMock(FaultFacetType mockType, ServiceContext* svcCtx, - std::function getSeverityCallback) + std::function getSeverityCallback, + Milliseconds observerTimeout) : HealthObserverBase(svcCtx), _mockType(mockType), - _getSeverityCallback(getSeverityCallback) {} + _getSeverityCallback(getSeverityCallback), + _observerTimeout(observerTimeout) {} virtual ~HealthObserverMock() = default; @@ -56,6 +58,10 @@ protected: return _mockType; } + Milliseconds getObserverTimeout() const override { + return _observerTimeout; + } + Future periodicCheckImpl( PeriodicHealthCheckContext&& periodicCheckContext) override { @@ -86,6 +92,7 @@ protected: private: const FaultFacetType _mockType; std::function _getSeverityCallback; + const Milliseconds _observerTimeout; }; } // namespace process_health diff --git a/src/mongo/db/process_health/health_observer_test.cpp b/src/mongo/db/process_health/health_observer_test.cpp index 959310ba788..afe75804e38 100644 --- a/src/mongo/db/process_health/health_observer_test.cpp +++ b/src/mongo/db/process_health/health_observer_test.cpp @@ -107,7 +107,7 @@ TEST_F(FaultManagerTest, Stats) { TEST_F(FaultManagerTest, ProgressMonitorCheck) { AtomicWord shouldBlock{true}; - registerMockHealthObserver(FaultFacetType::kMock1, [this, &shouldBlock] { + registerMockHealthObserver(FaultFacetType::kMock1, [&shouldBlock] { while (shouldBlock.load()) { sleepFor(Milliseconds(1)); } @@ -171,6 +171,39 @@ TEST_F(FaultManagerTest, PeriodicHealthCheckOnErrorMakesBadHealthStatus) { }); } +TEST_F(FaultManagerTest, + DeadlineFutureCausesTransientFaultWhenObserverBlocksAndGetsResolvedWhenObserverUnblocked) { + resetManager(std::make_unique()); + feature_flags::gFeatureFlagHealthMonitoring = true; + const auto saveActiveFaultDuration = gActiveFaultDurationSecs.load(); + gActiveFaultDurationSecs.store(5); + + AtomicWord shouldBlock{true}; + registerMockHealthObserver(FaultFacetType::kMock1, + [&shouldBlock] { + while (shouldBlock.load()) { + sleepFor(Milliseconds(1)); + } + return 0.0; + }, + Milliseconds(100)); + + ASSERT_TRUE(manager().getFaultState() == FaultState::kStartupCheck); + + auto initialHealthCheckFuture = manager().startPeriodicHealthChecks(); + + assertSoon([this] { + return manager().currentFault() && manager().getFaultState() == FaultState::kStartupCheck; + }); + + shouldBlock.store(false); + + assertSoon([this] { return manager().getFaultState() == FaultState::kOk; }); + + resetManager(); // Before fields above go out of scope. + gActiveFaultDurationSecs.store(saveActiveFaultDuration); +} + } // namespace } // namespace process_health } // namespace mongo diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h index 42670ee79a9..85de8a65998 100644 --- a/src/mongo/db/process_health/test_health_observer.h +++ b/src/mongo/db/process_health/test_health_observer.h @@ -34,6 +34,9 @@ namespace mongo { namespace process_health { class TestHealthObserver : public HealthObserverBase { public: + TestHealthObserver(ServiceContext* svcCtx) : HealthObserverBase(svcCtx){}; + +protected: FaultFacetType getType() const override { return FaultFacetType::kTestObserver; } @@ -42,9 +45,10 @@ public: return Milliseconds(0); } - TestHealthObserver(ServiceContext* svcCtx) : HealthObserverBase(svcCtx){}; + Milliseconds getObserverTimeout() const override { + return Milliseconds(Seconds(30)); + } -protected: Future periodicCheckImpl( PeriodicHealthCheckContext&& periodicCheckContext) override; }; -- cgit v1.2.1