summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKshitij Gupta <kshitij.gupta@mongodb.com>2021-12-14 09:50:49 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-12-29 00:13:30 +0000
commit5579b1de0cacdccfbacac795b56b123ed20c9670 (patch)
treea290d22e8876b2196a420aed3d694ddeeb9cbcba
parent2772180b9312c0bd61c14f90fb644bf82e746e5d (diff)
downloadmongo-5579b1de0cacdccfbacac795b56b123ed20c9670.tar.gz
SERVER-61930: Individual health observers should return an error if a
timeout period elapses when doing a single health check. (cherry picked from commit fa202b3594be0389f1fcd7e93e8f6d11ccb6e933)
-rw-r--r--src/mongo/db/process_health/deadline_future.h109
-rw-r--r--src/mongo/db/process_health/fault_manager_test_suite.h13
-rw-r--r--src/mongo/db/process_health/health_observer.h5
-rw-r--r--src/mongo/db/process_health/health_observer_base.cpp12
-rw-r--r--src/mongo/db/process_health/health_observer_base.h3
-rw-r--r--src/mongo/db/process_health/health_observer_mock.h11
-rw-r--r--src/mongo/db/process_health/health_observer_test.cpp35
-rw-r--r--src/mongo/db/process_health/test_health_observer.h8
8 files changed, 182 insertions, 14 deletions
diff --git a/src/mongo/db/process_health/deadline_future.h b/src/mongo/db/process_health/deadline_future.h
new file mode 100644
index 00000000000..ee5768b03c0
--- /dev/null
+++ b/src/mongo/db/process_health/deadline_future.h
@@ -0,0 +1,109 @@
+/**
+ * Copyright (C) 2021-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+#pragma once
+
+#include "mongo/db/process_health/health_check_status.h"
+#include "mongo/executor/task_executor.h"
+#include "mongo/util/future.h"
+
+namespace mongo {
+
+namespace process_health {
+/**
+ * A deadline future wraps an inputFuture and returns an outputFuture where:
+ * - the outputFuture is set to the result of the inputFuture if the inputFuture finishes within
+ * timeout.
+ * - otherwise the outputFuture returns an error.
+ */
+template <typename ResultStatus>
+class DeadlineFuture {
+public:
+ DeadlineFuture(std::shared_ptr<executor::TaskExecutor> executor,
+ Future<ResultStatus> inputFuture,
+ Milliseconds timeout)
+ : _executor(executor) {
+ _outputFuturePromise = std::make_unique<SharedPromise<ResultStatus>>();
+
+ auto swCbHandle = _executor->scheduleWorkAt(
+ _executor->now() + timeout, [this](const executor::TaskExecutor::CallbackArgs& cbData) {
+ auto lk = stdx::lock_guard(_mutex);
+
+ if (!cbData.status.isOK()) {
+ return;
+ }
+
+ if (!get().isReady()) {
+ _outputFuturePromise->setError(
+ Status(ErrorCodes::ExceededTimeLimit, "Deadline future timed out"));
+ }
+ });
+
+ {
+ auto lk = stdx::lock_guard(_mutex);
+ if (!swCbHandle.isOK() && !get().isReady()) {
+ _outputFuturePromise->setError(swCbHandle.getStatus());
+ return;
+ }
+ }
+
+ _timeoutCbHandle = swCbHandle.getValue();
+
+ _inputFuture = std::move(inputFuture).onCompletion([this](StatusWith<ResultStatus> status) {
+ {
+ auto lk = stdx::lock_guard(_mutex);
+ _executor->cancel(_timeoutCbHandle.get());
+ if (!get().isReady()) {
+ _outputFuturePromise->setFrom(status);
+ }
+ return status;
+ }
+ });
+ }
+
+ ~DeadlineFuture() {
+ {
+ auto lk = stdx::lock_guard(_mutex);
+ _executor->cancel(_timeoutCbHandle.get());
+ }
+ }
+
+ SharedSemiFuture<ResultStatus> get() const {
+ return _outputFuturePromise->getFuture();
+ }
+
+private:
+ mutable Mutex _mutex =
+ MONGO_MAKE_LATCH(HierarchicalAcquisitionLevel(4), "DeadlineFuture::_mutex");
+ const std::shared_ptr<executor::TaskExecutor> _executor;
+ Future<ResultStatus> _inputFuture;
+ boost::optional<executor::TaskExecutor::CallbackHandle> _timeoutCbHandle;
+ std::unique_ptr<SharedPromise<ResultStatus>> _outputFuturePromise;
+};
+} // namespace process_health
+} // namespace mongo
diff --git a/src/mongo/db/process_health/fault_manager_test_suite.h b/src/mongo/db/process_health/fault_manager_test_suite.h
index 807a771ea06..dbba93d2474 100644
--- a/src/mongo/db/process_health/fault_manager_test_suite.h
+++ b/src/mongo/db/process_health/fault_manager_test_suite.h
@@ -182,13 +182,20 @@ public:
}
void registerMockHealthObserver(FaultFacetType mockType,
- std::function<double()> getSeverityCallback) {
+ std::function<double()> getSeverityCallback,
+ Milliseconds timeout) {
HealthObserverRegistration::registerObserverFactory(
- [mockType, getSeverityCallback](ServiceContext* svcCtx) {
- return std::make_unique<HealthObserverMock>(mockType, svcCtx, getSeverityCallback);
+ [mockType, getSeverityCallback, timeout](ServiceContext* svcCtx) {
+ return std::make_unique<HealthObserverMock>(
+ mockType, svcCtx, getSeverityCallback, timeout);
});
}
+ void registerMockHealthObserver(FaultFacetType mockType,
+ std::function<double()> getSeverityCallback) {
+ registerMockHealthObserver(mockType, getSeverityCallback, Milliseconds(Seconds(30)));
+ }
+
template <typename Observer>
void registerHealthObserver() {
HealthObserverRegistration::registerObserverFactory(
diff --git a/src/mongo/db/process_health/health_observer.h b/src/mongo/db/process_health/health_observer.h
index 482dcbcfa92..f6178889faa 100644
--- a/src/mongo/db/process_health/health_observer.h
+++ b/src/mongo/db/process_health/health_observer.h
@@ -87,6 +87,11 @@ public:
* Value used to introduce jitter between health check invocations.
*/
virtual Milliseconds healthCheckJitter() const = 0;
+
+ /**
+ * Timeout value enforced on an individual health check.
+ */
+ virtual Milliseconds getObserverTimeout() const = 0;
};
} // namespace process_health
diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp
index 32d732d16a7..c7be7035b16 100644
--- a/src/mongo/db/process_health/health_observer_base.cpp
+++ b/src/mongo/db/process_health/health_observer_base.cpp
@@ -31,6 +31,7 @@
#include "mongo/db/process_health/health_observer_base.h"
+#include "mongo/db/process_health/deadline_future.h"
#include "mongo/db/service_context.h"
#include "mongo/logv2/log.h"
@@ -48,17 +49,17 @@ SharedSemiFuture<HealthCheckStatus> HealthObserverBase::periodicCheck(
{
auto lk = stdx::lock_guard(_mutex);
if (_currentlyRunningHealthCheck) {
- return _periodicCheckPromise->getFuture();
+ return _deadlineFuture->get();
}
LOGV2_DEBUG(6007902, 2, "Start periodic health check", "observerType"_attr = getType());
const auto now = _svcCtx->getPreciseClockSource()->now();
_lastTimeTheCheckWasRun = now;
_currentlyRunningHealthCheck = true;
- _periodicCheckPromise = std::make_unique<SharedPromise<HealthCheckStatus>>();
}
- _periodicCheckPromise->setFrom(
+ _deadlineFuture = std::make_unique<DeadlineFuture<HealthCheckStatus>>(
+ taskExecutor,
periodicCheckImpl({cancellationToken, taskExecutor})
.onCompletion([this](StatusWith<HealthCheckStatus> status) {
if (!status.isOK()) {
@@ -77,9 +78,10 @@ SharedSemiFuture<HealthCheckStatus> HealthObserverBase::periodicCheck(
_currentlyRunningHealthCheck = false;
_lastTimeCheckCompleted = now;
return status;
- }));
+ }),
+ getObserverTimeout());
- return _periodicCheckPromise->getFuture();
+ return _deadlineFuture->get();
}
HealthCheckStatus HealthObserverBase::makeHealthyStatus() const {
diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h
index d348edc4902..82ca9643754 100644
--- a/src/mongo/db/process_health/health_observer_base.h
+++ b/src/mongo/db/process_health/health_observer_base.h
@@ -30,6 +30,7 @@
#include "mongo/db/process_health/health_observer.h"
+#include "mongo/db/process_health/deadline_future.h"
#include "mongo/db/service_context.h"
namespace mongo {
@@ -104,7 +105,7 @@ protected:
// Indicates if there any check running to prevent running checks concurrently.
bool _currentlyRunningHealthCheck = false;
- std::unique_ptr<SharedPromise<HealthCheckStatus>> _periodicCheckPromise;
+ std::unique_ptr<DeadlineFuture<HealthCheckStatus>> _deadlineFuture;
// Enforces the safety interval.
Date_t _lastTimeTheCheckWasRun;
Date_t _lastTimeCheckCompleted;
diff --git a/src/mongo/db/process_health/health_observer_mock.h b/src/mongo/db/process_health/health_observer_mock.h
index 230145bedf2..1929d0fde79 100644
--- a/src/mongo/db/process_health/health_observer_mock.h
+++ b/src/mongo/db/process_health/health_observer_mock.h
@@ -44,10 +44,12 @@ class HealthObserverMock : public HealthObserverBase {
public:
HealthObserverMock(FaultFacetType mockType,
ServiceContext* svcCtx,
- std::function<double()> getSeverityCallback)
+ std::function<double()> getSeverityCallback,
+ Milliseconds observerTimeout)
: HealthObserverBase(svcCtx),
_mockType(mockType),
- _getSeverityCallback(getSeverityCallback) {}
+ _getSeverityCallback(getSeverityCallback),
+ _observerTimeout(observerTimeout) {}
virtual ~HealthObserverMock() = default;
@@ -56,6 +58,10 @@ protected:
return _mockType;
}
+ Milliseconds getObserverTimeout() const override {
+ return _observerTimeout;
+ }
+
Future<HealthCheckStatus> periodicCheckImpl(
PeriodicHealthCheckContext&& periodicCheckContext) override {
@@ -86,6 +92,7 @@ protected:
private:
const FaultFacetType _mockType;
std::function<double()> _getSeverityCallback;
+ const Milliseconds _observerTimeout;
};
} // namespace process_health
diff --git a/src/mongo/db/process_health/health_observer_test.cpp b/src/mongo/db/process_health/health_observer_test.cpp
index 959310ba788..afe75804e38 100644
--- a/src/mongo/db/process_health/health_observer_test.cpp
+++ b/src/mongo/db/process_health/health_observer_test.cpp
@@ -107,7 +107,7 @@ TEST_F(FaultManagerTest, Stats) {
TEST_F(FaultManagerTest, ProgressMonitorCheck) {
AtomicWord<bool> shouldBlock{true};
- registerMockHealthObserver(FaultFacetType::kMock1, [this, &shouldBlock] {
+ registerMockHealthObserver(FaultFacetType::kMock1, [&shouldBlock] {
while (shouldBlock.load()) {
sleepFor(Milliseconds(1));
}
@@ -171,6 +171,39 @@ TEST_F(FaultManagerTest, PeriodicHealthCheckOnErrorMakesBadHealthStatus) {
});
}
+TEST_F(FaultManagerTest,
+ DeadlineFutureCausesTransientFaultWhenObserverBlocksAndGetsResolvedWhenObserverUnblocked) {
+ resetManager(std::make_unique<FaultManagerConfig>());
+ feature_flags::gFeatureFlagHealthMonitoring = true;
+ const auto saveActiveFaultDuration = gActiveFaultDurationSecs.load();
+ gActiveFaultDurationSecs.store(5);
+
+ AtomicWord<bool> shouldBlock{true};
+ registerMockHealthObserver(FaultFacetType::kMock1,
+ [&shouldBlock] {
+ while (shouldBlock.load()) {
+ sleepFor(Milliseconds(1));
+ }
+ return 0.0;
+ },
+ Milliseconds(100));
+
+ ASSERT_TRUE(manager().getFaultState() == FaultState::kStartupCheck);
+
+ auto initialHealthCheckFuture = manager().startPeriodicHealthChecks();
+
+ assertSoon([this] {
+ return manager().currentFault() && manager().getFaultState() == FaultState::kStartupCheck;
+ });
+
+ shouldBlock.store(false);
+
+ assertSoon([this] { return manager().getFaultState() == FaultState::kOk; });
+
+ resetManager(); // Before fields above go out of scope.
+ gActiveFaultDurationSecs.store(saveActiveFaultDuration);
+}
+
} // namespace
} // namespace process_health
} // namespace mongo
diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h
index 42670ee79a9..85de8a65998 100644
--- a/src/mongo/db/process_health/test_health_observer.h
+++ b/src/mongo/db/process_health/test_health_observer.h
@@ -34,6 +34,9 @@ namespace mongo {
namespace process_health {
class TestHealthObserver : public HealthObserverBase {
public:
+ TestHealthObserver(ServiceContext* svcCtx) : HealthObserverBase(svcCtx){};
+
+protected:
FaultFacetType getType() const override {
return FaultFacetType::kTestObserver;
}
@@ -42,9 +45,10 @@ public:
return Milliseconds(0);
}
- TestHealthObserver(ServiceContext* svcCtx) : HealthObserverBase(svcCtx){};
+ Milliseconds getObserverTimeout() const override {
+ return Milliseconds(Seconds(30));
+ }
-protected:
Future<HealthCheckStatus> periodicCheckImpl(
PeriodicHealthCheckContext&& periodicCheckContext) override;
};