summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKshitij Gupta <kshitij.gupta@mongodb.com>2022-02-01 21:50:12 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-03-14 14:38:57 +0000
commit1497a7d78706a502b98eeb7fa121bf8ec311029c (patch)
tree99ebc8ac808ded07340b7ea808f796626201738b
parenta3c66f9924423c48e13198ab96caf8e1d347851c (diff)
downloadmongo-1497a7d78706a502b98eeb7fa121bf8ec311029c.tar.gz
SERVER-63013: When health observer returns status failure it will never
run again. (cherry picked from commit 458895d6629e5bc700c2d6f8d84c8f91c58eab94)
-rw-r--r--jstests/sharding/health_monitor/status_failure.js70
-rw-r--r--src/mongo/db/process_health/config_server_health_observer.cpp4
-rw-r--r--src/mongo/db/process_health/health_observer_base.cpp24
-rw-r--r--src/mongo/db/process_health/health_observer_base.h5
-rw-r--r--src/mongo/db/process_health/health_observer_mock.h2
-rw-r--r--src/mongo/db/process_health/test_health_observer.cpp7
-rw-r--r--src/mongo/db/process_health/test_health_observer.h2
7 files changed, 93 insertions, 21 deletions
diff --git a/jstests/sharding/health_monitor/status_failure.js b/jstests/sharding/health_monitor/status_failure.js
new file mode 100644
index 00000000000..9d65180b2f7
--- /dev/null
+++ b/jstests/sharding/health_monitor/status_failure.js
@@ -0,0 +1,70 @@
+/**
+ * Tests behavior of fault manager when health observer returns status failure.
+ *
+ * @tags: [multiversion_incompatible]
+ */
+(function() {
+'use strict';
+
+const kWaitForCompletedChecksCount = 12;
+const kWaitForPassedChecksCount = 8;
+
+const params = {
+ setParameter: {
+ healthMonitoringIntensities: tojson({
+ values: [
+ {type: "test", intensity: "critical"},
+ {type: "ldap", intensity: "off"},
+ {type: "dns", intensity: "off"}
+ ]
+ }),
+ featureFlagHealthMonitoring: true
+ }
+};
+
+let st = new ShardingTest({
+ mongos: [params],
+ shards: 1,
+});
+
+let result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+assert.eq(result.state, "Ok");
+
+const checkServerStats = function() {
+ while (true) {
+ // Failpoint returns status failure.
+ assert.commandWorked(st.s0.adminCommand(
+ {"configureFailPoint": 'statusFailureTestHealthObserver', "mode": "alwaysOn"}));
+
+ assert.soon(() => {
+ result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
+ return result.state == "TransientFault" &&
+ result.faultInformation.facets.testObserver.description.includes(
+ "Status failure in test health observer");
+ });
+
+ assert.commandWorked(st.s0.adminCommand(
+ {"configureFailPoint": 'statusFailureTestHealthObserver', "mode": "off"}));
+
+ assert.soon(() => {
+ result =
+ assert.commandWorked(st.s0.adminCommand({serverStatus: 1, health: {details: true}}))
+ .health;
+ return result.state == "Ok";
+ });
+
+ print(`Server status: ${tojson(result)}`);
+ // Wait for: at least kWaitForPassedChecksCount checks completed.
+ if (result.testObserver.totalChecks >= kWaitForCompletedChecksCount &&
+ result.testObserver.totalChecks - result.testObserver.totalChecksWithFailure >=
+ kWaitForPassedChecksCount) {
+ break;
+ }
+ sleep(1000);
+ }
+};
+
+checkServerStats();
+
+st.stop();
+})();
diff --git a/src/mongo/db/process_health/config_server_health_observer.cpp b/src/mongo/db/process_health/config_server_health_observer.cpp
index e981e9a268b..37f1a8644d4 100644
--- a/src/mongo/db/process_health/config_server_health_observer.cpp
+++ b/src/mongo/db/process_health/config_server_health_observer.cpp
@@ -85,7 +85,7 @@ public:
* previous one is filled, thus synchronization can be relaxed.
*/
Future<HealthCheckStatus> periodicCheckImpl(
- PeriodicHealthCheckContext&& periodicCheckContext) override;
+ PeriodicHealthCheckContext&& periodicCheckContext) noexcept override;
private:
// Collects the results of one check.
@@ -144,7 +144,7 @@ ConfigServerHealthObserver::ConfigServerHealthObserver(ServiceContext* svcCtx)
: HealthObserverBase(svcCtx) {}
Future<HealthCheckStatus> ConfigServerHealthObserver::periodicCheckImpl(
- PeriodicHealthCheckContext&& periodicCheckContext) {
+ PeriodicHealthCheckContext&& periodicCheckContext) noexcept {
// The chain is not capturing 'this' for the case the network call outlives the observer.
return _checkImpl(std::move(periodicCheckContext))
.then([type = getType()](CheckResult result) mutable -> Future<HealthCheckStatus> {
diff --git a/src/mongo/db/process_health/health_observer_base.cpp b/src/mongo/db/process_health/health_observer_base.cpp
index ca032799018..fe3509835df 100644
--- a/src/mongo/db/process_health/health_observer_base.cpp
+++ b/src/mongo/db/process_health/health_observer_base.cpp
@@ -42,16 +42,14 @@ HealthObserverBase::HealthObserverBase(ServiceContext* svcCtx)
: _svcCtx(svcCtx), _rand(PseudoRandom(SecureRandom().nextInt64())) {}
SharedSemiFuture<HealthCheckStatus> HealthObserverBase::periodicCheck(
- std::shared_ptr<executor::TaskExecutor> taskExecutor, CancellationToken token) {
+ std::shared_ptr<executor::TaskExecutor> taskExecutor, CancellationToken token) noexcept {
// If we have reached here, the intensity of this health observer must not be off
{
- auto lk = stdx::lock_guard(_mutex);
- if (_currentlyRunningHealthCheck) {
- return _deadlineFuture->get();
- }
LOGV2_DEBUG(6007902, 2, "Start periodic health check", "observerType"_attr = getType());
const auto now = _svcCtx->getPreciseClockSource()->now();
+
+ auto lk = stdx::lock_guard(_mutex);
_lastTimeTheCheckWasRun = now;
_currentlyRunningHealthCheck = true;
}
@@ -60,21 +58,19 @@ SharedSemiFuture<HealthCheckStatus> HealthObserverBase::periodicCheck(
taskExecutor,
periodicCheckImpl({token, taskExecutor})
.onCompletion([this](StatusWith<HealthCheckStatus> status) {
- if (!status.isOK()) {
- return status;
- }
-
- auto healthStatus = status.getValue();
-
const auto now = _svcCtx->getPreciseClockSource()->now();
+
auto lk = stdx::lock_guard(_mutex);
++_completedChecksCount;
invariant(_currentlyRunningHealthCheck);
- if (!HealthCheckStatus::isResolved(healthStatus.getSeverity())) {
- ++_completedChecksWithFaultCount;
- }
_currentlyRunningHealthCheck = false;
_lastTimeCheckCompleted = now;
+
+ if (!status.isOK() ||
+ !HealthCheckStatus::isResolved(status.getValue().getSeverity())) {
+ ++_completedChecksWithFaultCount;
+ }
+
return status;
}),
getObserverTimeout());
diff --git a/src/mongo/db/process_health/health_observer_base.h b/src/mongo/db/process_health/health_observer_base.h
index e2d53ad68c4..18f24eb8540 100644
--- a/src/mongo/db/process_health/health_observer_base.h
+++ b/src/mongo/db/process_health/health_observer_base.h
@@ -62,7 +62,8 @@ public:
// Implements the common logic for periodic checks.
// Every observer should implement periodicCheckImpl() for specific tests.
SharedSemiFuture<HealthCheckStatus> periodicCheck(
- std::shared_ptr<executor::TaskExecutor> taskExecutor, CancellationToken token) override;
+ std::shared_ptr<executor::TaskExecutor> taskExecutor,
+ CancellationToken token) noexcept override;
HealthCheckStatus makeHealthyStatus() const;
static HealthCheckStatus makeHealthyStatusWithType(FaultFacetType type);
@@ -90,7 +91,7 @@ protected:
* @return The result of a complete health check
*/
virtual Future<HealthCheckStatus> periodicCheckImpl(
- PeriodicHealthCheckContext&& periodicCheckContext) = 0;
+ PeriodicHealthCheckContext&& periodicCheckContext) noexcept = 0;
HealthObserverLivenessStats getStatsLocked(WithLock) const;
diff --git a/src/mongo/db/process_health/health_observer_mock.h b/src/mongo/db/process_health/health_observer_mock.h
index 644cfa3c247..80051dc5eac 100644
--- a/src/mongo/db/process_health/health_observer_mock.h
+++ b/src/mongo/db/process_health/health_observer_mock.h
@@ -67,7 +67,7 @@ protected:
}
Future<HealthCheckStatus> periodicCheckImpl(
- PeriodicHealthCheckContext&& periodicCheckContext) override {
+ PeriodicHealthCheckContext&& periodicCheckContext) noexcept override {
auto completionPf = makePromiseFuture<HealthCheckStatus>();
diff --git a/src/mongo/db/process_health/test_health_observer.cpp b/src/mongo/db/process_health/test_health_observer.cpp
index 41b2087372c..ece51a11165 100644
--- a/src/mongo/db/process_health/test_health_observer.cpp
+++ b/src/mongo/db/process_health/test_health_observer.cpp
@@ -39,11 +39,16 @@ namespace process_health {
MONGO_FAIL_POINT_DEFINE(hangTestHealthObserver);
MONGO_FAIL_POINT_DEFINE(testHealthObserver);
MONGO_FAIL_POINT_DEFINE(badConfigTestHealthObserver);
+MONGO_FAIL_POINT_DEFINE(statusFailureTestHealthObserver);
Future<HealthCheckStatus> TestHealthObserver::periodicCheckImpl(
- PeriodicHealthCheckContext&& periodicCheckContext) {
+ PeriodicHealthCheckContext&& periodicCheckContext) noexcept {
LOGV2_DEBUG(5936801, 2, "Test health observer executing");
hangTestHealthObserver.pauseWhileSet();
+ if (statusFailureTestHealthObserver.shouldFail()) {
+ return Status(ErrorCodes::BadValue, "Status failure in test health observer");
+ }
+
auto result = Future<HealthCheckStatus>::makeReady(makeHealthyStatus());
testHealthObserver.executeIf(
diff --git a/src/mongo/db/process_health/test_health_observer.h b/src/mongo/db/process_health/test_health_observer.h
index 0c23df7fb42..428d57f8e9d 100644
--- a/src/mongo/db/process_health/test_health_observer.h
+++ b/src/mongo/db/process_health/test_health_observer.h
@@ -50,7 +50,7 @@ protected:
}
Future<HealthCheckStatus> periodicCheckImpl(
- PeriodicHealthCheckContext&& periodicCheckContext) override;
+ PeriodicHealthCheckContext&& periodicCheckContext) noexcept override;
bool isConfigured() const override;
};