summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andrew Shuvalov <andrew.shuvalov@mongodb.com> 2022-01-10 21:41:36 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-01-11 14:57:42 +0000
commit: e3c2d10228bcb64fbf0bc26e9c8468e47fa3677e (patch)
tree: c3dcd5486b3e535379185406b84614d0a4d659e8
parent: e46858249a8449e9589bbb926ad367d092d71acc (diff)
download: mongo-e3c2d10228bcb64fbf0bc26e9c8468e47fa3677e.tar.gz
SERVER-59375 SERVER-62373 additional serverStatus sections for health checks
(cherry picked from commit 386986a651b852c3c98b426ea60a023d99e4a5a4)
-rw-r--r-- jstests/sharding/health_monitor/non_critical_facet.js | 3
-rw-r--r-- jstests/sharding/health_monitor/server_status_health.js | 18
-rw-r--r-- src/mongo/db/process_health/fault_manager.cpp | 47
-rw-r--r-- src/mongo/db/process_health/fault_manager.h | 8
-rw-r--r-- src/mongo/db/process_health/fault_manager_config.h | 2
-rw-r--r-- src/mongo/db/process_health/health_monitoring_server_status_section.cpp | 19
6 files changed, 81 insertions, 16 deletions
diff --git a/jstests/sharding/health_monitor/non_critical_facet.js b/jstests/sharding/health_monitor/non_critical_facet.js
index 6bedf771242..704f924c83b 100644
--- a/jstests/sharding/health_monitor/non_critical_facet.js
+++ b/jstests/sharding/health_monitor/non_critical_facet.js
@@ -49,8 +49,7 @@ sleep(ACTIVE_FAULT_DURATION_SECS * 2000);
// Still in transient fault.
result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
assert.eq(result.state, "TransientFault");
-assert(
- result.faultInformation.facets.kTestObserver.description.includes("InternalError: test msg"));
+assert(result.faultInformation.facets.testObserver.description.includes("InternalError: test msg"));
st.stop();
})();
diff --git a/jstests/sharding/health_monitor/server_status_health.js b/jstests/sharding/health_monitor/server_status_health.js
index 6a146d87eba..09e9b74390b 100644
--- a/jstests/sharding/health_monitor/server_status_health.js
+++ b/jstests/sharding/health_monitor/server_status_health.js
@@ -56,15 +56,16 @@ print("---RESULT 2---");
print(tojson(result));
assert(result.enteredStateAtTime);
assert(result.faultInformation);
+assert(result.testObserver.intensity);
const faultInformation = result.faultInformation;
assert.eq(faultInformation.severity, 1);
assert(faultInformation.duration);
assert(faultInformation.facets);
assert.eq(faultInformation.numFacets, 1);
-assert(faultInformation.facets.kTestObserver);
+assert(faultInformation.facets.testObserver);
-const kTestObserverFacet = faultInformation.facets.kTestObserver;
+const kTestObserverFacet = faultInformation.facets.testObserver;
assert.eq(kTestObserverFacet.severity, faultInformation.severity);
assert.eq(kTestObserverFacet.duration, faultInformation.duration);
assert(kTestObserverFacet.description.includes("InternalError: test msg"));
@@ -78,11 +79,22 @@ assert.soon(() => {
return result.state == "Ok";
});
-result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
print("---RESULT 3---");
+result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health;
print(tojson(result));
assert.eq(result.state, "Ok");
assert(result.enteredStateAtTime);
+print("---RESULT 4 with details---");
+result =
+ assert.commandWorked(st.s0.adminCommand({serverStatus: 1, health: {details: true}})).health;
+print(tojson(result));
+const testObserver = result.testObserver;
+assert(testObserver.totalChecks);
+assert(testObserver.totalChecks >= 1);
+assert(testObserver.totalChecksWithFailure >= 1);
+assert(testObserver.timeSinceLastCheckStartedMs >= 1);
+assert(testObserver.timeSinceLastCheckCompletedMs >= 1);
+
st.stop();
})();
diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp
index a3f2cd556e7..8b1284d0886 100644
--- a/src/mongo/db/process_health/fault_manager.cpp
+++ b/src/mongo/db/process_health/fault_manager.cpp
@@ -675,6 +675,53 @@ HealthObserver* FaultManager::getHealthObserver(FaultFacetType type) const {
return nullptr;
}
+void FaultManager::appendDescription(BSONObjBuilder* result, bool appendDetails) const {  // Writes the fault manager's serverStatus section into `result`.
+    static constexpr auto kDurationThreshold = Hours{24};  // A last check start this old (or older) suppresses the timeSince* fields below.
+    const auto now = _svcCtx->getFastClockSource()->now();  // One timestamp shared by every duration computed below.
+    StringBuilder faultStateStr;
+    faultStateStr << getFaultState();
+
+    result->append("state", faultStateStr.str());
+    result->appendDate("enteredStateAtTime", getLastTransitionTime());
+
+    auto fault = currentFault();
+    if (fault) {  // "faultInformation" is emitted only when currentFault() returns a non-empty value.
+        BSONObjBuilder sub_result;
+        fault->appendDescription(&sub_result);
+        result->append("faultInformation", sub_result.obj());
+    }
+
+    auto allObservers = getHealthObservers();
+    for (auto observer : allObservers) {  // One sub-document is appended per reported observer.
+        if (!appendDetails && !_config->isHealthObserverEnabled(observer->getType())) {  // Not-enabled observers are listed only when details are requested.
+            continue;
+        }
+        BSONObjBuilder sub_result;
+        sub_result.append("intensity",
+                          HealthObserverIntensity_serializer(
+                              _config->getHealthObserverIntensity(observer->getType())));
+
+        HealthObserverLivenessStats stats = observer->getStats();
+        sub_result.append("totalChecks", stats.completedChecksCount);  // Reported with or without details.
+        if (appendDetails) {
+            sub_result.append("totalChecksWithFailure", stats.completedChecksWithFaultCount);
+            if (now - stats.lastTimeCheckStarted < kDurationThreshold) {  // NOTE(review): only the start time is range-checked; confirm lastTimeCheckCompleted is always valid here.
+                sub_result.append("timeSinceLastCheckStartedMs",
+                                  durationCount<Milliseconds>(now - stats.lastTimeCheckStarted));
+                sub_result.append("timeSinceLastCheckCompletedMs",
+                                  durationCount<Milliseconds>(now - stats.lastTimeCheckCompleted));
+            }
+        }
+        // Independently of appendDetails: report how long the current check has been running once it exceeds 10% of the deadline.
+        if (stats.currentlyRunningHealthCheck &&
+            now - stats.lastTimeCheckStarted > getConfig().getPeriodicLivenessDeadline() / 10) {  // NOTE(review): getConfig() here vs. _config above; presumably the same config, confirm.
+            sub_result.append("runningCheckForMs",
+                              durationCount<Milliseconds>(now - stats.lastTimeCheckStarted));
+        }
+        result->append(FaultFacetType_serializer(observer->getType()), sub_result.obj());  // Field name is the serialized facet type.
+    }
+}
+
void FaultManager::progressMonitorCheckForTests(std::function<void(std::string cause)> crashCb) {
_progressMonitor->progressMonitorCheck(crashCb);
}
diff --git a/src/mongo/db/process_health/fault_manager.h b/src/mongo/db/process_health/fault_manager.h
index a9f8969b66b..7cd13fa842c 100644
--- a/src/mongo/db/process_health/fault_manager.h
+++ b/src/mongo/db/process_health/fault_manager.h
@@ -120,6 +120,14 @@ public:
// Gets the timestamp of the last transition
Date_t getLastTransitionTime() const;
+ /**
+ * Generate the `serverStatus` section for the fault manager.
+ * @param appendDetails is true when the section is generated with:
+ * health: {details: true}
+ * thus it is ok to add any verbose information here.
+ */
+ void appendDescription(BSONObjBuilder* builder, bool appendDetails) const;
+
protected:
// Returns all health observers not configured as Off
std::vector<HealthObserver*> getActiveHealthObservers() const;
diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h
index f6d47bdbf24..4905061fa21 100644
--- a/src/mongo/db/process_health/fault_manager_config.h
+++ b/src/mongo/db/process_health/fault_manager_config.h
@@ -63,7 +63,7 @@ std::ostream& operator<<(std::ostream& os, const FaultState& state);
*/
enum class FaultFacetType { kSystem, kMock1, kMock2, kTestObserver, kLdap, kDns };
static const StringData FaultFacetTypeStrings[] = {
- "kSystem", "kMock1", "kMock2", "kTestObserver", "kLdap", "kDns"};
+ "systemObserver", "mock1", "mock2", "testObserver", "LDAP", "DNS"};
FaultFacetType toFaultFacetType(HealthObserverTypeEnum type);
diff --git a/src/mongo/db/process_health/health_monitoring_server_status_section.cpp b/src/mongo/db/process_health/health_monitoring_server_status_section.cpp
index 86002a7bdf3..e527aa2ebd8 100644
--- a/src/mongo/db/process_health/health_monitoring_server_status_section.cpp
+++ b/src/mongo/db/process_health/health_monitoring_server_status_section.cpp
@@ -46,20 +46,19 @@ public:
BSONObj generateSection(OperationContext* opCtx,
const BSONElement& configElement) const override {
auto* fault_manager = process_health::FaultManager::get(getGlobalServiceContext());
- BSONObjBuilder result;
- StringBuilder os;
- os << fault_manager->getFaultState();
+ if (!fault_manager) {
+ return BSONObj();
+ }
- result.append("state", os.str());
- result.appendDate("enteredStateAtTime", fault_manager->getLastTransitionTime());
+ BSONObjBuilder result;
- auto fault = fault_manager->currentFault();
- if (fault) {
- BSONObjBuilder sub_result;
- fault->appendDescription(&sub_result);
- result.append("faultInformation", sub_result.obj());
+ bool appendDetails = false;
+ if (configElement.type() == BSONType::Object && configElement.Obj().hasElement("details")) {
+ appendDetails = configElement.Obj()["details"].trueValue();
}
+ fault_manager->appendDescription(&result, appendDetails);
+
return result.obj();
}