diff options
author | Andrew Shuvalov <andrew.shuvalov@mongodb.com> | 2022-01-10 21:41:36 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-11 14:57:42 +0000 |
commit | e3c2d10228bcb64fbf0bc26e9c8468e47fa3677e (patch) | |
tree | c3dcd5486b3e535379185406b84614d0a4d659e8 | |
parent | e46858249a8449e9589bbb926ad367d092d71acc (diff) | |
download | mongo-e3c2d10228bcb64fbf0bc26e9c8468e47fa3677e.tar.gz |
SERVER-59375 SERVER-62373 additional serverStatus sections for health checks
(cherry picked from commit 386986a651b852c3c98b426ea60a023d99e4a5a4)
6 files changed, 81 insertions, 16 deletions
diff --git a/jstests/sharding/health_monitor/non_critical_facet.js b/jstests/sharding/health_monitor/non_critical_facet.js index 6bedf771242..704f924c83b 100644 --- a/jstests/sharding/health_monitor/non_critical_facet.js +++ b/jstests/sharding/health_monitor/non_critical_facet.js @@ -49,8 +49,7 @@ sleep(ACTIVE_FAULT_DURATION_SECS * 2000); // Still in transient fault. result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; assert.eq(result.state, "TransientFault"); -assert( - result.faultInformation.facets.kTestObserver.description.includes("InternalError: test msg")); +assert(result.faultInformation.facets.testObserver.description.includes("InternalError: test msg")); st.stop(); })(); diff --git a/jstests/sharding/health_monitor/server_status_health.js b/jstests/sharding/health_monitor/server_status_health.js index 6a146d87eba..09e9b74390b 100644 --- a/jstests/sharding/health_monitor/server_status_health.js +++ b/jstests/sharding/health_monitor/server_status_health.js @@ -56,15 +56,16 @@ print("---RESULT 2---"); print(tojson(result)); assert(result.enteredStateAtTime); assert(result.faultInformation); +assert(result.testObserver.intensity); const faultInformation = result.faultInformation; assert.eq(faultInformation.severity, 1); assert(faultInformation.duration); assert(faultInformation.facets); assert.eq(faultInformation.numFacets, 1); -assert(faultInformation.facets.kTestObserver); +assert(faultInformation.facets.testObserver); -const kTestObserverFacet = faultInformation.facets.kTestObserver; +const kTestObserverFacet = faultInformation.facets.testObserver; assert.eq(kTestObserverFacet.severity, faultInformation.severity); assert.eq(kTestObserverFacet.duration, faultInformation.duration); assert(kTestObserverFacet.description.includes("InternalError: test msg")); @@ -78,11 +79,22 @@ assert.soon(() => { return result.state == "Ok"; }); -result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; print("---RESULT 3---"); +result = assert.commandWorked(st.s0.adminCommand({serverStatus: 1})).health; print(tojson(result)); assert.eq(result.state, "Ok"); assert(result.enteredStateAtTime); +print("---RESULT 4 with details---"); +result = + assert.commandWorked(st.s0.adminCommand({serverStatus: 1, health: {details: true}})).health; +print(tojson(result)); +const testObserver = result.testObserver; +assert(testObserver.totalChecks); +assert(testObserver.totalChecks >= 1); +assert(testObserver.totalChecksWithFailure >= 1); +assert(testObserver.timeSinceLastCheckStartedMs >= 1); +assert(testObserver.timeSinceLastCheckCompletedMs >= 1); + st.stop(); })(); diff --git a/src/mongo/db/process_health/fault_manager.cpp b/src/mongo/db/process_health/fault_manager.cpp index a3f2cd556e7..8b1284d0886 100644 --- a/src/mongo/db/process_health/fault_manager.cpp +++ b/src/mongo/db/process_health/fault_manager.cpp @@ -675,6 +675,53 @@ HealthObserver* FaultManager::getHealthObserver(FaultFacetType type) const { return nullptr; } +void FaultManager::appendDescription(BSONObjBuilder* result, bool appendDetails) const { + static constexpr auto kDurationThreshold = Hours{24}; + const auto now = _svcCtx->getFastClockSource()->now(); + StringBuilder faultStateStr; + faultStateStr << getFaultState(); + + result->append("state", faultStateStr.str()); + result->appendDate("enteredStateAtTime", getLastTransitionTime()); + + auto fault = currentFault(); + if (fault) { + BSONObjBuilder sub_result; + fault->appendDescription(&sub_result); + result->append("faultInformation", sub_result.obj()); + } + + auto allObservers = getHealthObservers(); + for (auto observer : allObservers) { + if (!appendDetails && !_config->isHealthObserverEnabled(observer->getType())) { + continue; + } + BSONObjBuilder sub_result; + sub_result.append("intensity", + HealthObserverIntensity_serializer( + _config->getHealthObserverIntensity(observer->getType()))); + + HealthObserverLivenessStats stats = observer->getStats(); + sub_result.append("totalChecks", stats.completedChecksCount); + if (appendDetails) { + sub_result.append("totalChecksWithFailure", stats.completedChecksWithFaultCount); + if (now - stats.lastTimeCheckStarted < kDurationThreshold) { + sub_result.append("timeSinceLastCheckStartedMs", + durationCount<Milliseconds>(now - stats.lastTimeCheckStarted)); + sub_result.append("timeSinceLastCheckCompletedMs", + durationCount<Milliseconds>(now - stats.lastTimeCheckCompleted)); + } + } + // Report how long the current check is running, if it's longer than 10% of deadline. + if (stats.currentlyRunningHealthCheck && + now - stats.lastTimeCheckStarted > getConfig().getPeriodicLivenessDeadline() / 10) { + sub_result.append("runningCheckForMs", + durationCount<Milliseconds>(now - stats.lastTimeCheckStarted)); + } + result->append(FaultFacetType_serializer(observer->getType()), sub_result.obj()); + } +} + void FaultManager::progressMonitorCheckForTests(std::function<void(std::string cause)> crashCb) { _progressMonitor->progressMonitorCheck(crashCb); } diff --git a/src/mongo/db/process_health/fault_manager.h b/src/mongo/db/process_health/fault_manager.h index a9f8969b66b..7cd13fa842c 100644 --- a/src/mongo/db/process_health/fault_manager.h +++ b/src/mongo/db/process_health/fault_manager.h @@ -120,6 +120,14 @@ public: // Gets the timestamp of the last transition Date_t getLastTransitionTime() const; + /** + * Generate the `serverStatus` section for the fault manager. + * @param appendDetails is true when the section is generated with: + * health: {details: true} + * thus it is ok to add any verbose information here. + */ + void appendDescription(BSONObjBuilder* builder, bool appendDetails) const; + protected: // Returns all health observers not configured as Off std::vector<HealthObserver*> getActiveHealthObservers() const; diff --git a/src/mongo/db/process_health/fault_manager_config.h b/src/mongo/db/process_health/fault_manager_config.h index f6d47bdbf24..4905061fa21 100644 --- a/src/mongo/db/process_health/fault_manager_config.h +++ b/src/mongo/db/process_health/fault_manager_config.h @@ -63,7 +63,7 @@ std::ostream& operator<<(std::ostream& os, const FaultState& state); */ enum class FaultFacetType { kSystem, kMock1, kMock2, kTestObserver, kLdap, kDns }; static const StringData FaultFacetTypeStrings[] = { - "kSystem", "kMock1", "kMock2", "kTestObserver", "kLdap", "kDns"}; + "systemObserver", "mock1", "mock2", "testObserver", "LDAP", "DNS"}; FaultFacetType toFaultFacetType(HealthObserverTypeEnum type); diff --git a/src/mongo/db/process_health/health_monitoring_server_status_section.cpp b/src/mongo/db/process_health/health_monitoring_server_status_section.cpp index 86002a7bdf3..e527aa2ebd8 100644 --- a/src/mongo/db/process_health/health_monitoring_server_status_section.cpp +++ b/src/mongo/db/process_health/health_monitoring_server_status_section.cpp @@ -46,20 +46,19 @@ public: BSONObj generateSection(OperationContext* opCtx, const BSONElement& configElement) const override { auto* fault_manager = process_health::FaultManager::get(getGlobalServiceContext()); - BSONObjBuilder result; - StringBuilder os; - os << fault_manager->getFaultState(); + if (!fault_manager) { + return BSONObj(); + } - result.append("state", os.str()); - result.appendDate("enteredStateAtTime", fault_manager->getLastTransitionTime()); + BSONObjBuilder result; - auto fault = fault_manager->currentFault(); - if (fault) { - BSONObjBuilder sub_result; - fault->appendDescription(&sub_result); - result.append("faultInformation", sub_result.obj()); + bool appendDetails = false; + if (configElement.type() == BSONType::Object && configElement.Obj().hasElement("details")) { + appendDetails = configElement.Obj()["details"].trueValue(); } + fault_manager->appendDescription(&result, appendDetails); + return result.obj(); } |