From 4739fb232284a655f0ae89c9a43490ed2219cb5e Mon Sep 17 00:00:00 2001 From: Amirsaman Memaripour Date: Wed, 14 Oct 2020 19:35:32 +0000 Subject: SERVER-51601 Make OperationCPUTimer initializer more resilient --- src/mongo/db/operation_cpu_timer.cpp | 43 +++++++++++++++++++------ src/mongo/db/operation_cpu_timer_test.cpp | 52 +++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/mongo/db/operation_cpu_timer.cpp b/src/mongo/db/operation_cpu_timer.cpp index 7627c8af11c..34cb63eb3d4 100644 --- a/src/mongo/db/operation_cpu_timer.cpp +++ b/src/mongo/db/operation_cpu_timer.cpp @@ -27,6 +27,8 @@ * it in the license file. */ +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault + #include #include @@ -39,6 +41,7 @@ #include "mongo/base/error_codes.h" #include "mongo/db/client.h" #include "mongo/db/operation_context.h" +#include "mongo/logv2/log.h" #include "mongo/stdx/thread.h" #include "mongo/util/assert_util.h" #include "mongo/util/errno_util.h" @@ -46,11 +49,22 @@ namespace mongo { +using namespace fmt::literals; + #if defined(__linux__) namespace { -using namespace fmt::literals; +// Reads the thread timer, and throws with `InternalError` if that fails. +Nanoseconds getThreadCPUTime() { + struct timespec t; + if (auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); ret != 0) { + int ec = errno; + internalAssert(Status(ErrorCodes::InternalError, + "Unable to get time: {}"_format(errnoWithDescription(ec)))); + } + return Seconds(t.tv_sec) + Nanoseconds(t.tv_nsec); +} MONGO_FAIL_POINT_DEFINE(hangCPUTimerAfterOnThreadAttach); MONGO_FAIL_POINT_DEFINE(hangCPUTimerAfterOnThreadDetach); @@ -132,14 +146,13 @@ void PosixTimer::onThreadDetach() { hangCPUTimerAfterOnThreadDetach.pauseWhileSet(); } -Nanoseconds PosixTimer::_getThreadTime() const { - struct timespec t; - if (auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); ret != 0) { - int ec = errno; - internalAssert(Status(ErrorCodes::InternalError, - "Unable to get time: {}"_format(errnoWithDescription(ec)))); - } - return Seconds(t.tv_sec) + Nanoseconds(t.tv_nsec); +Nanoseconds PosixTimer::_getThreadTime() const try { + return getThreadCPUTime(); +} catch (const ExceptionFor& ex) { + // Abort the process as the timer cannot account for the elapsed time. This path is only + // reachable if the platform supports CPU time measurement at startup, but returns an error + // for a subsequent attempt to get thread-specific CPU consumption. + LOGV2_FATAL(4744601, "Failed to read the CPU time for the current thread", "error"_attr = ex); } static auto getCPUTimer = OperationContext::declareDecoration(); @@ -154,7 +167,17 @@ OperationCPUTimer* OperationCPUTimer::get(OperationContext* opCtx) { // of SMP systems. static bool isTimeSupported = [] { clockid_t cid; - return clock_getcpuclockid(0, &cid) == 0; + if (clock_getcpuclockid(0, &cid) != 0) + return false; + + try { + getThreadCPUTime(); + } catch (const ExceptionFor&) { + // Unable to collect the CPU time for the current thread. + return false; + } + + return true; }(); if (!isTimeSupported) diff --git a/src/mongo/db/operation_cpu_timer_test.cpp b/src/mongo/db/operation_cpu_timer_test.cpp index e498ce168b6..8d579e9937c 100644 --- a/src/mongo/db/operation_cpu_timer_test.cpp +++ b/src/mongo/db/operation_cpu_timer_test.cpp @@ -27,9 +27,14 @@ * it in the license file. */ +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kTest + #include "mongo/db/operation_cpu_timer.h" #include "mongo/db/service_context_test_fixture.h" +#include "mongo/logv2/log.h" #include "mongo/platform/atomic_word.h" +#include "mongo/stdx/condition_variable.h" +#include "mongo/stdx/mutex.h" #include "mongo/stdx/thread.h" #include "mongo/unittest/barrier.h" #include "mongo/unittest/death_test.h" @@ -159,6 +164,53 @@ DEATH_TEST_F(OperationCPUTimerTest, GetElapsedForPausedTimer, "Not attached to c timer->getElapsed(); } +TEST_F(OperationCPUTimerTest, TimerPausesOnBlockingSleep) { + // This test checks if the time measured by `OperationCPUTimer` does not include the period of + // time the operation was blocked (e.g., waiting on a condition variable). The idea is to have + // the operation block for `kSomeDelay`, ensure the elapsed time observed by the timer is always + // less than `kSomeDelay`, and repeat the test `kRepeats` times. To account for the sporadic + // wake ups, the test does not fail unless the number of failures exceeds `kMaxFailures`. This + // is just a best-effort, and the number of failures is not guaranteed to not exceed the upper + // bound (i.e., `kMaxFailures`). + const auto kSomeDelay = Milliseconds(1); + const auto kRepeats = 1000; + const auto kMaxFailureRate = 0.1; + const auto kMaxFailures = kMaxFailureRate * kRepeats; + + auto timer = getTimer(); + + auto checkTimer = [&] { + auto elapsed = timer->getElapsed(); + if (elapsed < kSomeDelay) + return true; + LOGV2_WARNING(5160101, + "Elapsed operation time exceeded the upper bound", + "elapsed"_attr = elapsed, + "delay"_attr = kSomeDelay); + return false; + }; + + auto failures = 0; + for (auto i = 0; i < kRepeats; i++) { + timer->start(); + sleepFor(kSomeDelay); + if (!checkTimer()) + failures++; + timer->stop(); + + stdx::condition_variable cv; + auto mutex = MONGO_MAKE_LATCH("TimerPausesOnBlockingSleep"); + timer->start(); + stdx::unique_lock lk(mutex); + cv.wait_for(lk, kSomeDelay.toSystemDuration(), [] { return false; }); + if (!checkTimer()) + failures++; + timer->stop(); + } + + ASSERT_LTE(failures, kMaxFailures); +} + #else TEST_F(OperationCPUTimerTest, TimerNotSetIfNotSupported) { -- cgit v1.2.1