summaryrefslogtreecommitdiff
path: root/src/mongo/watchdog
diff options
context:
space:
mode:
authorMark Benvenuto <mark.benvenuto@mongodb.com>2019-05-13 13:52:25 -0400
committerMark Benvenuto <mark.benvenuto@mongodb.com>2019-05-13 13:57:28 -0400
commit221457261a6c6bf3d4860e39dd40828db176b939 (patch)
tree3e0d5b2ccbfdc9d6c953dd3125fedc592f62e1ac /src/mongo/watchdog
parent9a7ec37bdb20b042c682e466b33f9ed90a5b4cc2 (diff)
downloadmongo-221457261a6c6bf3d4860e39dd40828db176b939.tar.gz
Revert "SERVER-41023 Move Storage Node Watchdog to community"
This reverts commit 7dd12f7cd34f1d2ab650afc17acb52d8eaea8cf1.
Diffstat (limited to 'src/mongo/watchdog')
-rw-r--r--src/mongo/watchdog/SConscript43
-rw-r--r--src/mongo/watchdog/watchdog.cpp537
-rw-r--r--src/mongo/watchdog/watchdog.h385
-rw-r--r--src/mongo/watchdog/watchdog_mongod.cpp208
-rw-r--r--src/mongo/watchdog/watchdog_mongod.h52
-rw-r--r--src/mongo/watchdog/watchdog_mongod.idl43
-rw-r--r--src/mongo/watchdog/watchdog_test.cpp480
7 files changed, 0 insertions, 1748 deletions
diff --git a/src/mongo/watchdog/SConscript b/src/mongo/watchdog/SConscript
deleted file mode 100644
index bec66a51470..00000000000
--- a/src/mongo/watchdog/SConscript
+++ /dev/null
@@ -1,43 +0,0 @@
-# -*- mode: python -*-
-
-Import('env')
-
-env.Library(
- target='watchdog',
- source=[
- 'watchdog.cpp',
- ],
- LIBDEPS=[
- '$BUILD_DIR/mongo/base',
- '$BUILD_DIR/mongo/db/service_context',
- '$BUILD_DIR/mongo/db/storage/storage_options',
- ]
-)
-
-env.Library(
- target='watchdog_mongod',
- source=[
- 'watchdog_mongod.cpp',
- env.Idlc('watchdog_mongod.idl')[0],
- ],
- LIBDEPS=[
- '$BUILD_DIR/mongo/db/commands/server_status',
- 'watchdog',
- ],
- LIBDEPS_PRIVATE=[
- '$BUILD_DIR/mongo/db/server_options_core',
- '$BUILD_DIR/mongo/idl/server_parameter',
- ],
-)
-
-env.CppUnitTest(
- target='watchdog_test',
- source=[
- 'watchdog_test.cpp',
- ],
- LIBDEPS=[
- '$BUILD_DIR/mongo/db/service_context_test_fixture',
- '$BUILD_DIR/mongo/util/clock_source_mock',
- 'watchdog',
- ],
-)
diff --git a/src/mongo/watchdog/watchdog.cpp b/src/mongo/watchdog/watchdog.cpp
deleted file mode 100644
index ea1d53b2219..00000000000
--- a/src/mongo/watchdog/watchdog.cpp
+++ /dev/null
@@ -1,537 +0,0 @@
-/**
- * Copyright (C) 2019-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kControl
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/watchdog/watchdog.h"
-
-#include <boost/filesystem.hpp>
-
-#ifndef _WIN32
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#endif
-
-#include "mongo/base/static_assert.h"
-#include "mongo/db/client.h"
-#include "mongo/db/operation_context.h"
-#include "mongo/platform/process_id.h"
-#include "mongo/util/concurrency/idle_thread_block.h"
-#include "mongo/util/hex.h"
-#include "mongo/util/log.h"
-#include "mongo/util/timer.h"
-
-
-namespace mongo {
-
-WatchdogPeriodicThread::WatchdogPeriodicThread(Milliseconds period, StringData threadName)
- : _period(period), _enabled(true), _threadName(threadName.toString()) {}
-
-void WatchdogPeriodicThread::start() {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- invariant(_state == State::kNotStarted);
- _state = State::kStarted;
-
- // Start the thread.
- _thread = stdx::thread([this] { this->doLoop(); });
- }
-}
-
-void WatchdogPeriodicThread::shutdown() {
-
- stdx::thread thread;
-
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- bool started = (_state == State::kStarted);
-
- invariant(_state == State::kNotStarted || _state == State::kStarted);
-
- if (!started) {
- _state = State::kDone;
- return;
- }
-
- _state = State::kShutdownRequested;
-
- std::swap(thread, _thread);
-
- // Wake up the thread if sleeping so that it will check if we are done.
- _condvar.notify_one();
- }
-
- thread.join();
-
- _state = State::kDone;
-}
-
-void WatchdogPeriodicThread::setPeriod(Milliseconds period) {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- bool wasEnabled = _enabled;
-
- if (period < Milliseconds::zero()) {
- _enabled = false;
-
- // Leave the thread running but very slowly. If we set this value too high, it would
- // overflow Duration.
- _period = Hours(1);
- } else {
- _period = period;
- _enabled = true;
- }
-
- if (!wasEnabled && _enabled) {
- resetState();
- }
-
- _condvar.notify_one();
-}
-
-void WatchdogPeriodicThread::doLoop() {
- Client::initThread(_threadName);
- Client* client = &cc();
-
- auto preciseClockSource = client->getServiceContext()->getPreciseClockSource();
-
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- // Ensure state is starting from a clean slate.
- resetState();
- }
-
- while (true) {
- // Wait for the next run or signal to shutdown.
-
- auto opCtx = client->makeOperationContext();
-
- Date_t startTime = preciseClockSource->now();
-
- {
- stdx::unique_lock<stdx::mutex> lock(_mutex);
- MONGO_IDLE_THREAD_BLOCK;
-
-
- // Check if the period is different?
- // We are signalled on period changes at which point we may be done waiting or need to
- // wait longer.
- while (startTime + _period > preciseClockSource->now() &&
- _state != State::kShutdownRequested) {
- auto s = opCtx->waitForConditionOrInterruptNoAssertUntil(
- _condvar, lock, startTime + _period);
-
- if (!s.isOK()) {
- // The only bad status is when we are in shutdown
- if (!opCtx->getServiceContext()->getKillAllOperations()) {
- error() << "Watchdog was interrupted, shuting down:, reason: "
- << s.getStatus();
- }
-
- return;
- }
- }
-
- // Are we done running?
- if (_state == State::kShutdownRequested) {
- return;
- }
-
- // Check if the watchdog checks have been disabled
- if (!_enabled) {
- continue;
- }
- }
-
- run(opCtx.get());
- }
-}
-
-
-WatchdogCheckThread::WatchdogCheckThread(std::vector<std::unique_ptr<WatchdogCheck>> checks,
- Milliseconds period)
- : WatchdogPeriodicThread(period, "watchdogCheck"), _checks(std::move(checks)) {}
-
-std::int64_t WatchdogCheckThread::getGeneration() {
- return _checkGeneration.load();
-}
-
-void WatchdogCheckThread::resetState() {}
-
-void WatchdogCheckThread::run(OperationContext* opCtx) {
- for (auto& check : _checks) {
- Timer timer(opCtx->getServiceContext()->getTickSource());
-
- check->run(opCtx);
- Microseconds micros = timer.elapsed();
-
- LOG(1) << "Watchdog test '" << check->getDescriptionForLogging() << "' took "
- << duration_cast<Milliseconds>(micros);
-
- // We completed a check, bump the generation counter.
- _checkGeneration.fetchAndAdd(1);
- }
-}
-
-
-WatchdogMonitorThread::WatchdogMonitorThread(WatchdogCheckThread* checkThread,
- WatchdogDeathCallback callback,
- Milliseconds interval)
- : WatchdogPeriodicThread(interval, "watchdogMonitor"),
- _callback(callback),
- _checkThread(checkThread) {}
-
-std::int64_t WatchdogMonitorThread::getGeneration() {
- return _monitorGeneration.load();
-}
-
-void WatchdogMonitorThread::resetState() {
- // Reset the generation so that if the monitor thread is run before the check thread
- // after being enabled, it does not.
- _lastSeenGeneration = -1;
-}
-
-void WatchdogMonitorThread::run(OperationContext* opCtx) {
- auto currentGeneration = _checkThread->getGeneration();
-
- if (currentGeneration != _lastSeenGeneration) {
- _lastSeenGeneration = currentGeneration;
- } else {
- _callback();
- }
-}
-
-
-WatchdogMonitor::WatchdogMonitor(std::vector<std::unique_ptr<WatchdogCheck>> checks,
- Milliseconds checkPeriod,
- Milliseconds monitorPeriod,
- WatchdogDeathCallback callback)
- : _checkPeriod(checkPeriod),
- _watchdogCheckThread(std::move(checks), checkPeriod),
- _watchdogMonitorThread(&_watchdogCheckThread, callback, monitorPeriod) {
- invariant(checkPeriod < monitorPeriod);
-}
-
-void WatchdogMonitor::start() {
- log() << "Starting Watchdog Monitor";
-
- // Start the threads.
- _watchdogCheckThread.start();
-
- _watchdogMonitorThread.start();
-
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- invariant(_state == State::kNotStarted);
- _state = State::kStarted;
- }
-}
-
-void WatchdogMonitor::setPeriod(Milliseconds duration) {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- if (duration > Milliseconds(0)) {
- dassert(duration >= Milliseconds(1));
-
- // Make sure that we monitor runs more frequently then checks
- // 2 feels like an arbitrary good minimum.
- invariant(duration >= 2 * _checkPeriod);
-
- _watchdogCheckThread.setPeriod(_checkPeriod);
- _watchdogMonitorThread.setPeriod(duration);
-
- log() << "WatchdogMonitor period changed to " << duration_cast<Seconds>(duration);
- } else {
- _watchdogMonitorThread.setPeriod(duration);
- _watchdogCheckThread.setPeriod(duration);
-
- log() << "WatchdogMonitor disabled";
- }
- }
-}
-
-void WatchdogMonitor::shutdown() {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- bool started = (_state == State::kStarted);
-
- invariant(_state == State::kNotStarted || _state == State::kStarted);
-
- if (!started) {
- _state = State::kDone;
- return;
- }
-
- _state = State::kShutdownRequested;
- }
-
- _watchdogMonitorThread.shutdown();
-
- _watchdogCheckThread.shutdown();
-
- _state = State::kDone;
-}
-
-std::int64_t WatchdogMonitor::getCheckGeneration() {
- return _watchdogCheckThread.getGeneration();
-}
-
-std::int64_t WatchdogMonitor::getMonitorGeneration() {
- return _watchdogMonitorThread.getGeneration();
-}
-
-#ifdef _WIN32
-/**
- * Check a directory is ok
- * 1. Open up a direct_io to a new file
- * 2. Write to the file
- * 3. Seek to the beginning
- * 4. Read from the file
- * 5. Close file
- */
-void checkFile(OperationContext* opCtx, const boost::filesystem::path& file) {
- Date_t now = opCtx->getServiceContext()->getPreciseClockSource()->now();
- std::string nowStr = now.toString();
-
- HANDLE hFile = CreateFileW(file.generic_wstring().c_str(),
- GENERIC_READ | GENERIC_WRITE,
- 0, // No Sharing
- NULL,
- CREATE_ALWAYS,
- FILE_ATTRIBUTE_NORMAL,
- NULL);
- if (hFile == INVALID_HANDLE_VALUE) {
- std::uint32_t gle = ::GetLastError();
- severe() << "CreateFile failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(gle);
- fassertNoTrace(4074, gle == 0);
- }
-
- DWORD bytesWrittenTotal;
- if (!WriteFile(hFile, nowStr.c_str(), nowStr.size(), &bytesWrittenTotal, NULL)) {
- std::uint32_t gle = ::GetLastError();
- severe() << "WriteFile failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(gle);
- fassertNoTrace(4075, gle == 0);
- }
-
- if (bytesWrittenTotal != nowStr.size()) {
- warning() << "partial write for '" << file.generic_string() << "' expected "
- << nowStr.size() << " bytes but wrote " << bytesWrittenTotal << " bytes";
- } else {
-
- if (!FlushFileBuffers(hFile)) {
- std::uint32_t gle = ::GetLastError();
- severe() << "FlushFileBuffers failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(gle);
- fassertNoTrace(4076, gle == 0);
- }
-
- DWORD newOffset = SetFilePointer(hFile, 0, 0, FILE_BEGIN);
- if (newOffset != 0) {
- std::uint32_t gle = ::GetLastError();
- severe() << "SetFilePointer failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(gle);
- fassertNoTrace(4077, gle == 0);
- }
-
- DWORD bytesRead;
- auto readBuffer = stdx::make_unique<char[]>(nowStr.size());
- if (!ReadFile(hFile, readBuffer.get(), nowStr.size(), &bytesRead, NULL)) {
- std::uint32_t gle = ::GetLastError();
- severe() << "ReadFile failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(gle);
- fassertNoTrace(4078, gle == 0);
- }
-
- if (bytesRead != bytesWrittenTotal) {
- severe() << "Read wrong number of bytes for '" << file.generic_string() << "' expected "
- << bytesWrittenTotal << " bytes but read " << bytesRead << " bytes";
- fassertNoTrace(50724, false);
- }
-
- if (memcmp(nowStr.c_str(), readBuffer.get(), nowStr.size()) != 0) {
- severe() << "Read wrong string from file '" << file.generic_string() << nowStr.size()
- << " bytes (in hex) '" << toHexLower(nowStr.c_str(), nowStr.size())
- << "' but read bytes '" << toHexLower(readBuffer.get(), bytesRead) << "'";
- fassertNoTrace(50717, false);
- }
- }
-
- if (!CloseHandle(hFile)) {
- std::uint32_t gle = ::GetLastError();
- severe() << "CloseHandle failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(gle);
- fassertNoTrace(4079, gle == 0);
- }
-}
-
-void watchdogTerminate() {
- ::TerminateProcess(::GetCurrentProcess(), ExitCode::EXIT_WATCHDOG);
-}
-
-#else
-
-/**
- * Check a directory is ok
- * 1. Open up a direct_io to a new file
- * 2. Write to the file
- * 3. Read from the file
- * 4. Close file
- */
-void checkFile(OperationContext* opCtx, const boost::filesystem::path& file) {
- Date_t now = opCtx->getServiceContext()->getPreciseClockSource()->now();
- std::string nowStr = now.toString();
-
- int fd = open(file.generic_string().c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
- if (fd == -1) {
- auto err = errno;
- severe() << "open failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(err);
- fassertNoTrace(4080, err == 0);
- }
-
- size_t bytesWrittenTotal = 0;
- while (bytesWrittenTotal < nowStr.size()) {
- ssize_t bytesWrittenInWrite =
- write(fd, nowStr.c_str() + bytesWrittenTotal, nowStr.size() - bytesWrittenTotal);
- if (bytesWrittenInWrite == -1) {
- auto err = errno;
- if (err == EINTR) {
- continue;
- }
-
- severe() << "write failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(err);
- fassertNoTrace(4081, err == 0);
- }
-
- // Warn if the write was incomplete
- if (bytesWrittenTotal == 0 && static_cast<size_t>(bytesWrittenInWrite) != nowStr.size()) {
- warning() << "parital write for '" << file.generic_string() << "' expected "
- << nowStr.size() << " bytes but wrote " << bytesWrittenInWrite << " bytes";
- }
-
- bytesWrittenTotal += bytesWrittenInWrite;
- }
-
- if (fsync(fd)) {
- auto err = errno;
- severe() << "fsync failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(err);
- fassertNoTrace(4082, err == 0);
- }
-
- auto readBuffer = stdx::make_unique<char[]>(nowStr.size());
- size_t bytesReadTotal = 0;
- while (bytesReadTotal < nowStr.size()) {
- ssize_t bytesReadInRead = pread(
- fd, readBuffer.get() + bytesReadTotal, nowStr.size() - bytesReadTotal, bytesReadTotal);
- if (bytesReadInRead == -1) {
- auto err = errno;
- if (err == EINTR) {
- continue;
- }
-
- severe() << "read failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(err);
- fassertNoTrace(4083, err == 0);
- } else if (bytesReadInRead == 0) {
- severe() << "read failed for '" << file.generic_string()
- << "' with unexpected end of file";
- fassertNoTrace(50719, false);
- }
-
- // Warn if the read was incomplete
- if (bytesReadTotal == 0 && static_cast<size_t>(bytesReadInRead) != nowStr.size()) {
- warning() << "partial read for '" << file.generic_string() << "' expected "
- << nowStr.size() << " bytes but read " << bytesReadInRead << " bytes";
- }
-
- bytesReadTotal += bytesReadInRead;
- }
-
- if (memcmp(nowStr.c_str(), readBuffer.get(), nowStr.size()) != 0) {
- severe() << "Read wrong string from file '" << file.generic_string() << "' expected "
- << nowStr.size() << " bytes (in hex) '"
- << toHexLower(nowStr.c_str(), nowStr.size()) << "' but read bytes '"
- << toHexLower(readBuffer.get(), bytesReadTotal) << "'";
- fassertNoTrace(50718, false);
- }
-
- if (close(fd)) {
- auto err = errno;
- severe() << "close failed for '" << file.generic_string()
- << "' with error: " << errnoWithDescription(err);
- fassertNoTrace(4084, err == 0);
- }
-}
-
-void watchdogTerminate() {
- // This calls the exit_group syscall on Linux
- ::_exit(ExitCode::EXIT_WATCHDOG);
-}
-#endif
-
-constexpr StringData DirectoryCheck::kProbeFileName;
-constexpr StringData DirectoryCheck::kProbeFileNameExt;
-
-void DirectoryCheck::run(OperationContext* opCtx) {
- // Ensure we have unique file names if multiple processes share the same logging directory
- boost::filesystem::path file = _directory;
- file /= kProbeFileName.toString();
- file += ProcessId::getCurrent().toString();
- file += kProbeFileNameExt.toString();
-
- checkFile(opCtx, file);
-
- // Try to delete the file so it is not leaked on restart, but ignore errors
- boost::system::error_code ec;
- boost::filesystem::remove(file, ec);
- if (ec) {
- warning() << "Failed to delete file '" << file.generic_string()
- << "', error: " << ec.message();
- }
-}
-
-std::string DirectoryCheck::getDescriptionForLogging() {
- return str::stream() << "checked directory '" << _directory.generic_string() << "'";
-}
-
-} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog.h b/src/mongo/watchdog/watchdog.h
deleted file mode 100644
index fe0060e3534..00000000000
--- a/src/mongo/watchdog/watchdog.h
+++ /dev/null
@@ -1,385 +0,0 @@
-/**
- * Copyright (C) 2019-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#pragma once
-
-#include <boost/filesystem/path.hpp>
-#include <string>
-#include <vector>
-
-#include "mongo/platform/atomic_word.h"
-#include "mongo/stdx/condition_variable.h"
-#include "mongo/stdx/functional.h"
-#include "mongo/stdx/mutex.h"
-#include "mongo/stdx/thread.h"
-#include "mongo/util/duration.h"
-
-namespace mongo {
-
-class OperationContext;
-
-/**
- * WatchdogDeathCallback is used by the watchdog component to terminate the process. It is expected
- * to bypass MongoDB's normal shutdown process. It should not make any syscalls other then to
- * exit/terminate the process.
- *
- * It is pluggable for testing purposes.
- */
-using WatchdogDeathCallback = stdx::function<void(void)>;
-
-/**
- * The OS specific implementation of WatchdogDeathCallback that kills the process.
- */
-void watchdogTerminate();
-
-/**
- * WatchdogCheck represents a health check that the watchdog will run periodically to ensure the
- * machine, and process are healthy.
- *
- * It is pluggable for testing purposes.
- */
-class WatchdogCheck {
-public:
- virtual ~WatchdogCheck() = default;
-
- /**
- * Runs a health check against the local machine.
- *
- * Note: It should throw exceptions on unexpected errors. Exceptions will result in a call to
- * WatchdogDeathCallback.
- */
- virtual void run(OperationContext* opCtx) = 0;
-
- /**
- * Returns a description for the watchdog check to log to the log file.
- */
- virtual std::string getDescriptionForLogging() = 0;
-};
-
-/**
- * Do a health check for a given directory. This health check is done by reading, and writing to a
- * file with direct I/O.
- */
-class DirectoryCheck : public WatchdogCheck {
-public:
- static constexpr StringData kProbeFileName = "watchdog_probe_"_sd;
- static constexpr StringData kProbeFileNameExt = ".txt"_sd;
-
-public:
- DirectoryCheck(const boost::filesystem::path& directory) : _directory(directory) {}
-
- void run(OperationContext* opCtx) final;
-
- std::string getDescriptionForLogging() final;
-
-private:
- boost::filesystem::path _directory;
-};
-
-/**
- * Runs a callback on a periodic basis. The specified time period is the time delay between
- * invocations.
- *
- * Example:
- * - callback
- * - sleep(period)
- * - callback
- */
-class WatchdogPeriodicThread {
-public:
- WatchdogPeriodicThread(Milliseconds period, StringData threadName);
- virtual ~WatchdogPeriodicThread() = default;
-
- /**
- * Starts the periodic thread.
- */
- void start();
-
- /**
- * Updates the period the thread runs its task.
- *
- * Period changes take affect immediately.
- */
- void setPeriod(Milliseconds period);
-
- /**
- * Shutdown the periodic thread. After it is shutdown, it cannot be started.
- */
- void shutdown();
-
-protected:
- /**
- * Do one iteration of work.
- */
- virtual void run(OperationContext* opCtx) = 0;
-
- /**
- * Provides an opportunity for derived classes to initialize state.
- *
- * This method is called at two different times:
- * 1. First time a thread is started.
- * 2. When a thread goes from disabled to enabled. Specifically, a user calls setPeriod(-1)
- * followed by setPeriod(> 0).
- *
- */
- virtual void resetState() = 0;
-
-private:
- /**
- * Main thread loop
- */
- void doLoop();
-
-private:
- /**
- * Private enum to track state.
- *
- * +----------------------------------------------------------------+
- * | v
- * +-------------+ +----------+ +--------------------+ +-------+
- * | kNotStarted | --> | kStarted | --> | kShutdownRequested | --> | kDone |
- * +-------------+ +----------+ +--------------------+ +-------+
- */
- enum class State {
- /**
- * Initial state. Either start() or shutdown() can be called next.
- */
- kNotStarted,
-
- /**
- * start() has been called. shutdown() should be called next.
- */
- kStarted,
-
- /**
- * shutdown() has been called, and the thread is in progress of shutting down.
- */
- kShutdownRequested,
-
- /**
- * PeriodicThread has been shutdown.
- */
- kDone,
- };
-
- // State of PeriodicThread
- State _state{State::kNotStarted};
-
- // Thread period
- Milliseconds _period;
-
- // if true, then call run() otherwise just let the thread idle,
- bool _enabled;
-
- // Name of thread for logging purposes
- std::string _threadName;
-
- // The thread
- stdx::thread _thread;
-
- // Lock to protect _state and control _thread
- stdx::mutex _mutex;
- stdx::condition_variable _condvar;
-};
-
-/**
- * Periodic background thread to run watchdog checks.
- */
-class WatchdogCheckThread : public WatchdogPeriodicThread {
-public:
- WatchdogCheckThread(std::vector<std::unique_ptr<WatchdogCheck>> checks, Milliseconds period);
-
- /**
- * Returns the current generation number of the checks.
- *
- * Incremented after each check is run.
- */
- std::int64_t getGeneration();
-
-private:
- void run(OperationContext* opCtx) final;
- void resetState() final;
-
-private:
- // Vector of checks to run
- std::vector<std::unique_ptr<WatchdogCheck>> _checks;
-
- // A counter that is incremented for each watchdog check completed, and monitored to ensure it
- // does not remain at the same value for too long.
- AtomicWord<long long> _checkGeneration{0};
-};
-
-/**
- * Periodic background thread to ensure watchdog checks run periodically.
- */
-class WatchdogMonitorThread : public WatchdogPeriodicThread {
-public:
- WatchdogMonitorThread(WatchdogCheckThread* checkThread,
- WatchdogDeathCallback callback,
- Milliseconds period);
-
- /**
- * Returns the current generation number of the monitor.
- *
- * Incremented after each round of monitoring is run.
- */
- std::int64_t getGeneration();
-
-private:
- void run(OperationContext* opCtx) final;
- void resetState() final;
-
-private:
- // Callback function to call when watchdog gets stuck
- const WatchdogDeathCallback _callback;
-
- // Watchdog check thread to query
- WatchdogCheckThread* _checkThread;
-
- // A counter that is incremented for each watchdog monitor run is completed.
- AtomicWord<long long> _monitorGeneration{0};
-
- // The last seen _checkGeneration value
- std::int64_t _lastSeenGeneration{-1};
-};
-
-
-/**
- * WatchdogMonitor
- *
- * The Watchdog is a pair of dedicated threads that try to figure out if a process is hung
- * and terminate if it is. The worst case scenario in a distributed system is a process that appears
- * to work but does not actually work.
- *
- * The watchdog is not designed to detect all the different ways the process is hung. It's goal is
- * to detect if the storage system is stuck, and to terminate the process if it is stuck.
- *
- * Threads:
- * WatchdogCheck - runs file system checks
- * WatchdogMonitor - verifies that WatchdogCheck continue to make timely progress. If WatchdogCheck
- * fails to make process, WatchdogMonitor calls a callback. The callback is not
- * expected to do any I/O and minimize the system calls it makes.
- */
-class WatchdogMonitor {
-public:
- /**
- * Create the watchdog with specified period.
- *
- * checkPeriod - how often to run the checks
- * monitorPeriod - how often to run the monitor, must be >= checkPeriod
- */
- WatchdogMonitor(std::vector<std::unique_ptr<WatchdogCheck>> checks,
- Milliseconds checkPeriod,
- Milliseconds monitorPeriod,
- WatchdogDeathCallback callback);
-
- /**
- * Starts the watchdog threads.
- */
- void start();
-
- /**
- * Updates the watchdog monitor period. The goal is to detect a failure in the time of the
- * period.
- *
- * Does nothing if watchdog is not started. If watchdog was started, it changes the monitor
- * period, but not the check period.
- *
- * Accepts Milliseconds for testing purposes while the setParameter only works with seconds.
- */
- void setPeriod(Milliseconds duration);
-
- /**
- * Shutdown the watchdog.
- */
- void shutdown();
-
- /**
- * Returns the current generation number of the checks.
- *
- * Incremented after each round of checks is run.
- */
- std::int64_t getCheckGeneration();
-
- /**
- * Returns the current generation number of the checks.
- *
- * Incremented after each round of checks is run.
- */
- std::int64_t getMonitorGeneration();
-
-private:
- /**
- * Private enum to track state.
- *
- * +----------------------------------------------------------------+
- * | v
- * +-------------+ +----------+ +--------------------+ +-------+
- * | kNotStarted | --> | kStarted | --> | kShutdownRequested | --> | kDone |
- * +-------------+ +----------+ +--------------------+ +-------+
- */
- enum class State {
- /**
- * Initial state. Either start() or shutdown() can be called next.
- */
- kNotStarted,
-
- /**
- * start() has been called. shutdown() should be called next.
- */
- kStarted,
-
- /**
- * shutdown() has been called, and the background threads are in progress of shutting down.
- */
- kShutdownRequested,
-
- /**
- * Watchdog has been shutdown.
- */
- kDone,
- };
-
- // Lock to protect _state and control _thread
- stdx::mutex _mutex;
-
- // State of watchdog
- State _state{State::kNotStarted};
-
- // Fixed period for running the checks.
- Milliseconds _checkPeriod;
-
- // WatchdogCheck Thread - runs checks
- WatchdogCheckThread _watchdogCheckThread;
-
- // WatchdogMonitor Thread - watches _watchdogCheckThread
- WatchdogMonitorThread _watchdogMonitorThread;
-};
-
-} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_mongod.cpp b/src/mongo/watchdog/watchdog_mongod.cpp
deleted file mode 100644
index 5afab3dd926..00000000000
--- a/src/mongo/watchdog/watchdog_mongod.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/**
- * Copyright (C) 2019-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/watchdog/watchdog_mongod.h"
-
-#include <boost/filesystem.hpp>
-
-#include "mongo/base/init.h"
-#include "mongo/config.h"
-#include "mongo/db/client.h"
-#include "mongo/db/commands/server_status.h"
-#include "mongo/db/server_options.h"
-#include "mongo/db/service_context.h"
-#include "mongo/db/storage/storage_options.h"
-#include "mongo/stdx/memory.h"
-#include "mongo/unittest/unittest.h"
-#include "mongo/util/clock_source.h"
-#include "mongo/util/clock_source_mock.h"
-#include "mongo/util/log.h"
-#include "mongo/util/tick_source_mock.h"
-#include "mongo/watchdog/watchdog.h"
-#include "mongo/watchdog/watchdog_mongod_gen.h"
-
-namespace mongo {
-
-// Run the watchdog checks at a fixed interval regardless of user choice for monitoring period.
-constexpr Seconds watchdogCheckPeriod = Seconds{10};
-
-namespace {
-
-const auto getWatchdogMonitor =
- ServiceContext::declareDecoration<std::unique_ptr<WatchdogMonitor>>();
-
-// A boolean variable to track whether the watchdog was enabled at startup.
-// Defaults to true because set parameters are handled before we start the watchdog if needed.
-bool watchdogEnabled{true};
-
-WatchdogMonitor* getGlobalWatchdogMonitor() {
- if (!hasGlobalServiceContext()) {
- return nullptr;
- }
-
- return getWatchdogMonitor(getGlobalServiceContext()).get();
-}
-
-std::vector<std::string> watchdogPaths;
-
-} // namespace
-
-void registerWatchdogPath(StringData path) {
- if (!path.empty()) {
- watchdogPaths.push_back(path.toString());
- }
-}
-
-Status validateWatchdogPeriodSeconds(const int& value) {
- if (value < 60 && value != -1) {
-
- return {ErrorCodes::BadValue, "watchdogPeriodSeconds must be greater than or equal to 60s"};
- }
-
- // If the watchdog was not enabled at startup, disallow changes the period.
- if (!watchdogEnabled) {
- return {ErrorCodes::BadValue,
- "watchdogPeriodSeconds cannot be changed at runtime if it was not set at startup"};
- }
-
- return Status::OK();
-}
-
-Status onUpdateWatchdogPeriodSeconds(const int& value) {
- auto monitor = getGlobalWatchdogMonitor();
- if (monitor) {
- monitor->setPeriod(Seconds(value));
- }
-
- return Status::OK();
-}
-
-/**
- * Server status section for the Watchdog.
- *
- * Sample format:
- *
- * watchdog: {
- * generation: int,
- * }
- */
-class WatchdogServerStatusSection : public ServerStatusSection {
-public:
- WatchdogServerStatusSection() : ServerStatusSection("watchdog") {}
- bool includeByDefault() const {
- // Only include this by default if the watchdog is on
- return watchdogEnabled;
- }
-
- BSONObj generateSection(OperationContext* opCtx, const BSONElement& configElement) const {
- BSONObjBuilder result;
-
- WatchdogMonitor* watchdog = getWatchdogMonitor(opCtx->getServiceContext()).get();
-
- result.append("checkGeneration", watchdog->getCheckGeneration());
- result.append("monitorGeneration", watchdog->getMonitorGeneration());
- result.append("monitorPeriod", gWatchdogPeriodSeconds.load());
-
- return result.obj();
- }
-} watchdogServerStatusSection;
-
-void startWatchdog() {
- // Check three paths if set
- // 1. storage directory - optional for inmemory?
- // 2. log path - optional
- // 3. audit path - optional
-
- Seconds period{gWatchdogPeriodSeconds.load()};
- if (period < Seconds::zero()) {
- // Skip starting the watchdog if the user has not asked for it.
- watchdogEnabled = false;
- return;
- }
-
- watchdogEnabled = true;
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
-
- auto dataCheck =
- stdx::make_unique<DirectoryCheck>(boost::filesystem::path(storageGlobalParams.dbpath));
-
- checks.push_back(std::move(dataCheck));
-
- // Add a check for the journal if it is not disabled
- if (storageGlobalParams.dur) {
- auto journalDirectory = boost::filesystem::path(storageGlobalParams.dbpath);
- journalDirectory /= "journal";
-
- if (boost::filesystem::exists(journalDirectory)) {
- auto journalCheck = stdx::make_unique<DirectoryCheck>(journalDirectory);
-
- checks.push_back(std::move(journalCheck));
- } else {
- warning()
- << "Watchdog is skipping check for journal directory since it does not exist: '"
- << journalDirectory.generic_string() << "'";
- }
- }
-
- // If the user specified a log path, also monitor that directory.
- // This may be redudant with the dbpath check but there is not easy way to confirm they are
- // duplicate.
- if (!serverGlobalParams.logpath.empty()) {
- boost::filesystem::path logFile(serverGlobalParams.logpath);
- auto logPath = logFile.parent_path();
-
- auto logCheck = stdx::make_unique<DirectoryCheck>(logPath);
- checks.push_back(std::move(logCheck));
- }
-
- // If the user specified an audit path, also monitor that directory.
- // This may be redudant with the dbpath check but there is not easy way to confirm they are
- // duplicate.
- for (auto&& path : watchdogPaths) {
- auto auditCheck = stdx::make_unique<DirectoryCheck>(path);
- checks.push_back(std::move(auditCheck));
- }
-
- auto monitor = stdx::make_unique<WatchdogMonitor>(
- std::move(checks), watchdogCheckPeriod, period, watchdogTerminate);
-
- // Install the new WatchdogMonitor
- auto& staticMonitor = getWatchdogMonitor(getGlobalServiceContext());
-
- staticMonitor = std::move(monitor);
-
- staticMonitor->start();
-}
-
-} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_mongod.h b/src/mongo/watchdog/watchdog_mongod.h
deleted file mode 100644
index 3e1583a5e6a..00000000000
--- a/src/mongo/watchdog/watchdog_mongod.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Copyright (C) 2019-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#pragma once
-
-#include "mongo/base/status.h"
-
-namespace mongo {
-
-/**
-* Start the watchdog.
-*/
-void startWatchdog();
-
-/**
- * Callbacks used by the 'watchdogPeriodSeconds' set parameter.
- */
-Status validateWatchdogPeriodSeconds(const int& value);
-Status onUpdateWatchdogPeriodSeconds(const int& value);
-
-/**
- * Allow components a way to tell the watchdog what to watch.
- */
-void registerWatchdogPath(StringData path);
-
-} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_mongod.idl b/src/mongo/watchdog/watchdog_mongod.idl
deleted file mode 100644
index 61e6fd605b0..00000000000
--- a/src/mongo/watchdog/watchdog_mongod.idl
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (C) 2019-present MongoDB, Inc.
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the Server Side Public License, version 1,
-# as published by MongoDB, Inc.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# Server Side Public License for more details.
-#
-# You should have received a copy of the Server Side Public License
-# along with this program. If not, see
-# <http://www.mongodb.com/licensing/server-side-public-license>.
-#
-# As a special exception, the copyright holders give permission to link the
-# code of portions of this program with the OpenSSL library under certain
-# conditions as described in each individual source file and distribute
-# linked combinations including the program with the OpenSSL library. You
-# must comply with the Server Side Public License in all respects for
-# all of the code used other than as permitted herein. If you modify file(s)
-# with this exception, you may extend this exception to your version of the
-# file(s), but you are not obligated to do so. If you do not wish to do so,
-# delete this exception statement from your version. If you delete this
-# exception statement from all source files in the program, then also delete
-# it in the license file.
-#
-
-global:
- cpp_namespace: "mongo"
- cpp_includes:
- - "mongo/watchdog/watchdog_mongod.h"
-
-server_parameters:
- "watchdogPeriodSeconds":
- description: 'Watchdog Period (seconds)'
- set_at: [ startup, runtime ]
- cpp_vartype: 'AtomicWord<int>'
- cpp_varname: gWatchdogPeriodSeconds
- default: -1
- validator:
- callback: validateWatchdogPeriodSeconds
- on_update: onUpdateWatchdogPeriodSeconds
diff --git a/src/mongo/watchdog/watchdog_test.cpp b/src/mongo/watchdog/watchdog_test.cpp
deleted file mode 100644
index fd3b34602c9..00000000000
--- a/src/mongo/watchdog/watchdog_test.cpp
+++ /dev/null
@@ -1,480 +0,0 @@
-/**
- * Copyright (C) 2019-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/watchdog/watchdog.h"
-
-#include "mongo/db/client.h"
-#include "mongo/db/service_context.h"
-#include "mongo/db/service_context_test_fixture.h"
-#include "mongo/stdx/memory.h"
-#include "mongo/unittest/death_test.h"
-#include "mongo/unittest/temp_dir.h"
-#include "mongo/unittest/unittest.h"
-#include "mongo/util/clock_source.h"
-#include "mongo/util/clock_source_mock.h"
-#include "mongo/util/log.h"
-#include "mongo/util/tick_source_mock.h"
-
-namespace mongo {
-
-class TestPeriodicThread : public WatchdogPeriodicThread {
-public:
- TestPeriodicThread(Milliseconds period) : WatchdogPeriodicThread(period, "testPeriodic") {}
-
- void run(OperationContext* opCtx) final {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
- ++_counter;
- }
-
- if (_counter == _wait) {
- _condvar.notify_all();
- }
- }
-
- void setSignalOnCount(int c) {
- _wait = c;
- }
-
- void waitForCount() {
- invariant(_wait != 0);
-
- stdx::unique_lock<stdx::mutex> lock(_mutex);
- while (_counter < _wait) {
- _condvar.wait(lock);
- }
- }
-
- void resetState() final {}
-
- std::uint32_t getCounter() {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
- return _counter;
- }
- }
-
-private:
- std::uint32_t _counter{0};
-
- stdx::mutex _mutex;
- stdx::condition_variable _condvar;
- std::uint32_t _wait{0};
-};
-
-class PeriodicThreadTest : public ServiceContextTest {};
-
-// Tests:
-// 1. Make sure it runs at least N times
-// 2. Make sure it responds to stop after being paused
-// 3. Make sure it can be resumed
-// 4. Make sure the period can be changed like from 1 minute -> 1 milli
-
-// Positive: Make sure periodic thread runs at least N times and stops correctly
-TEST_F(PeriodicThreadTest, Basic) {
-
- TestPeriodicThread testThread(Milliseconds(5));
-
- testThread.setSignalOnCount(5);
-
- testThread.start();
-
- testThread.waitForCount();
-
- testThread.shutdown();
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t lastCounter = testThread.getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- ASSERT_EQ(lastCounter, testThread.getCounter());
-}
-
-// Positive: Make sure it stops after being paused
-TEST_F(PeriodicThreadTest, PauseAndStop) {
-
- TestPeriodicThread testThread(Milliseconds(5));
- testThread.setSignalOnCount(5);
-
- testThread.start();
-
- testThread.waitForCount();
-
- // Stop the thread by setting a -1 duration
- testThread.setPeriod(Milliseconds(-1));
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t pauseCounter = testThread.getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- // We could have had one more run of the loop as we paused - allow for that case
- // but no other runs of the thread.
- ASSERT_GTE(pauseCounter + 1, testThread.getCounter());
-
- testThread.shutdown();
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t stopCounter = testThread.getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- ASSERT_EQ(stopCounter, testThread.getCounter());
-}
-
-// Positive: Make sure it can be paused and resumed
-TEST_F(PeriodicThreadTest, PauseAndResume) {
-
- TestPeriodicThread testThread(Milliseconds(5));
- testThread.setSignalOnCount(5);
-
- testThread.start();
-
- testThread.waitForCount();
-
- // Stop the thread by setting a -1 duration
- testThread.setPeriod(Milliseconds(-1));
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t pauseCounter = testThread.getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- // We could have had one more run of the loop as we paused - allow for that case
- // but no other runs of the thread.
- ASSERT_GTE(pauseCounter + 1, testThread.getCounter());
-
- // Make sure we can resume the thread again
- std::uint32_t baseCounter = testThread.getCounter();
- testThread.setSignalOnCount(baseCounter + 5);
-
- testThread.setPeriod(Milliseconds(7));
-
- testThread.waitForCount();
-
- testThread.shutdown();
-}
-
-/**
- * Simple class to ensure we run checks.
- */
-class TestCounterCheck : public WatchdogCheck {
-public:
- void run(OperationContext* opCtx) final {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
- ++_counter;
- }
-
- if (_counter == _wait) {
- _condvar.notify_all();
- }
- }
-
- std::string getDescriptionForLogging() final {
- return "test";
- }
-
- void setSignalOnCount(int c) {
- _wait = c;
- }
-
- void waitForCount() {
- invariant(_wait != 0);
-
- stdx::unique_lock<stdx::mutex> lock(_mutex);
- while (_counter < _wait) {
- _condvar.wait(lock);
- }
- }
-
- std::uint32_t getCounter() {
- {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
- return _counter;
- }
- }
-
-private:
- std::uint32_t _counter{0};
-
- stdx::mutex _mutex;
- stdx::condition_variable _condvar;
- std::uint32_t _wait{0};
-};
-
-class WatchdogCheckThreadTest : public ServiceContextTest {};
-
-// Positive: Make sure check thread runs at least N times and stops correctly
-TEST_F(WatchdogCheckThreadTest, Basic) {
- auto counterCheck = stdx::make_unique<TestCounterCheck>();
- auto counterCheckPtr = counterCheck.get();
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
- checks.push_back(std::move(counterCheck));
-
- WatchdogCheckThread testThread(std::move(checks), Milliseconds(5));
-
- counterCheckPtr->setSignalOnCount(5);
-
- testThread.start();
-
- counterCheckPtr->waitForCount();
-
- testThread.shutdown();
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t lastCounter = counterCheckPtr->getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- ASSERT_EQ(lastCounter, counterCheckPtr->getCounter());
-}
-
-/**
- * A class that models the behavior of Windows' manual reset Event object.
- */
-class ManualResetEvent {
-public:
- void set() {
- stdx::lock_guard<stdx::mutex> lock(_mutex);
-
- _set = true;
- _condvar.notify_one();
- }
-
- void wait() {
- stdx::unique_lock<stdx::mutex> lock(_mutex);
-
- _condvar.wait(lock, [this]() { return _set; });
- }
-
-private:
- bool _set{false};
-
- stdx::mutex _mutex;
- stdx::condition_variable _condvar;
-};
-
-
-class WatchdogMonitorThreadTest : public ServiceContextTest {};
-
-// Positive: Make sure monitor thread signals death if the check thread never starts
-TEST_F(WatchdogMonitorThreadTest, Basic) {
- ManualResetEvent deathEvent;
- WatchdogDeathCallback deathCallback = [&deathEvent]() {
- log() << "Death signalled";
- deathEvent.set();
- };
-
- auto counterCheck = stdx::make_unique<TestCounterCheck>();
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
- checks.push_back(std::move(counterCheck));
-
- WatchdogCheckThread checkThread(std::move(checks), Milliseconds(5));
-
- WatchdogMonitorThread monitorThread(&checkThread, deathCallback, Milliseconds(5));
-
- monitorThread.start();
-
- deathEvent.wait();
-
- monitorThread.shutdown();
-}
-
-/**
- * Sleep after doing a few checks to replicate a hung check.
- */
-class SleepyCheck : public WatchdogCheck {
-public:
- void run(OperationContext* opCtx) final {
- ++_counter;
-
- if (_counter >= 6) {
- sleepFor(Seconds(5));
- }
- }
-
- std::string getDescriptionForLogging() final {
- return "test";
- }
-
-private:
- std::uint32_t _counter{0};
-};
-
-// Positive: Make sure monitor thread signals death if the thread does not make progress
-TEST_F(WatchdogMonitorThreadTest, SleepyHungCheck) {
- ManualResetEvent deathEvent;
- WatchdogDeathCallback deathCallback = [&deathEvent]() {
- log() << "Death signalled";
- deathEvent.set();
- };
-
- auto sleepyCheck = stdx::make_unique<SleepyCheck>();
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
- checks.push_back(std::move(sleepyCheck));
-
- WatchdogCheckThread checkThread(std::move(checks), Milliseconds(1));
-
- WatchdogMonitorThread monitorThread(&checkThread, deathCallback, Milliseconds(100));
-
- checkThread.start();
-
- monitorThread.start();
-
- deathEvent.wait();
-
- // Make sure we actually did some checks
- ASSERT_GTE(checkThread.getGeneration(), 2);
-
- monitorThread.shutdown();
-
- checkThread.shutdown();
-}
-
-class WatchdogMonitorTest : public ServiceContextTest {};
-
-// Positive: Make sure watchdog monitor signals death if a check is unresponsive
-TEST_F(WatchdogMonitorTest, SleepyHungCheck) {
- ManualResetEvent deathEvent;
- WatchdogDeathCallback deathCallback = [&deathEvent]() {
- log() << "Death signalled";
- deathEvent.set();
- };
-
- auto sleepyCheck = stdx::make_unique<SleepyCheck>();
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
- checks.push_back(std::move(sleepyCheck));
-
- WatchdogMonitor monitor(std::move(checks), Milliseconds(1), Milliseconds(5), deathCallback);
-
- monitor.start();
-
- deathEvent.wait();
-
- monitor.shutdown();
-}
-
-// Positive: Make sure watchdog monitor terminates the process if a check is unresponsive
-DEATH_TEST(WatchdogMonitorTest, Death, "") {
- auto sleepyCheck = stdx::make_unique<SleepyCheck>();
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
- checks.push_back(std::move(sleepyCheck));
-
- WatchdogMonitor monitor(
- std::move(checks), Milliseconds(1), Milliseconds(100), watchdogTerminate);
-
- monitor.start();
-
- sleepmillis(1000);
-}
-
-// Positive: Make sure the monitor can be paused and resumed, and it does not trigger death
-TEST_F(WatchdogMonitorTest, PauseAndResume) {
-
- WatchdogDeathCallback deathCallback = []() {
- log() << "Death signalled, it should not have been";
- invariant(false);
- };
-
- auto counterCheck = stdx::make_unique<TestCounterCheck>();
- auto counterCheckPtr = counterCheck.get();
-
- std::vector<std::unique_ptr<WatchdogCheck>> checks;
- checks.push_back(std::move(counterCheck));
-
- WatchdogMonitor monitor(std::move(checks), Milliseconds(1), Milliseconds(1001), deathCallback);
-
- counterCheckPtr->setSignalOnCount(5);
-
- monitor.start();
-
- counterCheckPtr->waitForCount();
-
- // Pause the monitor
- monitor.setPeriod(Milliseconds(-1));
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t pauseCounter = counterCheckPtr->getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- // We could have had one more run of the loop as we paused - allow for that case
- // but no other runs of the thread.
- ASSERT_GTE(pauseCounter + 1, counterCheckPtr->getCounter());
-
- // Resume the monitor
- std::uint32_t baseCounter = counterCheckPtr->getCounter();
- counterCheckPtr->setSignalOnCount(baseCounter + 5);
-
- // Restart the monitor with a different interval.
- monitor.setPeriod(Milliseconds(1007));
-
- counterCheckPtr->waitForCount();
-
- monitor.shutdown();
-
- // Check the counter after it is shutdown and make sure it does not change.
- std::uint32_t lastCounter = counterCheckPtr->getCounter();
-
- // This is racey but it should only produce false negatives
- sleepmillis(100);
-
- ASSERT_EQ(lastCounter, counterCheckPtr->getCounter());
-}
-
-class DirectoryCheckTest : public ServiceContextTest {};
-
-// Positive: Do a sanity check that directory check passes
-TEST_F(DirectoryCheckTest, Basic) {
- unittest::TempDir tempdir("watchdog_testpath");
-
- DirectoryCheck check(tempdir.path());
-
- auto opCtx = makeOperationContext();
- check.run(opCtx.get());
-}
-
-} // namespace mongo