summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMark Benvenuto <mark.benvenuto@mongodb.com>2019-05-13 21:16:52 -0400
committerMark Benvenuto <mark.benvenuto@mongodb.com>2019-05-13 21:16:52 -0400
commit39f8c6af5ef1f637bdb2120b1cfb8b507368a7f8 (patch)
tree9f2587793cb215b9a36eb519e0b8d02bd05da317
parent9ab10de2762ba48532d9bc6717a434672eee8475 (diff)
downloadmongo-39f8c6af5ef1f637bdb2120b1cfb8b507368a7f8.tar.gz
SERVER-41023 Move Storage Node Watchdog to community
-rw-r--r--buildscripts/resmokeconfig/suites/watchdog.yml2
-rw-r--r--etc/evergreen.yml6
-rw-r--r--jstests/watchdog/charybdefs_setup.sh27
-rw-r--r--jstests/watchdog/lib/charybdefs_lib.js127
-rw-r--r--jstests/watchdog/lib/wd_test_common.js54
-rw-r--r--jstests/watchdog/wd_auditpath_hang.js21
-rw-r--r--jstests/watchdog/wd_dbpath_hang.js14
-rw-r--r--jstests/watchdog/wd_journal_hang.js33
-rw-r--r--jstests/watchdog/wd_logpath_hang.js14
-rw-r--r--jstests/watchdog/wd_setparam.js60
-rw-r--r--src/mongo/SConscript2
-rw-r--r--src/mongo/db/db.cpp6
-rw-r--r--src/mongo/watchdog/SConscript51
-rw-r--r--src/mongo/watchdog/watchdog.cpp537
-rw-r--r--src/mongo/watchdog/watchdog.h385
-rw-r--r--src/mongo/watchdog/watchdog_mongod.cpp201
-rw-r--r--src/mongo/watchdog/watchdog_mongod.h47
-rw-r--r--src/mongo/watchdog/watchdog_mongod.idl43
-rw-r--r--src/mongo/watchdog/watchdog_register.cpp50
-rw-r--r--src/mongo/watchdog/watchdog_register.h48
-rw-r--r--src/mongo/watchdog/watchdog_test.cpp480
21 files changed, 2204 insertions, 4 deletions
diff --git a/buildscripts/resmokeconfig/suites/watchdog.yml b/buildscripts/resmokeconfig/suites/watchdog.yml
index 711c84b738d..e4c6da0e043 100644
--- a/buildscripts/resmokeconfig/suites/watchdog.yml
+++ b/buildscripts/resmokeconfig/suites/watchdog.yml
@@ -2,7 +2,7 @@ test_kind: js_test
selector:
roots:
- - src/mongo/db/modules/*/jstests/watchdog/*.js
+ - jstests/watchdog/*.js
executor:
config:
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 9fbd47df60c..b65d8dfb22b 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -2073,7 +2073,7 @@ functions:
set -o errexit
set -o verbose
- bash src/mongo/db/modules/enterprise/jstests/watchdog/charybdefs_setup.sh
+ bash jstests/watchdog/charybdefs_setup.sh
"cleanup environment":
command: shell.exec
@@ -8138,6 +8138,7 @@ buildvariants:
- name: stitch_support_lib_build_and_test
- name: stitch_support_lib_build_and_archive
- name: tool
+ - name: watchdog_wiredtiger
- name: package
distros:
- ubuntu1604-packer
@@ -8316,6 +8317,7 @@ buildvariants:
- name: ssl_gen
- name: sslSpecial_gen
- name: tool
+ - name: watchdog_wiredtiger
- name: package
distros:
- ubuntu1604-packer
@@ -14079,6 +14081,8 @@ buildvariants:
- name: tool
- name: update_fuzzer_gen
- name: update_fuzzer_replication_gen
+ - name: watchdog_inmemory
+ - name: watchdog_wiredtiger
- name: write_concern_majority_passthrough
- name: secondary_reads_passthrough_gen
diff --git a/jstests/watchdog/charybdefs_setup.sh b/jstests/watchdog/charybdefs_setup.sh
new file mode 100644
index 00000000000..31cd1f0be05
--- /dev/null
+++ b/jstests/watchdog/charybdefs_setup.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Script to setup charybdefs
+set -euo pipefail
+IFS=$'\n\t'
+
+if [ "$#" -ne 0 ]; then
+ echo "This script does not take any arguments"
+ exit 1
+fi
+
+echo Start - charybdefs_setup.sh
+
+cd /data
+
+rm -rf /data/charybdefs
+rm -rf /data/thrift
+
+# Use the mongo branch and fork from here
+git clone -b mongo_42 https://github.com/markbenvenuto/charybdefs.git
+
+# Run the build script in the mongo branch
+cd charybdefs/mongo
+
+# Build and setup thrift and charybdefs
+PATH=/opt/mongodbtoolchain/v3/bin:$PATH bash ./build.sh
+
+echo Done - charybdefs_setup.sh
diff --git a/jstests/watchdog/lib/charybdefs_lib.js b/jstests/watchdog/lib/charybdefs_lib.js
new file mode 100644
index 00000000000..f80246426d8
--- /dev/null
+++ b/jstests/watchdog/lib/charybdefs_lib.js
@@ -0,0 +1,127 @@
+// Exit code that the watchdog uses on exit
+const EXIT_WATCHDOG = 61;
+
+/**
+ * Control the Charybdefs file system for Fault Injectiong testing
+ *
+ * @param {string} test_name unique name for test directories
+ */
+function CharybdefsControl(test_name) {
+ 'use strict';
+
+ const python = "/opt/mongodbtoolchain/v3/bin/python3";
+ let control_py = "/data/charybdefs/mongo/control.py";
+
+ // Use the minimum watchdog period
+ const wd_period_sec = 60;
+
+ // Since the watchdog can take up to (2 x period) to detect failures, stall the write for that
+ // amount of time plus a small buffer of time to account for thread scheduling, etc.
+ const fs_delay_sec = wd_period_sec * 2 + 5;
+
+ const mount_point = MongoRunner.toRealPath(test_name + '_mnt');
+ const backing_path = MongoRunner.toRealPath(test_name + '_backing');
+
+ this._runControl = function(cmd, ...args) {
+ let cmd_args = [python, control_py, cmd];
+ cmd_args = cmd_args.concat(args);
+ let ret = run.apply(null, cmd_args);
+ assert.eq(ret, 0);
+ };
+
+ /**
+ * Get the path of the mounted Charybdefs file system.
+ *
+ * @return {string} mount point
+ */
+ this.getMountPath = function() {
+ return mount_point;
+ };
+
+ /**
+ * Get the Watchdog Period.
+ *
+ * @return {number} number of sections
+ */
+ this.getWatchdogPeriodSeconds = function() {
+ return wd_period_sec;
+ };
+
+ /**
+ * Start the Charybdefs filesystem.
+ */
+ this.start = function() {
+ this.cleanup();
+
+ this._runControl("start",
+ "--fuse_mount=" + mount_point,
+ "--backing_path=" + backing_path,
+ "--log_file=foo_fs.log");
+ print("Charybdefs sucessfully started.");
+ };
+
+ // Get the current check generation
+ function _getGeneration(admin) {
+ const result = admin.runCommand({"serverStatus": 1});
+
+ assert.commandWorked(result);
+
+ return result.watchdog.checkGeneration;
+ }
+
+ /**
+ * Wait for the watchdog to run some checks first.
+ *
+ * @param {object} MongoDB connection to admin database
+ */
+ this.waitForWatchdogToStart = function(admin) {
+ print("Waiting for MongoDB watchdog to checks run twice.");
+ assert.soon(function() {
+ return _getGeneration(admin) > 2;
+ }, "Watchdog did not start running", 5 * wd_period_sec * 1000);
+ };
+
+ /**
+ * Inject delay on write, and wait to MongoDB to get hung.
+ *
+ * @param {string} file_name - file name to inject fault on
+ */
+ this.addWriteDelayFaultAndWait = function(file_name) {
+ // Convert seconds to microseconds for charybdefs
+ const delay_us = fs_delay_sec * 1000000;
+ this.addFault("write_buf", file_name, delay_us);
+
+ // Wait for watchdog to stop
+ print("Waiting for MongoDB to hang.");
+ sleep(fs_delay_sec * 1000);
+
+ };
+
+ /**
+ * Add a fault to inject.
+ *
+ * @param {string} method - name of fuse method to inject fault for
+ * @param {string} file_name - file name to inject fault on
+ * @param {number} delay_us - optional delay in microseconds to wait
+ */
+ this.addFault = function(method, file_name, delay_us) {
+
+ this._runControl("set_fault",
+ "--methods=" + method,
+ "--errno=5",
+ "--probability=100000",
+ "--regexp=.*" + file_name,
+ "--delay_us=" + delay_us);
+ };
+
+ /**
+ * Shutdown and clean up the Charybdefs filesystem.
+ */
+ this.cleanup = function() {
+ this._runControl("stop_all", "--fuse_mount=" + mount_point);
+
+ // Delete any remaining files
+ resetDbpath(mount_point);
+ resetDbpath(backing_path);
+ };
+}
diff --git a/jstests/watchdog/lib/wd_test_common.js b/jstests/watchdog/lib/wd_test_common.js
new file mode 100644
index 00000000000..46e625d6c9e
--- /dev/null
+++ b/jstests/watchdog/lib/wd_test_common.js
@@ -0,0 +1,54 @@
+// Storage Node Watchdog common test code
+//
+load("jstests/watchdog/lib/charybdefs_lib.js");
+
+function testMongoDHang(control, mongod_options) {
+ 'use strict';
+
+ // Now start MongoD with it enabled at startup
+ //
+ if (mongod_options.hasOwnProperty("dbPath")) {
+ resetDbpath(mongod_options.dbPath);
+ }
+
+ var options = {
+ setParameter: "watchdogPeriodSeconds=" + control.getWatchdogPeriodSeconds(),
+ verbose: 1,
+ };
+
+ options = Object.extend(mongod_options, options);
+
+ const conn = MongoRunner.runMongod(options);
+ assert.neq(null, conn, 'mongod was unable to start up');
+
+ // Wait for watchdog to get running
+ const admin = conn.getDB("admin");
+
+ // Wait for the watchdog to run some checks first
+ control.waitForWatchdogToStart(admin);
+
+ // Hang the file system
+ control.addWriteDelayFaultAndWait("watchdog_probe.*");
+
+ // Check MongoD is dead by sending SIGTERM
+ // This will trigger our "nice" shutdown, but since mongod is stuck in the kernel doing I/O,
+ // the process will not terminate until charybdefs is done sleeping.
+ print("Stopping MongoDB now, it will terminate once charybdefs is done sleeping.");
+ MongoRunner.stopMongod(conn, undefined, {allowedExitCode: EXIT_WATCHDOG});
+}
+
+function testFuseAndMongoD(control, mongod_options) {
+ 'use strict';
+
+ // Cleanup previous runs
+ control.cleanup();
+
+ try {
+ // Start the file system
+ control.start();
+
+ testMongoDHang(control, mongod_options);
+ } finally {
+ control.cleanup();
+ }
+}
diff --git a/jstests/watchdog/wd_auditpath_hang.js b/jstests/watchdog/wd_auditpath_hang.js
new file mode 100644
index 00000000000..bd961d55a47
--- /dev/null
+++ b/jstests/watchdog/wd_auditpath_hang.js
@@ -0,0 +1,21 @@
+// Storage Node Watchdog - validate watchdog monitors --auditpath
+//
+load("jstests/watchdog/lib/wd_test_common.js");
+
+(function() {
+ 'use strict';
+
+ if (assert.commandWorked(db.runCommand({buildInfo: 1})).modules.includes("enterprise")) {
+ let control = new CharybdefsControl("auditpath_hang");
+
+ const auditPath = control.getMountPath();
+
+ testFuseAndMongoD(control, {
+
+ auditDestination: 'file',
+ auditFormat: 'JSON',
+ auditPath: auditPath + "/auditLog.json"
+ });
+ }
+
+})();
diff --git a/jstests/watchdog/wd_dbpath_hang.js b/jstests/watchdog/wd_dbpath_hang.js
new file mode 100644
index 00000000000..39147fe2229
--- /dev/null
+++ b/jstests/watchdog/wd_dbpath_hang.js
@@ -0,0 +1,14 @@
+// Storage Node Watchdog - validate --dbpath
+//
+load("jstests/watchdog/lib/wd_test_common.js");
+
+(function() {
+ 'use strict';
+
+ let control = new CharybdefsControl("dbpath_hang");
+
+ const dbPath = control.getMountPath() + "/db";
+
+ testFuseAndMongoD(control, {dbpath: dbPath});
+
+})();
diff --git a/jstests/watchdog/wd_journal_hang.js b/jstests/watchdog/wd_journal_hang.js
new file mode 100644
index 00000000000..c07b4298170
--- /dev/null
+++ b/jstests/watchdog/wd_journal_hang.js
@@ -0,0 +1,33 @@
+// Storage Node Watchdog - validate watchdog monitors --dbpath /journal
+// @tags: [requires_wiredtiger,requires_journaling]
+//
+load("jstests/watchdog/lib/wd_test_common.js");
+
+(function() {
+ 'use strict';
+
+ function trimTrailingSlash(dir) {
+ if (dir.endsWith('/')) {
+ return dir.substring(0, dir.length - 1);
+ }
+
+ return dir;
+ }
+
+ let control = new CharybdefsControl("journalpath_hang");
+
+ const journalFusePath = control.getMountPath();
+
+ const dbPath = MongoRunner.toRealDir("$dataDir/mongod-journal");
+
+ const journalLinkPath = dbPath + "/journal";
+
+ resetDbpath(dbPath);
+
+ // Create a symlink from the non-fuse journal directory to the fuse mount.
+ const ret = run("ln", "-s", trimTrailingSlash(journalFusePath), journalLinkPath);
+ assert.eq(ret, 0);
+
+ // Set noCleanData so that the dbPath is not cleaned because we want to use the journal symlink.
+ testFuseAndMongoD(control, {dbpath: dbPath, noCleanData: true});
+})();
diff --git a/jstests/watchdog/wd_logpath_hang.js b/jstests/watchdog/wd_logpath_hang.js
new file mode 100644
index 00000000000..9a3ec13c845
--- /dev/null
+++ b/jstests/watchdog/wd_logpath_hang.js
@@ -0,0 +1,14 @@
+// Storage Node Watchdog - validate watchdog monitors --logpath
+//
+load("jstests/watchdog/lib/wd_test_common.js");
+
+(function() {
+ 'use strict';
+
+ let control = new CharybdefsControl("logpath_hang");
+
+ const logpath = control.getMountPath();
+
+ testFuseAndMongoD(control, {logpath: logpath + "/foo.log"});
+
+})();
diff --git a/jstests/watchdog/wd_setparam.js b/jstests/watchdog/wd_setparam.js
new file mode 100644
index 00000000000..0857e11b1ff
--- /dev/null
+++ b/jstests/watchdog/wd_setparam.js
@@ -0,0 +1,60 @@
+// Storage Node Watchdog test cases
+// - Validate set parameter functions correctly.
+(function() {
+ 'use strict';
+ const admin = db.getSiblingDB("admin");
+
+ // Check the defaults are correct
+ //
+ function getparam(adminDb, field) {
+ let q = {getParameter: 1};
+ q[field] = 1;
+
+ const ret = adminDb.runCommand(q);
+ return ret[field];
+ }
+
+ // Verify the defaults are as we documented them
+ assert.eq(getparam(admin, "watchdogPeriodSeconds"), -1);
+
+ function setparam(adminDb, obj) {
+ const ret = adminDb.runCommand(Object.extend({setParameter: 1}, obj));
+ return ret;
+ }
+
+ // Negative tests
+ // Negative: set it too low.
+ assert.commandFailed(setparam(admin, {"watchdogPeriodSeconds": 1}));
+ // Negative: set it the min value but fail since it was not enabled.
+ assert.commandFailed(setparam(admin, {"watchdogPeriodSeconds": 60}));
+ // Negative: set it the min value + 1 but fail since it was not enabled.
+ assert.commandFailed(setparam(admin, {"watchdogPeriodSeconds": 61}));
+
+ // Now test MongoD with it enabled at startup
+ //
+ const conn = MongoRunner.runMongod({setParameter: "watchdogPeriodSeconds=60"});
+ assert.neq(null, conn, 'mongod was unable to start up');
+
+ const admin2 = conn.getDB("admin");
+
+ // Validate defaults
+ assert.eq(getparam(admin2, "watchdogPeriodSeconds"), 60);
+
+ // Negative: set it too low.
+ assert.commandFailed(setparam(admin2, {"watchdogPeriodSeconds": 1}));
+ // Positive: set it the min value
+ assert.commandWorked(setparam(admin2, {"watchdogPeriodSeconds": 60}));
+ // Positive: set it the min value + 1
+ assert.commandWorked(setparam(admin2, {"watchdogPeriodSeconds": 61}));
+
+ // Positive: disable it
+ assert.commandWorked(setparam(admin2, {"watchdogPeriodSeconds": -1}));
+
+ assert.eq(getparam(admin2, "watchdogPeriodSeconds"), -1);
+
+ // Positive: enable it again
+ assert.commandWorked(setparam(admin2, {"watchdogPeriodSeconds": 60}));
+
+ MongoRunner.stopMongod(conn);
+
+})();
diff --git a/src/mongo/SConscript b/src/mongo/SConscript
index 871429b589d..bc9a38c38cf 100644
--- a/src/mongo/SConscript
+++ b/src/mongo/SConscript
@@ -42,6 +42,7 @@ env.SConscript(
'transport',
'unittest',
'util',
+ 'watchdog',
],
exports=[
'env',
@@ -424,6 +425,7 @@ mongod = env.Program(
'util/options_parser/options_parser_init',
'util/periodic_runner_factory',
'util/version_impl',
+ 'watchdog/watchdog_mongod',
],
INSTALL_ALIAS=[
'core',
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index 31477531f64..4a7f4c39fe6 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -127,6 +127,7 @@
#include "mongo/db/stats/counters.h"
#include "mongo/db/storage/backup_cursor_hooks.h"
#include "mongo/db/storage/encryption_hooks.h"
+#include "mongo/db/storage/flow_control.h"
#include "mongo/db/storage/flow_control_parameters_gen.h"
#include "mongo/db/storage/storage_engine.h"
#include "mongo/db/storage/storage_engine_init.h"
@@ -178,8 +179,7 @@
#include "mongo/util/text.h"
#include "mongo/util/time_support.h"
#include "mongo/util/version.h"
-
-#include "mongo/db/storage/flow_control.h"
+#include "mongo/watchdog/watchdog_mongod.h"
#ifdef MONGO_CONFIG_SSL
#include "mongo/util/net/ssl_options.h"
@@ -403,6 +403,8 @@ ExitCode _initAndListen(int listenPort) {
initializeSNMP();
+ startWatchdog();
+
if (!storageGlobalParams.readOnly) {
boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/");
}
diff --git a/src/mongo/watchdog/SConscript b/src/mongo/watchdog/SConscript
new file mode 100644
index 00000000000..a1cedf2a327
--- /dev/null
+++ b/src/mongo/watchdog/SConscript
@@ -0,0 +1,51 @@
+# -*- mode: python -*-
+
+Import('env')
+
+env.Library(
+ target='watchdog',
+ source=[
+ 'watchdog.cpp',
+ ],
+ LIBDEPS=[
+ '$BUILD_DIR/mongo/base',
+ '$BUILD_DIR/mongo/db/service_context',
+ '$BUILD_DIR/mongo/db/storage/storage_options',
+ ]
+)
+
+env.Library(
+ target='watchdog_register',
+ source=[
+ 'watchdog_register.cpp',
+ ],
+)
+
+env.Library(
+ target='watchdog_mongod',
+ source=[
+ 'watchdog_mongod.cpp',
+ env.Idlc('watchdog_mongod.idl')[0],
+ ],
+ LIBDEPS=[
+ '$BUILD_DIR/mongo/db/commands/server_status',
+ 'watchdog',
+ ],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/server_options_core',
+ '$BUILD_DIR/mongo/idl/server_parameter',
+ 'watchdog_register',
+ ],
+)
+
+env.CppUnitTest(
+ target='watchdog_test',
+ source=[
+ 'watchdog_test.cpp',
+ ],
+ LIBDEPS=[
+ '$BUILD_DIR/mongo/db/service_context_test_fixture',
+ '$BUILD_DIR/mongo/util/clock_source_mock',
+ 'watchdog',
+ ],
+)
diff --git a/src/mongo/watchdog/watchdog.cpp b/src/mongo/watchdog/watchdog.cpp
new file mode 100644
index 00000000000..ea1d53b2219
--- /dev/null
+++ b/src/mongo/watchdog/watchdog.cpp
@@ -0,0 +1,537 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kControl
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/watchdog/watchdog.h"
+
+#include <boost/filesystem.hpp>
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+
+#include "mongo/base/static_assert.h"
+#include "mongo/db/client.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/platform/process_id.h"
+#include "mongo/util/concurrency/idle_thread_block.h"
+#include "mongo/util/hex.h"
+#include "mongo/util/log.h"
+#include "mongo/util/timer.h"
+
+
+namespace mongo {
+
+WatchdogPeriodicThread::WatchdogPeriodicThread(Milliseconds period, StringData threadName)
+ : _period(period), _enabled(true), _threadName(threadName.toString()) {}
+
+void WatchdogPeriodicThread::start() {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ invariant(_state == State::kNotStarted);
+ _state = State::kStarted;
+
+ // Start the thread.
+ _thread = stdx::thread([this] { this->doLoop(); });
+ }
+}
+
+void WatchdogPeriodicThread::shutdown() {
+
+ stdx::thread thread;
+
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ bool started = (_state == State::kStarted);
+
+ invariant(_state == State::kNotStarted || _state == State::kStarted);
+
+ if (!started) {
+ _state = State::kDone;
+ return;
+ }
+
+ _state = State::kShutdownRequested;
+
+ std::swap(thread, _thread);
+
+ // Wake up the thread if sleeping so that it will check if we are done.
+ _condvar.notify_one();
+ }
+
+ thread.join();
+
+ _state = State::kDone;
+}
+
+void WatchdogPeriodicThread::setPeriod(Milliseconds period) {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ bool wasEnabled = _enabled;
+
+ if (period < Milliseconds::zero()) {
+ _enabled = false;
+
+ // Leave the thread running but very slowly. If we set this value too high, it would
+ // overflow Duration.
+ _period = Hours(1);
+ } else {
+ _period = period;
+ _enabled = true;
+ }
+
+ if (!wasEnabled && _enabled) {
+ resetState();
+ }
+
+ _condvar.notify_one();
+}
+
+void WatchdogPeriodicThread::doLoop() {
+ Client::initThread(_threadName);
+ Client* client = &cc();
+
+ auto preciseClockSource = client->getServiceContext()->getPreciseClockSource();
+
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ // Ensure state is starting from a clean slate.
+ resetState();
+ }
+
+ while (true) {
+ // Wait for the next run or signal to shutdown.
+
+ auto opCtx = client->makeOperationContext();
+
+ Date_t startTime = preciseClockSource->now();
+
+ {
+ stdx::unique_lock<stdx::mutex> lock(_mutex);
+ MONGO_IDLE_THREAD_BLOCK;
+
+
+ // Check if the period is different?
+ // We are signalled on period changes at which point we may be done waiting or need to
+ // wait longer.
+ while (startTime + _period > preciseClockSource->now() &&
+ _state != State::kShutdownRequested) {
+ auto s = opCtx->waitForConditionOrInterruptNoAssertUntil(
+ _condvar, lock, startTime + _period);
+
+ if (!s.isOK()) {
+ // The only bad status is when we are in shutdown
+ if (!opCtx->getServiceContext()->getKillAllOperations()) {
+ error() << "Watchdog was interrupted, shuting down:, reason: "
+ << s.getStatus();
+ }
+
+ return;
+ }
+ }
+
+ // Are we done running?
+ if (_state == State::kShutdownRequested) {
+ return;
+ }
+
+ // Check if the watchdog checks have been disabled
+ if (!_enabled) {
+ continue;
+ }
+ }
+
+ run(opCtx.get());
+ }
+}
+
+
+WatchdogCheckThread::WatchdogCheckThread(std::vector<std::unique_ptr<WatchdogCheck>> checks,
+ Milliseconds period)
+ : WatchdogPeriodicThread(period, "watchdogCheck"), _checks(std::move(checks)) {}
+
+std::int64_t WatchdogCheckThread::getGeneration() {
+ return _checkGeneration.load();
+}
+
+void WatchdogCheckThread::resetState() {}
+
+void WatchdogCheckThread::run(OperationContext* opCtx) {
+ for (auto& check : _checks) {
+ Timer timer(opCtx->getServiceContext()->getTickSource());
+
+ check->run(opCtx);
+ Microseconds micros = timer.elapsed();
+
+ LOG(1) << "Watchdog test '" << check->getDescriptionForLogging() << "' took "
+ << duration_cast<Milliseconds>(micros);
+
+ // We completed a check, bump the generation counter.
+ _checkGeneration.fetchAndAdd(1);
+ }
+}
+
+
+WatchdogMonitorThread::WatchdogMonitorThread(WatchdogCheckThread* checkThread,
+ WatchdogDeathCallback callback,
+ Milliseconds interval)
+ : WatchdogPeriodicThread(interval, "watchdogMonitor"),
+ _callback(callback),
+ _checkThread(checkThread) {}
+
+std::int64_t WatchdogMonitorThread::getGeneration() {
+ return _monitorGeneration.load();
+}
+
+void WatchdogMonitorThread::resetState() {
+ // Reset the generation so that if the monitor thread is run before the check thread
+ // after being enabled, it does not.
+ _lastSeenGeneration = -1;
+}
+
+void WatchdogMonitorThread::run(OperationContext* opCtx) {
+ auto currentGeneration = _checkThread->getGeneration();
+
+ if (currentGeneration != _lastSeenGeneration) {
+ _lastSeenGeneration = currentGeneration;
+ } else {
+ _callback();
+ }
+}
+
+
+WatchdogMonitor::WatchdogMonitor(std::vector<std::unique_ptr<WatchdogCheck>> checks,
+ Milliseconds checkPeriod,
+ Milliseconds monitorPeriod,
+ WatchdogDeathCallback callback)
+ : _checkPeriod(checkPeriod),
+ _watchdogCheckThread(std::move(checks), checkPeriod),
+ _watchdogMonitorThread(&_watchdogCheckThread, callback, monitorPeriod) {
+ invariant(checkPeriod < monitorPeriod);
+}
+
+void WatchdogMonitor::start() {
+ log() << "Starting Watchdog Monitor";
+
+ // Start the threads.
+ _watchdogCheckThread.start();
+
+ _watchdogMonitorThread.start();
+
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ invariant(_state == State::kNotStarted);
+ _state = State::kStarted;
+ }
+}
+
+void WatchdogMonitor::setPeriod(Milliseconds duration) {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ if (duration > Milliseconds(0)) {
+ dassert(duration >= Milliseconds(1));
+
+ // Make sure that we monitor runs more frequently then checks
+ // 2 feels like an arbitrary good minimum.
+ invariant(duration >= 2 * _checkPeriod);
+
+ _watchdogCheckThread.setPeriod(_checkPeriod);
+ _watchdogMonitorThread.setPeriod(duration);
+
+ log() << "WatchdogMonitor period changed to " << duration_cast<Seconds>(duration);
+ } else {
+ _watchdogMonitorThread.setPeriod(duration);
+ _watchdogCheckThread.setPeriod(duration);
+
+ log() << "WatchdogMonitor disabled";
+ }
+ }
+}
+
+void WatchdogMonitor::shutdown() {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ bool started = (_state == State::kStarted);
+
+ invariant(_state == State::kNotStarted || _state == State::kStarted);
+
+ if (!started) {
+ _state = State::kDone;
+ return;
+ }
+
+ _state = State::kShutdownRequested;
+ }
+
+ _watchdogMonitorThread.shutdown();
+
+ _watchdogCheckThread.shutdown();
+
+ _state = State::kDone;
+}
+
+std::int64_t WatchdogMonitor::getCheckGeneration() {
+ return _watchdogCheckThread.getGeneration();
+}
+
+std::int64_t WatchdogMonitor::getMonitorGeneration() {
+ return _watchdogMonitorThread.getGeneration();
+}
+
+#ifdef _WIN32
+/**
+ * Check a directory is ok
+ * 1. Open up a direct_io to a new file
+ * 2. Write to the file
+ * 3. Seek to the beginning
+ * 4. Read from the file
+ * 5. Close file
+ */
+void checkFile(OperationContext* opCtx, const boost::filesystem::path& file) {
+ Date_t now = opCtx->getServiceContext()->getPreciseClockSource()->now();
+ std::string nowStr = now.toString();
+
+ HANDLE hFile = CreateFileW(file.generic_wstring().c_str(),
+ GENERIC_READ | GENERIC_WRITE,
+ 0, // No Sharing
+ NULL,
+ CREATE_ALWAYS,
+ FILE_ATTRIBUTE_NORMAL,
+ NULL);
+ if (hFile == INVALID_HANDLE_VALUE) {
+ std::uint32_t gle = ::GetLastError();
+ severe() << "CreateFile failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(gle);
+ fassertNoTrace(4074, gle == 0);
+ }
+
+ DWORD bytesWrittenTotal;
+ if (!WriteFile(hFile, nowStr.c_str(), nowStr.size(), &bytesWrittenTotal, NULL)) {
+ std::uint32_t gle = ::GetLastError();
+ severe() << "WriteFile failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(gle);
+ fassertNoTrace(4075, gle == 0);
+ }
+
+ if (bytesWrittenTotal != nowStr.size()) {
+ warning() << "partial write for '" << file.generic_string() << "' expected "
+ << nowStr.size() << " bytes but wrote " << bytesWrittenTotal << " bytes";
+ } else {
+
+ if (!FlushFileBuffers(hFile)) {
+ std::uint32_t gle = ::GetLastError();
+ severe() << "FlushFileBuffers failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(gle);
+ fassertNoTrace(4076, gle == 0);
+ }
+
+ DWORD newOffset = SetFilePointer(hFile, 0, 0, FILE_BEGIN);
+ if (newOffset != 0) {
+ std::uint32_t gle = ::GetLastError();
+ severe() << "SetFilePointer failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(gle);
+ fassertNoTrace(4077, gle == 0);
+ }
+
+ DWORD bytesRead;
+ auto readBuffer = stdx::make_unique<char[]>(nowStr.size());
+ if (!ReadFile(hFile, readBuffer.get(), nowStr.size(), &bytesRead, NULL)) {
+ std::uint32_t gle = ::GetLastError();
+ severe() << "ReadFile failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(gle);
+ fassertNoTrace(4078, gle == 0);
+ }
+
+ if (bytesRead != bytesWrittenTotal) {
+ severe() << "Read wrong number of bytes for '" << file.generic_string() << "' expected "
+ << bytesWrittenTotal << " bytes but read " << bytesRead << " bytes";
+ fassertNoTrace(50724, false);
+ }
+
+ if (memcmp(nowStr.c_str(), readBuffer.get(), nowStr.size()) != 0) {
+ severe() << "Read wrong string from file '" << file.generic_string() << nowStr.size()
+ << " bytes (in hex) '" << toHexLower(nowStr.c_str(), nowStr.size())
+ << "' but read bytes '" << toHexLower(readBuffer.get(), bytesRead) << "'";
+ fassertNoTrace(50717, false);
+ }
+ }
+
+ if (!CloseHandle(hFile)) {
+ std::uint32_t gle = ::GetLastError();
+ severe() << "CloseHandle failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(gle);
+ fassertNoTrace(4079, gle == 0);
+ }
+}
+
+void watchdogTerminate() {
+ ::TerminateProcess(::GetCurrentProcess(), ExitCode::EXIT_WATCHDOG);
+}
+
+#else
+
+/**
+ * Check a directory is ok
+ * 1. Open up a direct_io to a new file
+ * 2. Write to the file
+ * 3. Read from the file
+ * 4. Close file
+ */
+void checkFile(OperationContext* opCtx, const boost::filesystem::path& file) {
+ Date_t now = opCtx->getServiceContext()->getPreciseClockSource()->now();
+ std::string nowStr = now.toString();
+
+ int fd = open(file.generic_string().c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ auto err = errno;
+ severe() << "open failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(err);
+ fassertNoTrace(4080, err == 0);
+ }
+
+ size_t bytesWrittenTotal = 0;
+ while (bytesWrittenTotal < nowStr.size()) {
+ ssize_t bytesWrittenInWrite =
+ write(fd, nowStr.c_str() + bytesWrittenTotal, nowStr.size() - bytesWrittenTotal);
+ if (bytesWrittenInWrite == -1) {
+ auto err = errno;
+ if (err == EINTR) {
+ continue;
+ }
+
+ severe() << "write failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(err);
+ fassertNoTrace(4081, err == 0);
+ }
+
+ // Warn if the write was incomplete
+ if (bytesWrittenTotal == 0 && static_cast<size_t>(bytesWrittenInWrite) != nowStr.size()) {
+ warning() << "parital write for '" << file.generic_string() << "' expected "
+ << nowStr.size() << " bytes but wrote " << bytesWrittenInWrite << " bytes";
+ }
+
+ bytesWrittenTotal += bytesWrittenInWrite;
+ }
+
+ if (fsync(fd)) {
+ auto err = errno;
+ severe() << "fsync failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(err);
+ fassertNoTrace(4082, err == 0);
+ }
+
+ auto readBuffer = stdx::make_unique<char[]>(nowStr.size());
+ size_t bytesReadTotal = 0;
+ while (bytesReadTotal < nowStr.size()) {
+ ssize_t bytesReadInRead = pread(
+ fd, readBuffer.get() + bytesReadTotal, nowStr.size() - bytesReadTotal, bytesReadTotal);
+ if (bytesReadInRead == -1) {
+ auto err = errno;
+ if (err == EINTR) {
+ continue;
+ }
+
+ severe() << "read failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(err);
+ fassertNoTrace(4083, err == 0);
+ } else if (bytesReadInRead == 0) {
+ severe() << "read failed for '" << file.generic_string()
+ << "' with unexpected end of file";
+ fassertNoTrace(50719, false);
+ }
+
+ // Warn if the read was incomplete
+ if (bytesReadTotal == 0 && static_cast<size_t>(bytesReadInRead) != nowStr.size()) {
+ warning() << "partial read for '" << file.generic_string() << "' expected "
+ << nowStr.size() << " bytes but read " << bytesReadInRead << " bytes";
+ }
+
+ bytesReadTotal += bytesReadInRead;
+ }
+
+ if (memcmp(nowStr.c_str(), readBuffer.get(), nowStr.size()) != 0) {
+ severe() << "Read wrong string from file '" << file.generic_string() << "' expected "
+ << nowStr.size() << " bytes (in hex) '"
+ << toHexLower(nowStr.c_str(), nowStr.size()) << "' but read bytes '"
+ << toHexLower(readBuffer.get(), bytesReadTotal) << "'";
+ fassertNoTrace(50718, false);
+ }
+
+ if (close(fd)) {
+ auto err = errno;
+ severe() << "close failed for '" << file.generic_string()
+ << "' with error: " << errnoWithDescription(err);
+ fassertNoTrace(4084, err == 0);
+ }
+}
+
+void watchdogTerminate() {
+ // This calls the exit_group syscall on Linux
+ ::_exit(ExitCode::EXIT_WATCHDOG);
+}
+#endif
+
+constexpr StringData DirectoryCheck::kProbeFileName;
+constexpr StringData DirectoryCheck::kProbeFileNameExt;
+
+void DirectoryCheck::run(OperationContext* opCtx) {
+ // Ensure we have unique file names if multiple processes share the same logging directory
+ boost::filesystem::path file = _directory;
+ file /= kProbeFileName.toString();
+ file += ProcessId::getCurrent().toString();
+ file += kProbeFileNameExt.toString();
+
+ checkFile(opCtx, file);
+
+ // Try to delete the file so it is not leaked on restart, but ignore errors
+ boost::system::error_code ec;
+ boost::filesystem::remove(file, ec);
+ if (ec) {
+ warning() << "Failed to delete file '" << file.generic_string()
+ << "', error: " << ec.message();
+ }
+}
+
+std::string DirectoryCheck::getDescriptionForLogging() {
+ return str::stream() << "checked directory '" << _directory.generic_string() << "'";
+}
+
+} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog.h b/src/mongo/watchdog/watchdog.h
new file mode 100644
index 00000000000..fe0060e3534
--- /dev/null
+++ b/src/mongo/watchdog/watchdog.h
@@ -0,0 +1,385 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <boost/filesystem/path.hpp>
+#include <string>
+#include <vector>
+
+#include "mongo/platform/atomic_word.h"
+#include "mongo/stdx/condition_variable.h"
+#include "mongo/stdx/functional.h"
+#include "mongo/stdx/mutex.h"
+#include "mongo/stdx/thread.h"
+#include "mongo/util/duration.h"
+
+namespace mongo {
+
+class OperationContext;
+
+/**
+ * WatchdogDeathCallback is used by the watchdog component to terminate the process. It is expected
+ * to bypass MongoDB's normal shutdown process. It should not make any syscalls other then to
+ * exit/terminate the process.
+ *
+ * It is pluggable for testing purposes.
+ */
+using WatchdogDeathCallback = stdx::function<void(void)>;
+
+/**
+ * The OS specific implementation of WatchdogDeathCallback that kills the process.
+ */
+void watchdogTerminate();
+
+/**
+ * WatchdogCheck represents a health check that the watchdog will run periodically to ensure the
+ * machine, and process are healthy.
+ *
+ * It is pluggable for testing purposes.
+ */
+class WatchdogCheck {
+public:
+ virtual ~WatchdogCheck() = default;
+
+ /**
+ * Runs a health check against the local machine.
+ *
+ * Note: It should throw exceptions on unexpected errors. Exceptions will result in a call to
+ * WatchdogDeathCallback.
+ */
+ virtual void run(OperationContext* opCtx) = 0;
+
+ /**
+ * Returns a description for the watchdog check to log to the log file.
+ */
+ virtual std::string getDescriptionForLogging() = 0;
+};
+
+/**
+ * Do a health check for a given directory. This health check is done by reading, and writing to a
+ * file with direct I/O.
+ */
+class DirectoryCheck : public WatchdogCheck {
+public:
+ static constexpr StringData kProbeFileName = "watchdog_probe_"_sd;
+ static constexpr StringData kProbeFileNameExt = ".txt"_sd;
+
+public:
+ DirectoryCheck(const boost::filesystem::path& directory) : _directory(directory) {}
+
+ void run(OperationContext* opCtx) final;
+
+ std::string getDescriptionForLogging() final;
+
+private:
+ boost::filesystem::path _directory;
+};
+
+/**
+ * Runs a callback on a periodic basis. The specified time period is the time delay between
+ * invocations.
+ *
+ * Example:
+ * - callback
+ * - sleep(period)
+ * - callback
+ */
+class WatchdogPeriodicThread {
+public:
+ WatchdogPeriodicThread(Milliseconds period, StringData threadName);
+ virtual ~WatchdogPeriodicThread() = default;
+
+ /**
+ * Starts the periodic thread.
+ */
+ void start();
+
+ /**
+ * Updates the period the thread runs its task.
+ *
+ * Period changes take affect immediately.
+ */
+ void setPeriod(Milliseconds period);
+
+ /**
+ * Shutdown the periodic thread. After it is shutdown, it cannot be started.
+ */
+ void shutdown();
+
+protected:
+ /**
+ * Do one iteration of work.
+ */
+ virtual void run(OperationContext* opCtx) = 0;
+
+ /**
+ * Provides an opportunity for derived classes to initialize state.
+ *
+ * This method is called at two different times:
+ * 1. First time a thread is started.
+ * 2. When a thread goes from disabled to enabled. Specifically, a user calls setPeriod(-1)
+ * followed by setPeriod(> 0).
+ *
+ */
+ virtual void resetState() = 0;
+
+private:
+ /**
+ * Main thread loop
+ */
+ void doLoop();
+
+private:
+ /**
+ * Private enum to track state.
+ *
+ * +----------------------------------------------------------------+
+ * | v
+ * +-------------+ +----------+ +--------------------+ +-------+
+ * | kNotStarted | --> | kStarted | --> | kShutdownRequested | --> | kDone |
+ * +-------------+ +----------+ +--------------------+ +-------+
+ */
+ enum class State {
+ /**
+ * Initial state. Either start() or shutdown() can be called next.
+ */
+ kNotStarted,
+
+ /**
+ * start() has been called. shutdown() should be called next.
+ */
+ kStarted,
+
+ /**
+ * shutdown() has been called, and the thread is in progress of shutting down.
+ */
+ kShutdownRequested,
+
+ /**
+ * PeriodicThread has been shutdown.
+ */
+ kDone,
+ };
+
+ // State of PeriodicThread
+ State _state{State::kNotStarted};
+
+ // Thread period
+ Milliseconds _period;
+
+ // if true, then call run() otherwise just let the thread idle,
+ bool _enabled;
+
+ // Name of thread for logging purposes
+ std::string _threadName;
+
+ // The thread
+ stdx::thread _thread;
+
+ // Lock to protect _state and control _thread
+ stdx::mutex _mutex;
+ stdx::condition_variable _condvar;
+};
+
+/**
+ * Periodic background thread to run watchdog checks.
+ */
+class WatchdogCheckThread : public WatchdogPeriodicThread {
+public:
+ WatchdogCheckThread(std::vector<std::unique_ptr<WatchdogCheck>> checks, Milliseconds period);
+
+ /**
+ * Returns the current generation number of the checks.
+ *
+ * Incremented after each check is run.
+ */
+ std::int64_t getGeneration();
+
+private:
+ void run(OperationContext* opCtx) final;
+ void resetState() final;
+
+private:
+ // Vector of checks to run
+ std::vector<std::unique_ptr<WatchdogCheck>> _checks;
+
+ // A counter that is incremented for each watchdog check completed, and monitored to ensure it
+ // does not remain at the same value for too long.
+ AtomicWord<long long> _checkGeneration{0};
+};
+
+/**
+ * Periodic background thread to ensure watchdog checks run periodically.
+ */
+class WatchdogMonitorThread : public WatchdogPeriodicThread {
+public:
+ WatchdogMonitorThread(WatchdogCheckThread* checkThread,
+ WatchdogDeathCallback callback,
+ Milliseconds period);
+
+ /**
+ * Returns the current generation number of the monitor.
+ *
+ * Incremented after each round of monitoring is run.
+ */
+ std::int64_t getGeneration();
+
+private:
+ void run(OperationContext* opCtx) final;
+ void resetState() final;
+
+private:
+ // Callback function to call when watchdog gets stuck
+ const WatchdogDeathCallback _callback;
+
+ // Watchdog check thread to query
+ WatchdogCheckThread* _checkThread;
+
+ // A counter that is incremented for each watchdog monitor run is completed.
+ AtomicWord<long long> _monitorGeneration{0};
+
+ // The last seen _checkGeneration value
+ std::int64_t _lastSeenGeneration{-1};
+};
+
+
+/**
+ * WatchdogMonitor
+ *
+ * The Watchdog is a pair of dedicated threads that try to figure out if a process is hung
+ * and terminate if it is. The worst case scenario in a distributed system is a process that appears
+ * to work but does not actually work.
+ *
+ * The watchdog is not designed to detect all the different ways the process is hung. It's goal is
+ * to detect if the storage system is stuck, and to terminate the process if it is stuck.
+ *
+ * Threads:
+ * WatchdogCheck - runs file system checks
+ * WatchdogMonitor - verifies that WatchdogCheck continue to make timely progress. If WatchdogCheck
+ * fails to make process, WatchdogMonitor calls a callback. The callback is not
+ * expected to do any I/O and minimize the system calls it makes.
+ */
+class WatchdogMonitor {
+public:
+ /**
+ * Create the watchdog with specified period.
+ *
+ * checkPeriod - how often to run the checks
+ * monitorPeriod - how often to run the monitor, must be >= checkPeriod
+ */
+ WatchdogMonitor(std::vector<std::unique_ptr<WatchdogCheck>> checks,
+ Milliseconds checkPeriod,
+ Milliseconds monitorPeriod,
+ WatchdogDeathCallback callback);
+
+ /**
+ * Starts the watchdog threads.
+ */
+ void start();
+
+ /**
+ * Updates the watchdog monitor period. The goal is to detect a failure in the time of the
+ * period.
+ *
+ * Does nothing if watchdog is not started. If watchdog was started, it changes the monitor
+ * period, but not the check period.
+ *
+ * Accepts Milliseconds for testing purposes while the setParameter only works with seconds.
+ */
+ void setPeriod(Milliseconds duration);
+
+ /**
+ * Shutdown the watchdog.
+ */
+ void shutdown();
+
+ /**
+ * Returns the current generation number of the checks.
+ *
+ * Incremented after each round of checks is run.
+ */
+ std::int64_t getCheckGeneration();
+
+ /**
+ * Returns the current generation number of the checks.
+ *
+ * Incremented after each round of checks is run.
+ */
+ std::int64_t getMonitorGeneration();
+
+private:
+ /**
+ * Private enum to track state.
+ *
+ * +----------------------------------------------------------------+
+ * | v
+ * +-------------+ +----------+ +--------------------+ +-------+
+ * | kNotStarted | --> | kStarted | --> | kShutdownRequested | --> | kDone |
+ * +-------------+ +----------+ +--------------------+ +-------+
+ */
+ enum class State {
+ /**
+ * Initial state. Either start() or shutdown() can be called next.
+ */
+ kNotStarted,
+
+ /**
+ * start() has been called. shutdown() should be called next.
+ */
+ kStarted,
+
+ /**
+ * shutdown() has been called, and the background threads are in progress of shutting down.
+ */
+ kShutdownRequested,
+
+ /**
+ * Watchdog has been shutdown.
+ */
+ kDone,
+ };
+
+ // Lock to protect _state and control _thread
+ stdx::mutex _mutex;
+
+ // State of watchdog
+ State _state{State::kNotStarted};
+
+ // Fixed period for running the checks.
+ Milliseconds _checkPeriod;
+
+ // WatchdogCheck Thread - runs checks
+ WatchdogCheckThread _watchdogCheckThread;
+
+ // WatchdogMonitor Thread - watches _watchdogCheckThread
+ WatchdogMonitorThread _watchdogMonitorThread;
+};
+
+} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_mongod.cpp b/src/mongo/watchdog/watchdog_mongod.cpp
new file mode 100644
index 00000000000..c223e9fc24d
--- /dev/null
+++ b/src/mongo/watchdog/watchdog_mongod.cpp
@@ -0,0 +1,201 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/watchdog/watchdog_mongod.h"
+
+#include <boost/filesystem.hpp>
+
+#include "mongo/base/init.h"
+#include "mongo/config.h"
+#include "mongo/db/client.h"
+#include "mongo/db/commands/server_status.h"
+#include "mongo/db/server_options.h"
+#include "mongo/db/service_context.h"
+#include "mongo/db/storage/storage_options.h"
+#include "mongo/stdx/memory.h"
+#include "mongo/unittest/unittest.h"
+#include "mongo/util/clock_source.h"
+#include "mongo/util/clock_source_mock.h"
+#include "mongo/util/log.h"
+#include "mongo/util/tick_source_mock.h"
+#include "mongo/watchdog/watchdog.h"
+#include "mongo/watchdog/watchdog_mongod_gen.h"
+#include "mongo/watchdog/watchdog_register.h"
+
+namespace mongo {
+
+// Run the watchdog checks at a fixed interval regardless of user choice for monitoring period.
+constexpr Seconds watchdogCheckPeriod = Seconds{10};
+
+namespace {
+
+const auto getWatchdogMonitor =
+ ServiceContext::declareDecoration<std::unique_ptr<WatchdogMonitor>>();
+
+// A boolean variable to track whether the watchdog was enabled at startup.
+// Defaults to true because set parameters are handled before we start the watchdog if needed.
+bool watchdogEnabled{true};
+
+WatchdogMonitor* getGlobalWatchdogMonitor() {
+ if (!hasGlobalServiceContext()) {
+ return nullptr;
+ }
+
+ return getWatchdogMonitor(getGlobalServiceContext()).get();
+}
+
+} // namespace
+
+Status validateWatchdogPeriodSeconds(const int& value) {
+ if (value < 60 && value != -1) {
+
+ return {ErrorCodes::BadValue, "watchdogPeriodSeconds must be greater than or equal to 60s"};
+ }
+
+ // If the watchdog was not enabled at startup, disallow changes the period.
+ if (!watchdogEnabled) {
+ return {ErrorCodes::BadValue,
+ "watchdogPeriodSeconds cannot be changed at runtime if it was not set at startup"};
+ }
+
+ return Status::OK();
+}
+
+Status onUpdateWatchdogPeriodSeconds(const int& value) {
+ auto monitor = getGlobalWatchdogMonitor();
+ if (monitor) {
+ monitor->setPeriod(Seconds(value));
+ }
+
+ return Status::OK();
+}
+
+/**
+ * Server status section for the Watchdog.
+ *
+ * Sample format:
+ *
+ * watchdog: {
+ * generation: int,
+ * }
+ */
+class WatchdogServerStatusSection : public ServerStatusSection {
+public:
+ WatchdogServerStatusSection() : ServerStatusSection("watchdog") {}
+ bool includeByDefault() const {
+ // Only include this by default if the watchdog is on
+ return watchdogEnabled;
+ }
+
+ BSONObj generateSection(OperationContext* opCtx, const BSONElement& configElement) const {
+ BSONObjBuilder result;
+
+ WatchdogMonitor* watchdog = getWatchdogMonitor(opCtx->getServiceContext()).get();
+
+ result.append("checkGeneration", watchdog->getCheckGeneration());
+ result.append("monitorGeneration", watchdog->getMonitorGeneration());
+ result.append("monitorPeriod", gWatchdogPeriodSeconds.load());
+
+ return result.obj();
+ }
+} watchdogServerStatusSection;
+
+void startWatchdog() {
+ // Check three paths if set
+ // 1. storage directory - optional for inmemory?
+ // 2. log path - optional
+ // 3. audit path - optional
+
+ Seconds period{gWatchdogPeriodSeconds.load()};
+ if (period < Seconds::zero()) {
+ // Skip starting the watchdog if the user has not asked for it.
+ watchdogEnabled = false;
+ return;
+ }
+
+ watchdogEnabled = true;
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+
+ auto dataCheck =
+ stdx::make_unique<DirectoryCheck>(boost::filesystem::path(storageGlobalParams.dbpath));
+
+ checks.push_back(std::move(dataCheck));
+
+ // Add a check for the journal if it is not disabled
+ if (storageGlobalParams.dur) {
+ auto journalDirectory = boost::filesystem::path(storageGlobalParams.dbpath);
+ journalDirectory /= "journal";
+
+ if (boost::filesystem::exists(journalDirectory)) {
+ auto journalCheck = stdx::make_unique<DirectoryCheck>(journalDirectory);
+
+ checks.push_back(std::move(journalCheck));
+ } else {
+ warning()
+ << "Watchdog is skipping check for journal directory since it does not exist: '"
+ << journalDirectory.generic_string() << "'";
+ }
+ }
+
+ // If the user specified a log path, also monitor that directory.
+ // This may be redudant with the dbpath check but there is not easy way to confirm they are
+ // duplicate.
+ if (!serverGlobalParams.logpath.empty()) {
+ boost::filesystem::path logFile(serverGlobalParams.logpath);
+ auto logPath = logFile.parent_path();
+
+ auto logCheck = stdx::make_unique<DirectoryCheck>(logPath);
+ checks.push_back(std::move(logCheck));
+ }
+
+ // If the user specified an audit path, also monitor that directory.
+ // This may be redudant with the dbpath check but there is not easy way to confirm they are
+ // duplicate.
+ for (auto&& path : getWatchdogPaths()) {
+ auto auditCheck = stdx::make_unique<DirectoryCheck>(path);
+ checks.push_back(std::move(auditCheck));
+ }
+
+ auto monitor = stdx::make_unique<WatchdogMonitor>(
+ std::move(checks), watchdogCheckPeriod, period, watchdogTerminate);
+
+ // Install the new WatchdogMonitor
+ auto& staticMonitor = getWatchdogMonitor(getGlobalServiceContext());
+
+ staticMonitor = std::move(monitor);
+
+ staticMonitor->start();
+}
+
+} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_mongod.h b/src/mongo/watchdog/watchdog_mongod.h
new file mode 100644
index 00000000000..186e21e4a47
--- /dev/null
+++ b/src/mongo/watchdog/watchdog_mongod.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/status.h"
+
+namespace mongo {
+
+/**
+* Start the watchdog.
+*/
+void startWatchdog();
+
+/**
+ * Callbacks used by the 'watchdogPeriodSeconds' set parameter.
+ */
+Status validateWatchdogPeriodSeconds(const int& value);
+Status onUpdateWatchdogPeriodSeconds(const int& value);
+
+} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_mongod.idl b/src/mongo/watchdog/watchdog_mongod.idl
new file mode 100644
index 00000000000..61e6fd605b0
--- /dev/null
+++ b/src/mongo/watchdog/watchdog_mongod.idl
@@ -0,0 +1,43 @@
+# Copyright (C) 2019-present MongoDB, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the Server Side Public License, version 1,
+# as published by MongoDB, Inc.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Server Side Public License for more details.
+#
+# You should have received a copy of the Server Side Public License
+# along with this program. If not, see
+# <http://www.mongodb.com/licensing/server-side-public-license>.
+#
+# As a special exception, the copyright holders give permission to link the
+# code of portions of this program with the OpenSSL library under certain
+# conditions as described in each individual source file and distribute
+# linked combinations including the program with the OpenSSL library. You
+# must comply with the Server Side Public License in all respects for
+# all of the code used other than as permitted herein. If you modify file(s)
+# with this exception, you may extend this exception to your version of the
+# file(s), but you are not obligated to do so. If you do not wish to do so,
+# delete this exception statement from your version. If you delete this
+# exception statement from all source files in the program, then also delete
+# it in the license file.
+#
+
+global:
+ cpp_namespace: "mongo"
+ cpp_includes:
+ - "mongo/watchdog/watchdog_mongod.h"
+
+server_parameters:
+ "watchdogPeriodSeconds":
+ description: 'Watchdog Period (seconds)'
+ set_at: [ startup, runtime ]
+ cpp_vartype: 'AtomicWord<int>'
+ cpp_varname: gWatchdogPeriodSeconds
+ default: -1
+ validator:
+ callback: validateWatchdogPeriodSeconds
+ on_update: onUpdateWatchdogPeriodSeconds
diff --git a/src/mongo/watchdog/watchdog_register.cpp b/src/mongo/watchdog/watchdog_register.cpp
new file mode 100644
index 00000000000..d1976a4c74c
--- /dev/null
+++ b/src/mongo/watchdog/watchdog_register.cpp
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/watchdog/watchdog_register.h"
+
+namespace mongo {
+
+namespace {
+
+std::vector<std::string> watchdogPaths;
+
+} // namespace
+
+void registerWatchdogPath(StringData path) {
+ if (!path.empty()) {
+ watchdogPaths.push_back(path.toString());
+ }
+}
+
+std::vector<std::string>& getWatchdogPaths() {
+ return watchdogPaths;
+}
+
+} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_register.h b/src/mongo/watchdog/watchdog_register.h
new file mode 100644
index 00000000000..822d5d24c6e
--- /dev/null
+++ b/src/mongo/watchdog/watchdog_register.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "mongo/base/string_data.h"
+
+namespace mongo {
+
+/**
+ * Allow components a way to tell the watchdog what to watch.
+ */
+void registerWatchdogPath(StringData path);
+
+/**
+ * Get list of registered watchdog paths.
+ */
+std::vector<std::string>& getWatchdogPaths();
+
+} // namespace mongo
diff --git a/src/mongo/watchdog/watchdog_test.cpp b/src/mongo/watchdog/watchdog_test.cpp
new file mode 100644
index 00000000000..fd3b34602c9
--- /dev/null
+++ b/src/mongo/watchdog/watchdog_test.cpp
@@ -0,0 +1,480 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/watchdog/watchdog.h"
+
+#include "mongo/db/client.h"
+#include "mongo/db/service_context.h"
+#include "mongo/db/service_context_test_fixture.h"
+#include "mongo/stdx/memory.h"
+#include "mongo/unittest/death_test.h"
+#include "mongo/unittest/temp_dir.h"
+#include "mongo/unittest/unittest.h"
+#include "mongo/util/clock_source.h"
+#include "mongo/util/clock_source_mock.h"
+#include "mongo/util/log.h"
+#include "mongo/util/tick_source_mock.h"
+
+namespace mongo {
+
+class TestPeriodicThread : public WatchdogPeriodicThread {
+public:
+ TestPeriodicThread(Milliseconds period) : WatchdogPeriodicThread(period, "testPeriodic") {}
+
+ void run(OperationContext* opCtx) final {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ ++_counter;
+ }
+
+ if (_counter == _wait) {
+ _condvar.notify_all();
+ }
+ }
+
+ void setSignalOnCount(int c) {
+ _wait = c;
+ }
+
+ void waitForCount() {
+ invariant(_wait != 0);
+
+ stdx::unique_lock<stdx::mutex> lock(_mutex);
+ while (_counter < _wait) {
+ _condvar.wait(lock);
+ }
+ }
+
+ void resetState() final {}
+
+ std::uint32_t getCounter() {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ return _counter;
+ }
+ }
+
+private:
+ std::uint32_t _counter{0};
+
+ stdx::mutex _mutex;
+ stdx::condition_variable _condvar;
+ std::uint32_t _wait{0};
+};
+
+class PeriodicThreadTest : public ServiceContextTest {};
+
+// Tests:
+// 1. Make sure it runs at least N times
+// 2. Make sure it responds to stop after being paused
+// 3. Make sure it can be resumed
+// 4. Make sure the period can be changed like from 1 minute -> 1 milli
+
+// Positive: Make sure periodic thread runs at least N times and stops correctly
+TEST_F(PeriodicThreadTest, Basic) {
+
+ TestPeriodicThread testThread(Milliseconds(5));
+
+ testThread.setSignalOnCount(5);
+
+ testThread.start();
+
+ testThread.waitForCount();
+
+ testThread.shutdown();
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t lastCounter = testThread.getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ ASSERT_EQ(lastCounter, testThread.getCounter());
+}
+
+// Positive: Make sure it stops after being paused
+TEST_F(PeriodicThreadTest, PauseAndStop) {
+
+ TestPeriodicThread testThread(Milliseconds(5));
+ testThread.setSignalOnCount(5);
+
+ testThread.start();
+
+ testThread.waitForCount();
+
+ // Stop the thread by setting a -1 duration
+ testThread.setPeriod(Milliseconds(-1));
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t pauseCounter = testThread.getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ // We could have had one more run of the loop as we paused - allow for that case
+ // but no other runs of the thread.
+ ASSERT_GTE(pauseCounter + 1, testThread.getCounter());
+
+ testThread.shutdown();
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t stopCounter = testThread.getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ ASSERT_EQ(stopCounter, testThread.getCounter());
+}
+
+// Positive: Make sure it can be paused and resumed
+TEST_F(PeriodicThreadTest, PauseAndResume) {
+
+ TestPeriodicThread testThread(Milliseconds(5));
+ testThread.setSignalOnCount(5);
+
+ testThread.start();
+
+ testThread.waitForCount();
+
+ // Stop the thread by setting a -1 duration
+ testThread.setPeriod(Milliseconds(-1));
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t pauseCounter = testThread.getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ // We could have had one more run of the loop as we paused - allow for that case
+ // but no other runs of the thread.
+ ASSERT_GTE(pauseCounter + 1, testThread.getCounter());
+
+ // Make sure we can resume the thread again
+ std::uint32_t baseCounter = testThread.getCounter();
+ testThread.setSignalOnCount(baseCounter + 5);
+
+ testThread.setPeriod(Milliseconds(7));
+
+ testThread.waitForCount();
+
+ testThread.shutdown();
+}
+
+/**
+ * Simple class to ensure we run checks.
+ */
+class TestCounterCheck : public WatchdogCheck {
+public:
+ void run(OperationContext* opCtx) final {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ ++_counter;
+ }
+
+ if (_counter == _wait) {
+ _condvar.notify_all();
+ }
+ }
+
+ std::string getDescriptionForLogging() final {
+ return "test";
+ }
+
+ void setSignalOnCount(int c) {
+ _wait = c;
+ }
+
+ void waitForCount() {
+ invariant(_wait != 0);
+
+ stdx::unique_lock<stdx::mutex> lock(_mutex);
+ while (_counter < _wait) {
+ _condvar.wait(lock);
+ }
+ }
+
+ std::uint32_t getCounter() {
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ return _counter;
+ }
+ }
+
+private:
+ std::uint32_t _counter{0};
+
+ stdx::mutex _mutex;
+ stdx::condition_variable _condvar;
+ std::uint32_t _wait{0};
+};
+
+class WatchdogCheckThreadTest : public ServiceContextTest {};
+
+// Positive: Make sure check thread runs at least N times and stops correctly
+TEST_F(WatchdogCheckThreadTest, Basic) {
+ auto counterCheck = stdx::make_unique<TestCounterCheck>();
+ auto counterCheckPtr = counterCheck.get();
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+ checks.push_back(std::move(counterCheck));
+
+ WatchdogCheckThread testThread(std::move(checks), Milliseconds(5));
+
+ counterCheckPtr->setSignalOnCount(5);
+
+ testThread.start();
+
+ counterCheckPtr->waitForCount();
+
+ testThread.shutdown();
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t lastCounter = counterCheckPtr->getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ ASSERT_EQ(lastCounter, counterCheckPtr->getCounter());
+}
+
+/**
+ * A class that models the behavior of Windows' manual reset Event object.
+ */
+class ManualResetEvent {
+public:
+ void set() {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+
+ _set = true;
+ _condvar.notify_one();
+ }
+
+ void wait() {
+ stdx::unique_lock<stdx::mutex> lock(_mutex);
+
+ _condvar.wait(lock, [this]() { return _set; });
+ }
+
+private:
+ bool _set{false};
+
+ stdx::mutex _mutex;
+ stdx::condition_variable _condvar;
+};
+
+
+class WatchdogMonitorThreadTest : public ServiceContextTest {};
+
+// Positive: Make sure monitor thread signals death if the check thread never starts
+TEST_F(WatchdogMonitorThreadTest, Basic) {
+ ManualResetEvent deathEvent;
+ WatchdogDeathCallback deathCallback = [&deathEvent]() {
+ log() << "Death signalled";
+ deathEvent.set();
+ };
+
+ auto counterCheck = stdx::make_unique<TestCounterCheck>();
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+ checks.push_back(std::move(counterCheck));
+
+ WatchdogCheckThread checkThread(std::move(checks), Milliseconds(5));
+
+ WatchdogMonitorThread monitorThread(&checkThread, deathCallback, Milliseconds(5));
+
+ monitorThread.start();
+
+ deathEvent.wait();
+
+ monitorThread.shutdown();
+}
+
+/**
+ * Sleep after doing a few checks to replicate a hung check.
+ */
+class SleepyCheck : public WatchdogCheck {
+public:
+ void run(OperationContext* opCtx) final {
+ ++_counter;
+
+ if (_counter >= 6) {
+ sleepFor(Seconds(5));
+ }
+ }
+
+ std::string getDescriptionForLogging() final {
+ return "test";
+ }
+
+private:
+ std::uint32_t _counter{0};
+};
+
+// Positive: Make sure monitor thread signals death if the thread does not make progress
+TEST_F(WatchdogMonitorThreadTest, SleepyHungCheck) {
+ ManualResetEvent deathEvent;
+ WatchdogDeathCallback deathCallback = [&deathEvent]() {
+ log() << "Death signalled";
+ deathEvent.set();
+ };
+
+ auto sleepyCheck = stdx::make_unique<SleepyCheck>();
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+ checks.push_back(std::move(sleepyCheck));
+
+ WatchdogCheckThread checkThread(std::move(checks), Milliseconds(1));
+
+ WatchdogMonitorThread monitorThread(&checkThread, deathCallback, Milliseconds(100));
+
+ checkThread.start();
+
+ monitorThread.start();
+
+ deathEvent.wait();
+
+ // Make sure we actually did some checks
+ ASSERT_GTE(checkThread.getGeneration(), 2);
+
+ monitorThread.shutdown();
+
+ checkThread.shutdown();
+}
+
+class WatchdogMonitorTest : public ServiceContextTest {};
+
+// Positive: Make sure watchdog monitor signals death if a check is unresponsive
+TEST_F(WatchdogMonitorTest, SleepyHungCheck) {
+ ManualResetEvent deathEvent;
+ WatchdogDeathCallback deathCallback = [&deathEvent]() {
+ log() << "Death signalled";
+ deathEvent.set();
+ };
+
+ auto sleepyCheck = stdx::make_unique<SleepyCheck>();
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+ checks.push_back(std::move(sleepyCheck));
+
+ WatchdogMonitor monitor(std::move(checks), Milliseconds(1), Milliseconds(5), deathCallback);
+
+ monitor.start();
+
+ deathEvent.wait();
+
+ monitor.shutdown();
+}
+
+// Positive: Make sure watchdog monitor terminates the process if a check is unresponsive
+DEATH_TEST(WatchdogMonitorTest, Death, "") {
+ auto sleepyCheck = stdx::make_unique<SleepyCheck>();
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+ checks.push_back(std::move(sleepyCheck));
+
+ WatchdogMonitor monitor(
+ std::move(checks), Milliseconds(1), Milliseconds(100), watchdogTerminate);
+
+ monitor.start();
+
+ sleepmillis(1000);
+}
+
+// Positive: Make sure the monitor can be paused and resumed, and it does not trigger death
+TEST_F(WatchdogMonitorTest, PauseAndResume) {
+
+ WatchdogDeathCallback deathCallback = []() {
+ log() << "Death signalled, it should not have been";
+ invariant(false);
+ };
+
+ auto counterCheck = stdx::make_unique<TestCounterCheck>();
+ auto counterCheckPtr = counterCheck.get();
+
+ std::vector<std::unique_ptr<WatchdogCheck>> checks;
+ checks.push_back(std::move(counterCheck));
+
+ WatchdogMonitor monitor(std::move(checks), Milliseconds(1), Milliseconds(1001), deathCallback);
+
+ counterCheckPtr->setSignalOnCount(5);
+
+ monitor.start();
+
+ counterCheckPtr->waitForCount();
+
+ // Pause the monitor
+ monitor.setPeriod(Milliseconds(-1));
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t pauseCounter = counterCheckPtr->getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ // We could have had one more run of the loop as we paused - allow for that case
+ // but no other runs of the thread.
+ ASSERT_GTE(pauseCounter + 1, counterCheckPtr->getCounter());
+
+ // Resume the monitor
+ std::uint32_t baseCounter = counterCheckPtr->getCounter();
+ counterCheckPtr->setSignalOnCount(baseCounter + 5);
+
+ // Restart the monitor with a different interval.
+ monitor.setPeriod(Milliseconds(1007));
+
+ counterCheckPtr->waitForCount();
+
+ monitor.shutdown();
+
+ // Check the counter after it is shutdown and make sure it does not change.
+ std::uint32_t lastCounter = counterCheckPtr->getCounter();
+
+ // This is racey but it should only produce false negatives
+ sleepmillis(100);
+
+ ASSERT_EQ(lastCounter, counterCheckPtr->getCounter());
+}
+
+class DirectoryCheckTest : public ServiceContextTest {};
+
+// Positive: Do a sanity check that directory check passes
+TEST_F(DirectoryCheckTest, Basic) {
+ unittest::TempDir tempdir("watchdog_testpath");
+
+ DirectoryCheck check(tempdir.path());
+
+ auto opCtx = makeOperationContext();
+ check.run(opCtx.get());
+}
+
+} // namespace mongo