We have a background thread running (albeit it doesn't do what we want yet) 51403

author: Brian DeLeonardis <brian.deleonardis@mongodb.com> 2020-11-12 00:17:02 +0000
committer: Brian DeLeonardis <brian.deleonardis@mongodb.com> 2020-11-12 00:17:02 +0000
commit: 653817f5d5739753693285dec52497beda2bb5a6 (patch)
tree: 54d328c4d2b29a0d5301409b0362923f1d21191f
parent: 04415c7f5e6c824d0683da7b9a647fd785d03efb (diff)
download: mongo-653817f5d5739753693285dec52497beda2bb5a6.tar.gz
4 files changed, 268 insertions, 14 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_metrics.yml b/buildscripts/resmokeconfig/suites/concurrency_metrics.yml
index da6143d3216..64d0ac3f970 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_metrics.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_metrics.yml
@@ -31,13 +31,12 @@ executor:
     shell_options:
       readMode: commands
   hooks:
-    - AggregateResourceConsumptionMetricsInBackground
+    - class: AggregateResourceConsumptionMetricsInBackground
   fixture:
     class: ReplicaSetFixture
     mongod_options:
-      oplogSize: 1024
       set_parameters:
         enableTestCommands: 1
         measureOperationResourceConsumption: true
         aggregateOperationResourceConsumptionMetrics: true
-      num_nodes: 3
-\ No newline at end of file
+    num_nodes: 2
+\ No newline at end of file
diff --git a/buildscripts/resmokelib/testing/executor.py b/buildscripts/resmokelib/testing/executor.py
index 4fd76ea7093..c7e0beb24ce 100644
--- a/buildscripts/resmokelib/testing/executor.py
+++ b/buildscripts/resmokelib/testing/executor.py
@@ -43,6 +43,7 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
         else:
             self.fixture_config = fixture
 
+        print("hooks:", hooks)
         self.hooks_config = utils.default_if_none(hooks, [])
         self.test_config = utils.default_if_none(config, {})
 
@@ -53,7 +54,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
 
         self._suite = suite
         self.num_tests = len(suite.tests) * suite.options.num_repeat_tests
-        self.test_queue_logger = logging.loggers.new_testqueue_logger(suite.test_kind)
+        self.test_queue_logger = logging.loggers.new_testqueue_logger(
+            suite.test_kind)
 
         # Must be done after getting buildlogger configuration.
         self._jobs = self._create_jobs(self.num_tests)
@@ -117,12 +119,14 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
                 # still running if an Evergreen task were to time out from a hang/deadlock being
                 # triggered.
                 teardown_flag = threading.Event() if num_repeat_suites == 1 else None
-                (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag)
+                (report, interrupted) = self._run_tests(
+                    test_queue, setup_flag, teardown_flag)
 
                 self._suite.record_test_end(report)
 
                 if setup_flag and setup_flag.is_set():
-                    self.logger.error("Setup of one of the job fixtures failed")
+                    self.logger.error(
+                        "Setup of one of the job fixtures failed")
                     return_code = 2
                     return
                 # Remove the setup flag once the first suite ran.
@@ -137,7 +141,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
 
                 sb = []  # String builder.
                 self._suite.summarize_latest(sb)
-                self.logger.info("Summary of latest execution: %s", "\n    ".join(sb))
+                self.logger.info(
+                    "Summary of latest execution: %s", "\n    ".join(sb))
 
                 if not report.wasSuccessful():
                     return_code = 1
@@ -201,7 +206,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
         wait_secs = 2.0
         self.logger.debug("Waiting for threads to complete")
 
-        timer = threading.Timer(wait_secs, self._log_timeout_warning, args=[wait_secs])
+        timer = threading.Timer(
+            wait_secs, self._log_timeout_warning, args=[wait_secs])
         timer.daemon = True
         timer.start()
         try:
@@ -243,7 +249,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
             fixture_config = self.fixture_config.copy()
             fixture_class = fixture_config.pop("class")
 
-        fixture_logger = logging.loggers.new_fixture_logger(fixture_class, job_num)
+        fixture_logger = logging.loggers.new_fixture_logger(
+            fixture_class, job_num)
 
         return fixtures.make_fixture(fixture_class, fixture_logger, job_num, **fixture_config)
 
@@ -257,7 +264,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
             hook_class = hook_config.pop("class")
 
             hook_logger = logging.loggers.new_hook_logger(hook_class, job_num)
-            hook = _hooks.make_hook(hook_class, hook_logger, fixture, **hook_config)
+            hook = _hooks.make_hook(
+                hook_class, hook_logger, fixture, **hook_config)
             hooks.append(hook)
 
         return hooks
@@ -269,7 +277,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
         :param job_num: instance number of job being created.
         :return: Job instance.
         """
-        job_logger = logging.loggers.new_job_logger(self._suite.test_kind, job_num)
+        job_logger = logging.loggers.new_job_logger(
+            self._suite.test_kind, job_num)
 
         fixture = self._make_fixture(job_num)
         hooks = self._make_hooks(fixture, job_num)
diff --git a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py
index 73de9ee8d61..6816695f729 100644
--- a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py
+++ b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py
@@ -1,11 +1,75 @@
-"""Test hook for running the $operationMetrics stage in the background. 
+"""Test hook for running the $operationMetrics stage in the background.
 
 This hook runs every five seconds.
 """
 
+import os.path
+
+from buildscripts.resmokelib import errors
 from buildscripts.resmokelib.testing.hooks import jsfile
-from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob
+from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase
 
 
-class EnsureOperationMetricsAreAggregatedInBackground(jsfile.JSHook):
+class AggregateResourceConsumptionMetricsInBackground(jsfile.JSHook):
     """A hook to run $operationMetrics stage in the background"""
+
+    def __init__(self, hook_logger, fixture, shell_options=None):
+        """Initialize AggregateResourceConsumptionMetricsInBackground."""
+        description = "Run background $operationMetrics on all mongods while a test is running"
+        js_filename = os.path.join(
+            "jstests", "hooks", "run_aggregate_metrics_background.js")
+        jsfile.JSHook.__init__(self, hook_logger, fixture,
+                               js_filename, description, shell_options=shell_options)
+        self._background_job = None
+
+    def before_suite(self, test_report):
+        """Start the background thread."""
+        self._background_job = _BackgroundJob(
+            "AggregateResourceConsumptionMetricsInBackground")
+        self.logger.info("Starting the background aggregate metrics thread.")
+        self._background_job.start()
+
+    def after_suite(self, test_report):
+        """Signal the background aggregate metrics thread to exit, and wait until it does."""
+        if self._background_job is None:
+            return
+
+        self.logger.info("Stopping the background aggregate metrics thread.")
+        self._background_job.stop()
+
+    def before_test(self, test, test_report):
+        """Instruct the background aggregate metrics thread to run while 'test' is also running."""
+        if self._background_job is None:
+            return
+
+        hook_test_case = _ContinuousDynamicJSTestCase.create_before_test(
+            self.logger, test, self, self._js_filename, self._shell_options)
+        hook_test_case.configure(self.fixture)
+
+        self.logger.info(
+            "Resuming the background aggregate metrics thread.")
+        self._background_job.resume(hook_test_case, test_report)
+
+    def after_test(self, test, test_report):  # noqa: D205,D400
+        """Instruct the background aggregate metrics thread to stop running now that 'test' has
+        finished running.
+        """
+        if self._background_job is None:
+            return
+
+        self.logger.info(
+            "Pausing the background aggregate metrics thread.")
+        self._background_job.pause()
+
+        if self._background_job.exc_info is not None:
+            if isinstance(self._background_job.exc_info[1], errors.TestFailure):
+                # If the mongo shell process running the JavaScript file exited with a non-zero
+                # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's
+                # test execution to stop.
+                raise errors.ServerFailure(
+                    self._background_job.exc_info[1].args[0])
+            else:
+                self.logger.error(
+                    "Encountered an error inside the background aggregate metrics thread.",
+                    exc_info=self._background_job.exc_info)
+                raise self._background_job.exc_info[1]
diff --git a/jstests/hooks/run_aggregate_metrics_background.js b/jstests/hooks/run_aggregate_metrics_background.js
new file mode 100644
index 00000000000..646d0c5f06f
--- /dev/null
+++ b/jstests/hooks/run_aggregate_metrics_background.js
@@ -0,0 +1,182 @@
+/** TODO: fix all these comments
+ * Runs the validate command with {background:true} against all nodes (replica set members and
+ * standalone nodes, not sharded clusters) concurrently with running tests.
+ */
+
+'use strict';
+
+(function() {
+
+print("yipppeee ki ya were in a background thread!!!");
+
+load('jstests/libs/discover_topology.js');  // For Topology and DiscoverTopology.
+load('jstests/libs/parallelTester.js');     // For Thread.
+
+if (typeof db === 'undefined') {
+    throw new Error(
+        "Expected mongo shell to be connected a server, but global 'db' object isn't defined");
+}
+
+// Disable implicit sessions so FSM workloads that kill random sessions won't interrupt the
+// operations in this test that aren't resilient to interruptions.
+TestData.disableImplicitSessions = true;
+
+const conn = db.getMongo();
+const topology = DiscoverTopology.findConnectedNodes(conn);
+
+/**
+ * Returns true if the error code is transient and does not indicate data corruption.
+ */
+const isIgnorableError = function ignorableError(codeName) {
+    if (codeName == "NamespaceNotFound" || codeName == "Interrupted" ||
+        codeName == "CommandNotSupportedOnView" || codeName == "InterruptedAtShutdown" ||
+        codeName == "InvalidViewDefinition") {
+        return true;
+    }
+    return false;
+};
+
+/**
+ * Runs validate commands with {background:true} against 'host' for all collections it possesses.
+ *
+ * Returns the cumulative command failure results, if there are any, in an object
+ * { ok: 0, error: [{cmd-res}, {cmd-res}, ... ]}
+ * Or simply OK if all cmds were successful.
+ * {ok: 1}
+ *
+ * This function should not throw if everything is working properly.
+ */
+const validateCollectionsBackgroundThread = function validateCollectionsBackground(
+    host, isIgnorableErrorFunc) {
+    // Calls 'func' with the print() function overridden to be a no-op.
+    const quietly = (func) => {
+        const printOriginal = print;
+        try {
+            print = Function.prototype;
+            func();
+        } finally {
+            print = printOriginal;
+        }
+    };
+
+    // Suppress the log messages generated establishing new mongo connections. The
+    // run_validate_collections_background.js hook is executed frequently by resmoke.py and
+    // could lead to generating an overwhelming amount of log messages.
+    let conn;
+    quietly(() => {
+        conn = new Mongo(host);
+    });
+    assert.neq(null,
+               conn,
+               "Failed to connect to host '" + host + "' for background collection validation");
+
+    // Filter out arbiters.
+    if (conn.adminCommand({isMaster: 1}).arbiterOnly) {
+        print("Skipping background validation against test node: " + host +
+              " because it is an arbiter and has no data.");
+        return {ok: 1};
+    }
+
+    print("Running background validation on all collections on test node: " + host);
+
+    // Save a map of namespace to validate cmd results for any cmds that fail so that we can return
+    // the results afterwards.
+    let failedValidateResults = [];
+
+    // Validate all collections in every database.
+
+    const dbNames =
+        assert
+            .commandWorked(conn.adminCommand(
+                {"listDatabases": 1, "nameOnly": true, "$readPreference": {"mode": "nearest"}}))
+            .databases.map(function(z) {
+                return z.name;
+            });
+
+    conn.adminCommand({configureFailPoint: "crashOnMultikeyValidateFailure", mode: "alwaysOn"});
+    for (let dbName of dbNames) {
+        let db = conn.getDB(dbName);
+
+        const listCollRes = assert.commandWorked(db.runCommand({
+            "listCollections": 1,
+            "nameOnly": true,
+            "filter": {$or: [{type: 'collection'}, {type: {$exists: false}}]},
+            "$readPreference": {"mode": "nearest"},
+        }));
+        const collectionNames = new DBCommandCursor(db, listCollRes).map(function(z) {
+            return z.name;
+        });
+
+        for (let collectionName of collectionNames) {
+            let res = conn.getDB(dbName).getCollection(collectionName).runCommand({
+                "validate": collectionName,
+                background: true,
+                "$readPreference": {"mode": "nearest"}
+            });
+
+            if ((!res.ok && !isIgnorableErrorFunc(res.codeName)) || (res.valid === false)) {
+                failedValidateResults.push({"ns": dbName + "." + collectionName, "res": res});
+            }
+        }
+    }
+    conn.adminCommand({configureFailPoint: "crashOnMultikeyValidateFailure", mode: "off"});
+
+    // If any commands failed, format and return an error.
+    if (failedValidateResults.length) {
+        let errorsArray = [];
+        for (let nsAndRes of failedValidateResults) {
+            errorsArray.push({"namespace": nsAndRes.ns, "res": nsAndRes.res});
+        }
+
+        const heading = "Validate command(s) with {background:true} failed against mongod";
+        print(heading + " '" + conn.host + "': \n" + tojson(errorsArray));
+
+        return {ok: 0, error: "Validate failure (search for the following heading): " + heading};
+    }
+
+    return {ok: 1};
+};
+
+if (topology.type === Topology.kStandalone) {
+    let res = validateCollectionsBackgroundThread(topology.mongod);
+    assert.commandWorked(
+        res,
+        () => 'background collection validation against the standalone failed: ' + tojson(res));
+} else if (topology.type === Topology.kReplicaSet) {
+    const threads = [];
+    try {
+        for (let replicaMember of topology.nodes) {
+            const thread =
+                new Thread(validateCollectionsBackgroundThread, replicaMember, isIgnorableError);
+            threads.push(thread);
+            thread.start();
+        }
+    } finally {
+        // Wait for each thread to finish and gather any errors.
+        let gatheredErrors = [];
+        const returnData = threads.map(thread => {
+            try {
+                thread.join();
+
+                // Calling returnData can cause an error thrown in the thread to be thrown again, so
+                // we do this in a try-catch block.
+                let res = thread.returnData();
+
+                if (!res.ok) {
+                    gatheredErrors.push(res);
+                }
+            } catch (e) {
+                gatheredErrors.push(e);
+            }
+        });
+
+        if (gatheredErrors.length) {
+            throw new Error(
+                "Background collection validation was not successful against all replica set " +
+                "members: \n" + tojson(gatheredErrors));
+        }
+    }
+} else {
+    throw new Error('Unsupported topology configuration: ' + tojson(topology));
+}
+})();
author	Brian DeLeonardis <brian.deleonardis@mongodb.com>	2020-11-12 00:17:02 +0000
committer	Brian DeLeonardis <brian.deleonardis@mongodb.com>	2020-11-12 00:17:02 +0000
commit	653817f5d5739753693285dec52497beda2bb5a6 (patch)
tree	54d328c4d2b29a0d5301409b0362923f1d21191f
parent	04415c7f5e6c824d0683da7b9a647fd785d03efb (diff)
download	mongo-653817f5d5739753693285dec52497beda2bb5a6.tar.gz