author     Brian DeLeonardis <brian.deleonardis@mongodb.com>  2020-11-12 00:17:02 +0000
committer  Brian DeLeonardis <brian.deleonardis@mongodb.com>  2020-11-12 00:17:02 +0000
commit     653817f5d5739753693285dec52497beda2bb5a6 (patch)
tree       54d328c4d2b29a0d5301409b0362923f1d21191f
parent     04415c7f5e6c824d0683da7b9a647fd785d03efb (diff)
download   mongo-51403.tar.gz
We have a background thread running (albeit it doesn't do what we want yet)
 buildscripts/resmokeconfig/suites/concurrency_metrics.yml             |   5 +-
 buildscripts/resmokelib/testing/executor.py                           |  25 ++--
 buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py |  70 ++++++-
 jstests/hooks/run_aggregate_metrics_background.js                     | 182 ++++++++++
 4 files changed, 268 insertions(+), 14 deletions(-)
diff --git a/buildscripts/resmokeconfig/suites/concurrency_metrics.yml b/buildscripts/resmokeconfig/suites/concurrency_metrics.yml
index da6143d3216..64d0ac3f970 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_metrics.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_metrics.yml
@@ -31,13 +31,12 @@ executor:
shell_options:
readMode: commands
hooks:
- - AggregateResourceConsumptionMetricsInBackground
+ - class: AggregateResourceConsumptionMetricsInBackground
fixture:
class: ReplicaSetFixture
mongod_options:
- oplogSize: 1024
set_parameters:
enableTestCommands: 1
measureOperationResourceConsumption: true
aggregateOperationResourceConsumptionMetrics: true
- num_nodes: 3
\ No newline at end of file
+ num_nodes: 2
\ No newline at end of file
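
The hook entry above changes from a bare string to a mapping with an explicit class key, which matches how the executor consumes hook configs: _make_hooks in the next diff pops "class" and forwards the remaining keys as keyword arguments. As a rough sketch only (normalize_hook_config is a hypothetical helper, not resmoke code), the two accepted shapes normalize like this:

    # Hypothetical sketch: resmoke's real parsing lives in the suite/executor
    # code, but a hooks entry conceptually normalizes like this.
    def normalize_hook_config(entry):
        """Accept either "HookName" or {class: HookName, ...} and return a dict."""
        if isinstance(entry, str):
            return {"class": entry}
        return dict(entry)  # Copy so the caller can safely pop "class".

    def make_hook_kwargs(entry):
        config = normalize_hook_config(entry)
        hook_class = config.pop("class")  # Mirrors _make_hooks in the next diff.
        return hook_class, config        # Remaining keys become keyword arguments.
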
diff --git a/buildscripts/resmokelib/testing/executor.py b/buildscripts/resmokelib/testing/executor.py
index 4fd76ea7093..c7e0beb24ce 100644
--- a/buildscripts/resmokelib/testing/executor.py
+++ b/buildscripts/resmokelib/testing/executor.py
@@ -43,6 +43,7 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
else:
self.fixture_config = fixture
+ print("hooks:", hooks)
self.hooks_config = utils.default_if_none(hooks, [])
self.test_config = utils.default_if_none(config, {})
@@ -53,7 +54,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
self._suite = suite
self.num_tests = len(suite.tests) * suite.options.num_repeat_tests
- self.test_queue_logger = logging.loggers.new_testqueue_logger(suite.test_kind)
+ self.test_queue_logger = logging.loggers.new_testqueue_logger(
+ suite.test_kind)
# Must be done after getting buildlogger configuration.
self._jobs = self._create_jobs(self.num_tests)
@@ -117,12 +119,14 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
# still running if an Evergreen task were to time out from a hang/deadlock being
# triggered.
teardown_flag = threading.Event() if num_repeat_suites == 1 else None
- (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag)
+ (report, interrupted) = self._run_tests(
+ test_queue, setup_flag, teardown_flag)
self._suite.record_test_end(report)
if setup_flag and setup_flag.is_set():
- self.logger.error("Setup of one of the job fixtures failed")
+ self.logger.error(
+ "Setup of one of the job fixtures failed")
return_code = 2
return
# Remove the setup flag once the first suite ran.
@@ -137,7 +141,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
sb = [] # String builder.
self._suite.summarize_latest(sb)
- self.logger.info("Summary of latest execution: %s", "\n ".join(sb))
+ self.logger.info(
+ "Summary of latest execution: %s", "\n ".join(sb))
if not report.wasSuccessful():
return_code = 1
@@ -201,7 +206,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
wait_secs = 2.0
self.logger.debug("Waiting for threads to complete")
- timer = threading.Timer(wait_secs, self._log_timeout_warning, args=[wait_secs])
+ timer = threading.Timer(
+ wait_secs, self._log_timeout_warning, args=[wait_secs])
timer.daemon = True
timer.start()
try:
@@ -243,7 +249,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
fixture_config = self.fixture_config.copy()
fixture_class = fixture_config.pop("class")
- fixture_logger = logging.loggers.new_fixture_logger(fixture_class, job_num)
+ fixture_logger = logging.loggers.new_fixture_logger(
+ fixture_class, job_num)
return fixtures.make_fixture(fixture_class, fixture_logger, job_num, **fixture_config)
@@ -257,7 +264,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
hook_class = hook_config.pop("class")
hook_logger = logging.loggers.new_hook_logger(hook_class, job_num)
- hook = _hooks.make_hook(hook_class, hook_logger, fixture, **hook_config)
+ hook = _hooks.make_hook(
+ hook_class, hook_logger, fixture, **hook_config)
hooks.append(hook)
return hooks
@@ -269,7 +277,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
:param job_num: instance number of job being created.
:return: Job instance.
"""
- job_logger = logging.loggers.new_job_logger(self._suite.test_kind, job_num)
+ job_logger = logging.loggers.new_job_logger(
+ self._suite.test_kind, job_num)
fixture = self._make_fixture(job_num)
hooks = self._make_hooks(fixture, job_num)
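
Most of these executor.py hunks are pure line-length reflows, but the reflowed wait logic above relies on a small watchdog pattern worth noting: a daemon threading.Timer fires a warning only if joining the worker threads outlasts wait_secs. A self-contained sketch of the same idea, with illustrative names rather than resmoke's:

    import threading

    def _log_timeout_warning(wait_secs):
        print("Worker threads still running after %.1f seconds; continuing to wait." % wait_secs)

    def join_with_watchdog(thread, wait_secs=2.0):
        # Daemon timer: it never blocks interpreter shutdown, and it only
        # fires if the join below takes longer than wait_secs.
        timer = threading.Timer(wait_secs, _log_timeout_warning, args=[wait_secs])
        timer.daemon = True
        timer.start()
        try:
            thread.join()
        finally:
            timer.cancel()  # Suppress the warning when the thread finishes in time.
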
diff --git a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py
index 73de9ee8d61..6816695f729 100644
--- a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py
+++ b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py
@@ -1,11 +1,75 @@
-"""Test hook for running the $operationMetrics stage in the background.
+"""Test hook for running the $operationMetrics stage in the background.
This hook runs every five seconds.
"""
+import os.path
+
+from buildscripts.resmokelib import errors
from buildscripts.resmokelib.testing.hooks import jsfile
-from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob
+from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase
-class EnsureOperationMetricsAreAggregatedInBackground(jsfile.JSHook):
+class AggregateResourceConsumptionMetricsInBackground(jsfile.JSHook):
"""A hook to run $operationMetrics stage in the background"""
+
+ def __init__(self, hook_logger, fixture, shell_options=None):
+ """Initialize AggregateResourceConsumptionMetricsInBackground."""
+ description = "Run background $operationMetrics on all mongods while a test is running"
+ js_filename = os.path.join(
+ "jstests", "hooks", "run_aggregate_metrics_background.js")
+ jsfile.JSHook.__init__(self, hook_logger, fixture,
+ js_filename, description, shell_options=shell_options)
+ self._background_job = None
+
+ def before_suite(self, test_report):
+ """Start the background thread."""
+ self._background_job = _BackgroundJob(
+ "AggregateResourceConsumptionMetricsInBackground")
+ self.logger.info("Starting the background aggregate metrics thread.")
+ self._background_job.start()
+
+ def after_suite(self, test_report):
+ """Signal the background aggregate metrics thread to exit, and wait until it does."""
+ if self._background_job is None:
+ return
+
+ self.logger.info("Stopping the background aggregate metrics thread.")
+ self._background_job.stop()
+
+ def before_test(self, test, test_report):
+ """Instruct the background aggregate metrics thread to run while 'test' is also running."""
+ if self._background_job is None:
+ return
+
+ hook_test_case = _ContinuousDynamicJSTestCase.create_before_test(
+ self.logger, test, self, self._js_filename, self._shell_options)
+ hook_test_case.configure(self.fixture)
+
+ self.logger.info(
+ "Resuming the background aggregate metrics thread.")
+ self._background_job.resume(hook_test_case, test_report)
+
+ def after_test(self, test, test_report): # noqa: D205,D400
+ """Instruct the background aggregate metrics thread to stop running now that 'test' has
+ finished running.
+ """
+ if self._background_job is None:
+ return
+
+ self.logger.info(
+ "Pausing the background aggregate metrics thread.")
+ self._background_job.pause()
+
+ if self._background_job.exc_info is not None:
+ if isinstance(self._background_job.exc_info[1], errors.TestFailure):
+ # If the mongo shell process running the JavaScript file exited with a non-zero
+ # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's
+ # test execution to stop.
+ raise errors.ServerFailure(
+ self._background_job.exc_info[1].args[0])
+ else:
+ self.logger.error(
+ "Encountered an error inside the background aggregate metrics thread.",
+ exc_info=self._background_job.exc_info)
+ raise self._background_job.exc_info[1]
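
The hook delegates all thread management to _BackgroundJob and _ContinuousDynamicJSTestCase in background_job.py, which this commit does not touch and this page does not show. As a rough mental model only, not the real implementation, a pausable worker with the same start/resume/pause/stop surface can be built from a Condition and a single pending-task slot:

    import sys
    import threading

    class PausableJob(threading.Thread):
        """Illustrative stand-in for _BackgroundJob: one task runs per resume();
        pause() blocks until any task in flight has finished."""

        def __init__(self, name):
            super().__init__(name=name, daemon=True)
            self._cond = threading.Condition()
            self._task = None      # Work queued by resume().
            self._busy = False     # True while a task is executing.
            self._stopped = False
            self.exc_info = None   # Set if the task raised.

        def run(self):
            while True:
                with self._cond:
                    while self._task is None and not self._stopped:
                        self._cond.wait()
                    if self._stopped:
                        return
                    task, self._task, self._busy = self._task, None, True
                try:
                    task()
                except Exception:
                    self.exc_info = sys.exc_info()
                with self._cond:
                    self._busy = False
                    self._cond.notify_all()

        def resume(self, task):
            with self._cond:
                self._task = task
                self._cond.notify_all()

        def pause(self):
            with self._cond:
                self._task = None      # Drop undelivered work.
                while self._busy:      # Wait out any task in flight.
                    self._cond.wait()

        def stop(self):
            with self._cond:
                self._stopped = True
                self._cond.notify_all()
            self.join()
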
diff --git a/jstests/hooks/run_aggregate_metrics_background.js b/jstests/hooks/run_aggregate_metrics_background.js
new file mode 100644
index 00000000000..646d0c5f06f
--- /dev/null
+++ b/jstests/hooks/run_aggregate_metrics_background.js
@@ -0,0 +1,182 @@
+/**
+ * TODO: rewrite these comments once this hook aggregates $operationMetrics; the logic
+ * below is still the background-validation code it was copied from.
+ *
+ * Runs the validate command with {background: true} against all nodes (replica set members
+ * and standalone nodes, not sharded clusters) concurrently with running tests.
+ */
+
+'use strict';
+
+(function() {
+
+print("yipppeee ki ya were in a background thread!!!");
+
+load('jstests/libs/discover_topology.js'); // For Topology and DiscoverTopology.
+load('jstests/libs/parallelTester.js'); // For Thread.
+
+if (typeof db === 'undefined') {
+ throw new Error(
+ "Expected mongo shell to be connected a server, but global 'db' object isn't defined");
+}
+
+// Disable implicit sessions so FSM workloads that kill random sessions won't interrupt the
+// operations in this test that aren't resilient to interruptions.
+TestData.disableImplicitSessions = true;
+
+const conn = db.getMongo();
+const topology = DiscoverTopology.findConnectedNodes(conn);
+
+/**
+ * Returns true if the error code is transient and does not indicate data corruption.
+ */
+const isIgnorableError = function ignorableError(codeName) {
+ if (codeName == "NamespaceNotFound" || codeName == "Interrupted" ||
+ codeName == "CommandNotSupportedOnView" || codeName == "InterruptedAtShutdown" ||
+ codeName == "InvalidViewDefinition") {
+ return true;
+ }
+ return false;
+};
+
+/**
+ * Runs validate commands with {background:true} against 'host' for every collection it holds.
+ *
+ * Returns {ok: 1} if all commands succeed. If any command fails, prints the failing command
+ * results and returns {ok: 0, error: <message>}, where the message names the heading to
+ * search for in the printed output.
+ *
+ * This function should not throw if everything is working properly.
+ */
+const validateCollectionsBackgroundThread = function validateCollectionsBackground(
+ host, isIgnorableErrorFunc) {
+ // Calls 'func' with the print() function overridden to be a no-op.
+ const quietly = (func) => {
+ const printOriginal = print;
+ try {
+ print = Function.prototype;
+ func();
+ } finally {
+ print = printOriginal;
+ }
+ };
+
+    // Suppress the log messages generated when establishing new mongo connections. This
+    // run_aggregate_metrics_background.js hook is executed frequently by resmoke.py and
+    // could otherwise generate an overwhelming number of log messages.
+ let conn;
+ quietly(() => {
+ conn = new Mongo(host);
+ });
+ assert.neq(null,
+ conn,
+ "Failed to connect to host '" + host + "' for background collection validation");
+
+ // Filter out arbiters.
+ if (conn.adminCommand({isMaster: 1}).arbiterOnly) {
+ print("Skipping background validation against test node: " + host +
+ " because it is an arbiter and has no data.");
+ return {ok: 1};
+ }
+
+ print("Running background validation on all collections on test node: " + host);
+
+    // Save a list of {ns, res} pairs for any validate commands that fail so that we can
+    // report them after every collection has been scanned.
+    let failedValidateResults = [];
+
+ // Validate all collections in every database.
+
+ const dbNames =
+ assert
+ .commandWorked(conn.adminCommand(
+ {"listDatabases": 1, "nameOnly": true, "$readPreference": {"mode": "nearest"}}))
+ .databases.map(function(z) {
+ return z.name;
+ });
+
+ conn.adminCommand({configureFailPoint: "crashOnMultikeyValidateFailure", mode: "alwaysOn"});
+ for (let dbName of dbNames) {
+ let db = conn.getDB(dbName);
+
+ const listCollRes = assert.commandWorked(db.runCommand({
+ "listCollections": 1,
+ "nameOnly": true,
+ "filter": {$or: [{type: 'collection'}, {type: {$exists: false}}]},
+ "$readPreference": {"mode": "nearest"},
+ }));
+ const collectionNames = new DBCommandCursor(db, listCollRes).map(function(z) {
+ return z.name;
+ });
+
+ for (let collectionName of collectionNames) {
+ let res = conn.getDB(dbName).getCollection(collectionName).runCommand({
+ "validate": collectionName,
+ background: true,
+ "$readPreference": {"mode": "nearest"}
+ });
+
+ if ((!res.ok && !isIgnorableErrorFunc(res.codeName)) || (res.valid === false)) {
+ failedValidateResults.push({"ns": dbName + "." + collectionName, "res": res});
+ }
+ }
+ }
+ conn.adminCommand({configureFailPoint: "crashOnMultikeyValidateFailure", mode: "off"});
+
+ // If any commands failed, format and return an error.
+ if (failedValidateResults.length) {
+ let errorsArray = [];
+ for (let nsAndRes of failedValidateResults) {
+ errorsArray.push({"namespace": nsAndRes.ns, "res": nsAndRes.res});
+ }
+
+ const heading = "Validate command(s) with {background:true} failed against mongod";
+ print(heading + " '" + conn.host + "': \n" + tojson(errorsArray));
+
+ return {ok: 0, error: "Validate failure (search for the following heading): " + heading};
+ }
+
+ return {ok: 1};
+};
+
+if (topology.type === Topology.kStandalone) {
+    let res = validateCollectionsBackgroundThread(topology.mongod, isIgnorableError);
+ assert.commandWorked(
+ res,
+ () => 'background collection validation against the standalone failed: ' + tojson(res));
+} else if (topology.type === Topology.kReplicaSet) {
+ const threads = [];
+ try {
+ for (let replicaMember of topology.nodes) {
+ const thread =
+ new Thread(validateCollectionsBackgroundThread, replicaMember, isIgnorableError);
+ threads.push(thread);
+ thread.start();
+ }
+ } finally {
+ // Wait for each thread to finish and gather any errors.
+ let gatheredErrors = [];
+ const returnData = threads.map(thread => {
+ try {
+ thread.join();
+
+ // Calling returnData can cause an error thrown in the thread to be thrown again, so
+ // we do this in a try-catch block.
+ let res = thread.returnData();
+
+ if (!res.ok) {
+ gatheredErrors.push(res);
+ }
+ } catch (e) {
+ gatheredErrors.push(e);
+ }
+ });
+
+ if (gatheredErrors.length) {
+ throw new Error(
+ "Background collection validation was not successful against all replica set " +
+ "members: \n" + tojson(gatheredErrors));
+ }
+ }
+} else {
+ throw new Error('Unsupported topology configuration: ' + tojson(topology));
+}
+})();
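
For readers tracing the replica-set branch above: the essential shape is start-threads-in-try, join-in-finally, and gather errors instead of raising mid-loop, so that every thread gets joined even when one fails. The same pattern condensed into Python, purely for illustration:

    import threading

    def fan_out(hosts, work):
        """Run work(host) on one thread per host; join everything, then report failures."""
        threads, errors = [], []

        def runner(host):
            try:
                res = work(host)
                if not res.get("ok"):
                    errors.append(res)
            except Exception as exc:  # Gathered, not raised, so every thread is joined.
                errors.append({"ok": 0, "error": repr(exc)})

        try:
            for host in hosts:
                thread = threading.Thread(target=runner, args=(host,))
                threads.append(thread)
                thread.start()
        finally:
            for thread in threads:
                thread.join()
            if errors:
                raise RuntimeError("background work failed on some nodes: %r" % errors)
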