diff options
author | Brian DeLeonardis <brian.deleonardis@mongodb.com> | 2020-11-12 00:17:02 +0000 |
---|---|---|
committer | Brian DeLeonardis <brian.deleonardis@mongodb.com> | 2020-11-12 00:17:02 +0000 |
commit | 653817f5d5739753693285dec52497beda2bb5a6 (patch) | |
tree | 54d328c4d2b29a0d5301409b0362923f1d21191f | |
parent | 04415c7f5e6c824d0683da7b9a647fd785d03efb (diff) | |
download | mongo-653817f5d5739753693285dec52497beda2bb5a6.tar.gz |
We have a background thread running (albeit it doesn't do what we want yet) 51403
4 files changed, 268 insertions, 14 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_metrics.yml b/buildscripts/resmokeconfig/suites/concurrency_metrics.yml index da6143d3216..64d0ac3f970 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_metrics.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_metrics.yml @@ -31,13 +31,12 @@ executor: shell_options: readMode: commands hooks: - - AggregateResourceConsumptionMetricsInBackground + - class: AggregateResourceConsumptionMetricsInBackground fixture: class: ReplicaSetFixture mongod_options: - oplogSize: 1024 set_parameters: enableTestCommands: 1 measureOperationResourceConsumption: true aggregateOperationResourceConsumptionMetrics: true - num_nodes: 3
\ No newline at end of file + num_nodes: 2
\ No newline at end of file diff --git a/buildscripts/resmokelib/testing/executor.py b/buildscripts/resmokelib/testing/executor.py index 4fd76ea7093..c7e0beb24ce 100644 --- a/buildscripts/resmokelib/testing/executor.py +++ b/buildscripts/resmokelib/testing/executor.py @@ -43,6 +43,7 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes else: self.fixture_config = fixture + print("hooks:", hooks) self.hooks_config = utils.default_if_none(hooks, []) self.test_config = utils.default_if_none(config, {}) @@ -53,7 +54,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes self._suite = suite self.num_tests = len(suite.tests) * suite.options.num_repeat_tests - self.test_queue_logger = logging.loggers.new_testqueue_logger(suite.test_kind) + self.test_queue_logger = logging.loggers.new_testqueue_logger( + suite.test_kind) # Must be done after getting buildlogger configuration. self._jobs = self._create_jobs(self.num_tests) @@ -117,12 +119,14 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes # still running if an Evergreen task were to time out from a hang/deadlock being # triggered. teardown_flag = threading.Event() if num_repeat_suites == 1 else None - (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag) + (report, interrupted) = self._run_tests( + test_queue, setup_flag, teardown_flag) self._suite.record_test_end(report) if setup_flag and setup_flag.is_set(): - self.logger.error("Setup of one of the job fixtures failed") + self.logger.error( + "Setup of one of the job fixtures failed") return_code = 2 return # Remove the setup flag once the first suite ran. @@ -137,7 +141,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes sb = [] # String builder. 
self._suite.summarize_latest(sb) - self.logger.info("Summary of latest execution: %s", "\n ".join(sb)) + self.logger.info( + "Summary of latest execution: %s", "\n ".join(sb)) if not report.wasSuccessful(): return_code = 1 @@ -201,7 +206,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes wait_secs = 2.0 self.logger.debug("Waiting for threads to complete") - timer = threading.Timer(wait_secs, self._log_timeout_warning, args=[wait_secs]) + timer = threading.Timer( + wait_secs, self._log_timeout_warning, args=[wait_secs]) timer.daemon = True timer.start() try: @@ -243,7 +249,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes fixture_config = self.fixture_config.copy() fixture_class = fixture_config.pop("class") - fixture_logger = logging.loggers.new_fixture_logger(fixture_class, job_num) + fixture_logger = logging.loggers.new_fixture_logger( + fixture_class, job_num) return fixtures.make_fixture(fixture_class, fixture_logger, job_num, **fixture_config) @@ -257,7 +264,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes hook_class = hook_config.pop("class") hook_logger = logging.loggers.new_hook_logger(hook_class, job_num) - hook = _hooks.make_hook(hook_class, hook_logger, fixture, **hook_config) + hook = _hooks.make_hook( + hook_class, hook_logger, fixture, **hook_config) hooks.append(hook) return hooks @@ -269,7 +277,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes :param job_num: instance number of job being created. :return: Job instance. 
""" - job_logger = logging.loggers.new_job_logger(self._suite.test_kind, job_num) + job_logger = logging.loggers.new_job_logger( + self._suite.test_kind, job_num) fixture = self._make_fixture(job_num) hooks = self._make_hooks(fixture, job_num) diff --git a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py index 73de9ee8d61..6816695f729 100644 --- a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py +++ b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py @@ -1,11 +1,75 @@ -"""Test hook for running the $operationMetrics stage in the background. +"""Test hook for running the $operationMetrics stage in the background. This hook runs every five seconds. """ +import os.path + +from buildscripts.resmokelib import errors from buildscripts.resmokelib.testing.hooks import jsfile -from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob +from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase -class EnsureOperationMetricsAreAggregatedInBackground(jsfile.JSHook): +class AggregateResourceConsumptionMetricsInBackground(jsfile.JSHook): """A hook to run $operationMetrics stage in the background""" + + def __init__(self, hook_logger, fixture, shell_options=None): + """Initialize AggregateResourceConsumptionMetricsInBackground.""" + description = "Run background $operationMetrics on all mongods while a test is running" + js_filename = os.path.join( + "jstests", "hooks", "run_aggregate_metrics_background.js") + jsfile.JSHook.__init__(self, hook_logger, fixture, + js_filename, description, shell_options=shell_options) + self._background_job = None + + def before_suite(self, test_report): + """Start the background thread.""" + self._background_job = _BackgroundJob( + "AggregateResourceConsumptionMetricsInBackground") + self.logger.info("Starting the background aggregate metrics 
thread.") + self._background_job.start() + + def after_suite(self, test_report): + """Signal the background aggregate metrics thread to exit, and wait until it does.""" + if self._background_job is None: + return + + self.logger.info("Stopping the background aggregate metrics thread.") + self._background_job.stop() + + def before_test(self, test, test_report): + """Instruct the background aggregate metrics thread to run while 'test' is also running.""" + if self._background_job is None: + return + + hook_test_case = _ContinuousDynamicJSTestCase.create_before_test( + self.logger, test, self, self._js_filename, self._shell_options) + hook_test_case.configure(self.fixture) + + self.logger.info( + "Resuming the background aggregate metrics thread.") + self._background_job.resume(hook_test_case, test_report) + + def after_test(self, test, test_report): # noqa: D205,D400 + """Instruct the background aggregate metrics thread to stop running now that 'test' has + finished running. + """ + if self._background_job is None: + return + + self.logger.info( + "Pausing the background aggregate metrics thread.") + self._background_job.pause() + + if self._background_job.exc_info is not None: + if isinstance(self._background_job.exc_info[1], errors.TestFailure): + # If the mongo shell process running the JavaScript file exited with a non-zero + # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's + # test execution to stop. 
+ raise errors.ServerFailure( + self._background_job.exc_info[1].args[0]) + else: + self.logger.error( + "Encountered an error inside the background aggregate metrics thread.", + exc_info=self._background_job.exc_info) + raise self._background_job.exc_info[1] diff --git a/jstests/hooks/run_aggregate_metrics_background.js b/jstests/hooks/run_aggregate_metrics_background.js new file mode 100644 index 00000000000..646d0c5f06f --- /dev/null +++ b/jstests/hooks/run_aggregate_metrics_background.js @@ -0,0 +1,182 @@ +/** TODO: fix all these comments + * Runs the validate command with {background:true} against all nodes (replica set members and + * standalone nodes, not sharded clusters) concurrently with running tests. + */ + +'use strict'; + +(function() { + +print("yipppeee ki ya were in a background thread!!!"); + +load('jstests/libs/discover_topology.js'); // For Topology and DiscoverTopology. +load('jstests/libs/parallelTester.js'); // For Thread. + +if (typeof db === 'undefined') { + throw new Error( + "Expected mongo shell to be connected a server, but global 'db' object isn't defined"); +} + +// Disable implicit sessions so FSM workloads that kill random sessions won't interrupt the +// operations in this test that aren't resilient to interruptions. +TestData.disableImplicitSessions = true; + +const conn = db.getMongo(); +const topology = DiscoverTopology.findConnectedNodes(conn); + +/** + * Returns true if the error code is transient and does not indicate data corruption. + */ +const isIgnorableError = function ignorableError(codeName) { + if (codeName == "NamespaceNotFound" || codeName == "Interrupted" || + codeName == "CommandNotSupportedOnView" || codeName == "InterruptedAtShutdown" || + codeName == "InvalidViewDefinition") { + return true; + } + return false; +}; + +/** + * Runs validate commands with {background:true} against 'host' for all collections it possesses. 
+ * + * Returns the cumulative command failure results, if there are any, in an object + * { ok: 0, error: [{cmd-res}, {cmd-res}, ... ]} + * Or simply OK if all cmds were successful. + * {ok: 1} + * + * This function should not throw if everything is working properly. + */ +const validateCollectionsBackgroundThread = function validateCollectionsBackground( + host, isIgnorableErrorFunc) { + // Calls 'func' with the print() function overridden to be a no-op. + const quietly = (func) => { + const printOriginal = print; + try { + print = Function.prototype; + func(); + } finally { + print = printOriginal; + } + }; + + // Suppress the log messages generated establishing new mongo connections. The + // run_validate_collections_background.js hook is executed frequently by resmoke.py and + // could lead to generating an overwhelming amount of log messages. + let conn; + quietly(() => { + conn = new Mongo(host); + }); + assert.neq(null, + conn, + "Failed to connect to host '" + host + "' for background collection validation"); + + // Filter out arbiters. + if (conn.adminCommand({isMaster: 1}).arbiterOnly) { + print("Skipping background validation against test node: " + host + + " because it is an arbiter and has no data."); + return {ok: 1}; + } + + print("Running background validation on all collections on test node: " + host); + + // Save a map of namespace to validate cmd results for any cmds that fail so that we can return + // the results afterwards. + let failedValidateResults = []; + + // Validate all collections in every database. 
+ + const dbNames = + assert + .commandWorked(conn.adminCommand( + {"listDatabases": 1, "nameOnly": true, "$readPreference": {"mode": "nearest"}})) + .databases.map(function(z) { + return z.name; + }); + + conn.adminCommand({configureFailPoint: "crashOnMultikeyValidateFailure", mode: "alwaysOn"}); + for (let dbName of dbNames) { + let db = conn.getDB(dbName); + + const listCollRes = assert.commandWorked(db.runCommand({ + "listCollections": 1, + "nameOnly": true, + "filter": {$or: [{type: 'collection'}, {type: {$exists: false}}]}, + "$readPreference": {"mode": "nearest"}, + })); + const collectionNames = new DBCommandCursor(db, listCollRes).map(function(z) { + return z.name; + }); + + for (let collectionName of collectionNames) { + let res = conn.getDB(dbName).getCollection(collectionName).runCommand({ + "validate": collectionName, + background: true, + "$readPreference": {"mode": "nearest"} + }); + + if ((!res.ok && !isIgnorableErrorFunc(res.codeName)) || (res.valid === false)) { + failedValidateResults.push({"ns": dbName + "." + collectionName, "res": res}); + } + } + } + conn.adminCommand({configureFailPoint: "crashOnMultikeyValidateFailure", mode: "off"}); + + // If any commands failed, format and return an error. 
+ if (failedValidateResults.length) { + let errorsArray = []; + for (let nsAndRes of failedValidateResults) { + errorsArray.push({"namespace": nsAndRes.ns, "res": nsAndRes.res}); + } + + const heading = "Validate command(s) with {background:true} failed against mongod"; + print(heading + " '" + conn.host + "': \n" + tojson(errorsArray)); + + return {ok: 0, error: "Validate failure (search for the following heading): " + heading}; + } + + return {ok: 1}; +}; + +if (topology.type === Topology.kStandalone) { + let res = validateCollectionsBackgroundThread(topology.mongod); + assert.commandWorked( + res, + () => 'background collection validation against the standalone failed: ' + tojson(res)); +} else if (topology.type === Topology.kReplicaSet) { + const threads = []; + try { + for (let replicaMember of topology.nodes) { + const thread = + new Thread(validateCollectionsBackgroundThread, replicaMember, isIgnorableError); + threads.push(thread); + thread.start(); + } + } finally { + // Wait for each thread to finish and gather any errors. + let gatheredErrors = []; + const returnData = threads.map(thread => { + try { + thread.join(); + + // Calling returnData can cause an error thrown in the thread to be thrown again, so + // we do this in a try-catch block. + let res = thread.returnData(); + + if (!res.ok) { + gatheredErrors.push(res); + } + } catch (e) { + gatheredErrors.push(e); + } + }); + + if (gatheredErrors.length) { + throw new Error( + "Background collection validation was not successful against all replica set " + + "members: \n" + tojson(gatheredErrors)); + } + } +} else { + throw new Error('Unsupported topology configuration: ' + tojson(topology)); +} +})(); |