diff options
author | Vishnu Kaushik <vishnu.kaushik@mongodb.com> | 2021-11-03 14:33:46 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-11-03 15:52:49 +0000 |
commit | 0d73b2dc01f302a73f5805cce0fab5c3a86aeb41 (patch) | |
tree | 8da409061221e28ef23d3ab2bb38a44deb8ce8bb | |
parent | 75242734fdbec90c6a55a57165295b62c173b179 (diff) | |
download | mongo-0d73b2dc01f302a73f5805cce0fab5c3a86aeb41.tar.gz |
SERVER-61080 Avoid failing replicator tasks for test failures unrelated to the replicator
5 files changed, 107 insertions, 44 deletions
diff --git a/buildscripts/resmokelib/testing/executor.py b/buildscripts/resmokelib/testing/executor.py index 4fd76ea7093..65f04dd18a2 100644 --- a/buildscripts/resmokelib/testing/executor.py +++ b/buildscripts/resmokelib/testing/executor.py @@ -88,7 +88,7 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes n_jobs_to_start = self._num_jobs_to_start(self._suite, num_tests) return [self._make_job(job_num) for job_num in range(n_jobs_to_start)] - def run(self): + def run(self): # pylint: disable=too-many-branches """Execute the test suite. Any exceptions that occur during setting up or tearing down a @@ -104,6 +104,7 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes # a test suite run earlier can be reused during this current test suite. network.PortAllocator.reset() teardown_flag = None + hook_failure_flag = None try: num_repeat_suites = self._suite.options.num_repeat_suites while num_repeat_suites > 0: @@ -117,30 +118,39 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes # still running if an Evergreen task were to time out from a hang/deadlock being # triggered. teardown_flag = threading.Event() if num_repeat_suites == 1 else None - (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag) + # We use the 'hook_failure_flag' to distinguish hook failures from other failures, + # so that we can return a separate return code when a hook has failed. + hook_failure_flag = threading.Event() + (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag, + hook_failure_flag) self._suite.record_test_end(report) if setup_flag and setup_flag.is_set(): self.logger.error("Setup of one of the job fixtures failed") - return_code = 2 + return_code = max(return_code, 2) return # Remove the setup flag once the first suite ran. setup_flag = None + if hook_failure_flag.is_set(): + # The hook failure return code is highest so it will take precedence when + # reported. + return_code = max(return_code, 3) + # If the user triggered a KeyboardInterrupt, then we should stop. if interrupted: raise errors.UserInterrupt("Received interrupt from user") if teardown_flag and teardown_flag.is_set(): - return_code = 2 + return_code = max(return_code, 2) sb = [] # String builder. self._suite.summarize_latest(sb) self.logger.info("Summary of latest execution: %s", "\n ".join(sb)) if not report.wasSuccessful(): - return_code = 1 + return_code = max(return_code, 1) if self._suite.options.fail_fast: break @@ -159,10 +169,11 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes finally: if not teardown_flag: if not self._teardown_fixtures(): - return_code = 2 + return_code = max(return_code, 2) + self._suite.return_code = return_code - def _run_tests(self, test_queue, setup_flag, teardown_flag): + def _run_tests(self, test_queue, setup_flag, teardown_flag, hook_failure_flag): """Start a thread for each Job instance and block until all of the tests are run. Returns a (combined report, user interrupted) pair, where the @@ -178,7 +189,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes for job in self._jobs: thr = threading.Thread( target=job, args=(test_queue, interrupt_flag), kwargs=dict( - setup_flag=setup_flag, teardown_flag=teardown_flag)) + setup_flag=setup_flag, teardown_flag=teardown_flag, + hook_failure_flag=hook_failure_flag)) # Do not wait for tests to finish executing if interrupted by the user. thr.daemon = True thr.start() diff --git a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py index 0cc4870c3a7..2eb17d9c6a0 100644 --- a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py +++ b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py @@ -1,4 +1,4 @@ -"""Test hook for running the dummy replicator on two clusters.""" +"""Test implementation for running the dummy replicator on two clusters.""" import copy import os.path diff --git a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py index 2e610d9c162..da35679f3af 100644 --- a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py +++ b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py @@ -56,9 +56,9 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i def before_suite(self, test_report): """Before suite.""" if not self._fixture: - raise ValueError("No ClusterToClusterReplication to run migrations on") - # The replicator must be called here to tell it to start replicating data. - self.logger.info("Starting the cluster to cluster replicator.") + raise ValueError("No ClusterToClusterFixture to run migrations on") + + self.logger.info("Setting up cluster to cluster test data.") # Set up the initial replication direction. clusters = self._fixture.get_independent_clusters() @@ -78,35 +78,37 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i def after_suite(self, test_report, teardown_flag=None): """After suite.""" # If the total number of tests was not an exact multiple of the number run per cycle, then - # run the data consistency check again. + # pause the replicator first. if self._test_num % self._tests_per_cycle != 0: # Pause the dummy replicator first. self.logger.info("Pausing replicator before stopping.") - self._replicator.pause() + self._run_replicator_action(test_report, self._replicator.pause) self.logger.info("Finished pausing the replicator.") - self.logger.info("Stopping the cluster to cluster replicator.") - stop_options = { - "test": self._last_test, "test_report": test_report, - "shell_options": self._shell_options - } - self._replicator.stop(stop_options) - self.logger.info("Stopped the cluster to cluster replicator.") + self.logger.info("Stopping the cluster to cluster replicator.") + stop_options = { + "test": self._last_test, "test_report": test_report, + "shell_options": self._shell_options + } + self._run_replicator_action(test_report, self._replicator.stop, stop_options) + self.logger.info("Stopped the cluster to cluster replicator.") - self._run_data_consistency_check(self._last_test, test_report) - self._run_check_repl_db_hash(self._last_test, test_report) + self._run_data_consistency_check(self._last_test, test_report) + self._run_check_repl_db_hash(self._last_test, test_report) def before_test(self, test, test_report): """Before test.""" if self._test_num == 0: self.logger.info("Starting the replicator.") - self._replicator.start() + self._run_replicator_action(test_report, self._replicator.start) + self.logger.info("Started the replicator.") return if self._test_num % self._tests_per_cycle == 0: # The replicator should be told to start running once again. self.logger.info("Resuming the cluster to cluster replicator.") - self._replicator.resume() + self._run_replicator_action(test_report, self._replicator.resume) + self.logger.info("Resumed the cluster to cluster replicator.") def after_test(self, test, test_report): """After test.""" @@ -117,7 +119,7 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i # data across the clusters, so that a data consistency check can be performed. if self._test_num % self._tests_per_cycle == 0: self.logger.info("Pausing the cluster to cluster replicator.") - self._replicator.pause() + self._run_replicator_action(test_report, self._replicator.pause) self.logger.info("Paused the cluster to cluster replicator.") self._run_data_consistency_check(test, test_report) @@ -139,3 +141,28 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i check_db_hash.before_test(test, test_report) check_db_hash.after_test(test, test_report) check_db_hash.after_suite(test_report) + + def _run_replicator_action(self, test_report, action, action_options=None): + replicator_action_case = _ReplicatorActionTestCase(self.logger, self._last_test, self, + action, action_options) + replicator_action_case.run_dynamic_test(test_report) + + +class _ReplicatorActionTestCase(interface.DynamicTestCase): + """_ReplicatorActionTestCase class, to run a replicator action as a test.""" + + def __init__( # pylint: disable=too-many-arguments + self, logger, base_test_name, hook, action, action_options): + """Initialize _ReplicatorActionTestCase.""" + interface.DynamicTestCase.__init__(self, logger, "replicator_action", + "Run a replicator action.", base_test_name, hook) + self._action = action + self._action_options = action_options + + def run_test(self): + try: + self._action(self._action_options) + except: + self.logger.exception("Failed to run replicator action '%s' with options '%s'", + self._action, self._action_options) + raise diff --git a/buildscripts/resmokelib/testing/job.py b/buildscripts/resmokelib/testing/job.py index 3a2ac3dbedf..6b40cbdbb92 100644 --- a/buildscripts/resmokelib/testing/job.py +++ b/buildscripts/resmokelib/testing/job.py @@ -48,7 +48,8 @@ class Job(object): # pylint: disable=too-many-instance-attributes # Drain the queue to unblock the main thread. Job._drain_queue(queue) - def __call__(self, queue, interrupt_flag, setup_flag=None, teardown_flag=None): + def __call__(self, queue, interrupt_flag, setup_flag=None, teardown_flag=None, + hook_failure_flag=None): """Continuously execute tests from 'queue' and records their details in 'report'. If 'setup_flag' is not None, then a test to set up the fixture will be run @@ -81,7 +82,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes if setup_succeeded: try: - self._run(queue, interrupt_flag, teardown_flag) + self._run(queue, interrupt_flag, teardown_flag, hook_failure_flag) except errors.StopExecution as err: # Stop running tests immediately. self.logger.error("Received a StopExecution exception: %s.", err) @@ -117,26 +118,24 @@ class Job(object): # pylint: disable=too-many-instance-attributes """Get current time to aid in the unit testing of the _run method.""" return time.time() - def _run(self, queue, interrupt_flag, teardown_flag=None): + def _run(self, queue, interrupt_flag, teardown_flag=None, hook_failure_flag=None): """Call the before/after suite hooks and continuously execute tests from 'queue'.""" - for hook in self.hooks: - hook.before_suite(self.report) + self._run_hooks_before_suite(hook_failure_flag) while not queue.empty() and not interrupt_flag.is_set(): queue_elem = queue.get_nowait() test_time_start = self._get_time() try: test = queue_elem.testcase - self._execute_test(test) + self._execute_test(test, hook_failure_flag) finally: queue_elem.job_completed(self._get_time() - test_time_start) queue.task_done() self._requeue_test(queue, queue_elem, interrupt_flag) - for hook in self.hooks: - hook.after_suite(self.report, teardown_flag) + self._run_hooks_after_suite(teardown_flag, hook_failure_flag) def _log_requeue_test(self, queue_elem): """Log the requeue of a test.""" @@ -165,11 +164,11 @@ class Job(object): # pylint: disable=too-many-instance-attributes self._log_requeue_test(queue_elem) queue.put(queue_elem) - def _execute_test(self, test): + def _execute_test(self, test, hook_failure_flag): """Call the before/after test hooks and execute 'test'.""" test.configure(self.fixture, config.NUM_CLIENTS_PER_FIXTURE) - self._run_hooks_before_tests(test) + self._run_hooks_before_tests(test, hook_failure_flag) self.report.logging_prefix = create_fixture_table(self.fixture) test(self.report) @@ -199,26 +198,51 @@ class Job(object): # pylint: disable=too-many-instance-attributes # Stop background hooks first since they can interfere with fixture startup and teardown # done as part of archival. - self._run_hooks_after_tests(test, background=True) + self._run_hooks_after_tests(test, hook_failure_flag, background=True) if self.archival: result = TestResult(test=test, hook=None, success=success) self.archival.archive(self.logger, result, self.manager) - self._run_hooks_after_tests(test, background=False) + self._run_hooks_after_tests(test, hook_failure_flag, background=False) - def _run_hook(self, hook, hook_function, test): + def _run_hook(self, hook, hook_function, test, hook_failure_flag): """Provide helper to run hook and archival.""" try: success = False hook_function(test, self.report) success = True finally: + if not success and hook_failure_flag is not None: + hook_failure_flag.set() + if self.archival: result = TestResult(test=test, hook=hook, success=success) self.archival.archive(self.logger, result, self.manager) - def _run_hooks_before_tests(self, test): + def _run_hooks_before_suite(self, hook_failure_flag): + """Run the before_suite method on each of the hooks.""" + hooks_failed = True + try: + for hook in self.hooks: + hook.before_suite(self.report) + hooks_failed = False + finally: + if hooks_failed and hook_failure_flag is not None: + hook_failure_flag.set() + + def _run_hooks_after_suite(self, teardown_flag, hook_failure_flag): + """Run the after_suite method on each of the hooks.""" + hooks_failed = True + try: + for hook in self.hooks: + hook.after_suite(self.report, teardown_flag) + hooks_failed = False + finally: + if hooks_failed and hook_failure_flag is not None: + hook_failure_flag.set() + + def _run_hooks_before_tests(self, test, hook_failure_flag): """Run the before_test method on each of the hooks. Swallows any TestFailure exceptions if set to continue on @@ -226,7 +250,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes """ try: for hook in self.hooks: - self._run_hook(hook, hook.before_test, test) + self._run_hook(hook, hook.before_test, test, hook_failure_flag) except errors.StopExecution: raise @@ -251,7 +275,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes self.report.stopTest(test) raise - def _run_hooks_after_tests(self, test, background=False): + def _run_hooks_after_tests(self, test, hook_failure_flag, background=False): """Run the after_test method on each of the hooks. Swallows any TestFailure exceptions if set to continue on @@ -263,7 +287,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes try: for hook in self.hooks: if hook.IS_BACKGROUND == background: - self._run_hook(hook, hook.after_test, test) + self._run_hook(hook, hook.after_test, test, hook_failure_flag) except errors.StopExecution: raise diff --git a/buildscripts/tests/resmokelib/testing/test_job.py b/buildscripts/tests/resmokelib/testing/test_job.py index e2d130c3fc8..699f19bd133 100644 --- a/buildscripts/tests/resmokelib/testing/test_job.py +++ b/buildscripts/tests/resmokelib/testing/test_job.py @@ -198,7 +198,7 @@ class UnitJob(job.Job): # pylint: disable=too-many-instance-attributes self.total_test_num = 0 self.tests = {} - def _execute_test(self, test): + def _execute_test(self, test, hook_failure_flag=None): self.total_test_num += 1 if test.test_name not in self.tests: self.tests[test.test_name] = 0 |