SERVER-61080 Avoid failing replicator tasks for test failures unrelated to the replicator

author: Vishnu Kaushik <vishnu.kaushik@mongodb.com> 2021-11-03 14:33:46 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-11-03 15:52:49 +0000
commit: 0d73b2dc01f302a73f5805cce0fab5c3a86aeb41 (patch)
tree: 8da409061221e28ef23d3ab2bb38a44deb8ce8bb
parent: 75242734fdbec90c6a55a57165295b62c173b179 (diff)
download: mongo-0d73b2dc01f302a73f5805cce0fab5c3a86aeb41.tar.gz
5 files changed, 107 insertions, 44 deletions
diff --git a/buildscripts/resmokelib/testing/executor.py b/buildscripts/resmokelib/testing/executor.py
index 4fd76ea7093..65f04dd18a2 100644
--- a/buildscripts/resmokelib/testing/executor.py
+++ b/buildscripts/resmokelib/testing/executor.py
@@ -88,7 +88,7 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
         n_jobs_to_start = self._num_jobs_to_start(self._suite, num_tests)
         return [self._make_job(job_num) for job_num in range(n_jobs_to_start)]
 
-    def run(self):
+    def run(self):  # pylint: disable=too-many-branches
         """Execute the test suite.
 
         Any exceptions that occur during setting up or tearing down a
@@ -104,6 +104,7 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
         # a test suite run earlier can be reused during this current test suite.
         network.PortAllocator.reset()
         teardown_flag = None
+        hook_failure_flag = None
         try:
             num_repeat_suites = self._suite.options.num_repeat_suites
             while num_repeat_suites > 0:
@@ -117,30 +118,39 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
                 # still running if an Evergreen task were to time out from a hang/deadlock being
                 # triggered.
                 teardown_flag = threading.Event() if num_repeat_suites == 1 else None
-                (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag)
+                # We use the 'hook_failure_flag' to distinguish hook failures from other failures,
+                # so that we can return a separate return code when a hook has failed.
+                hook_failure_flag = threading.Event()
+                (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag,
+                                                        hook_failure_flag)
 
                 self._suite.record_test_end(report)
 
                 if setup_flag and setup_flag.is_set():
                     self.logger.error("Setup of one of the job fixtures failed")
-                    return_code = 2
+                    return_code = max(return_code, 2)
                     return
                 # Remove the setup flag once the first suite ran.
                 setup_flag = None
 
+                if hook_failure_flag.is_set():
+                    # The hook failure return code is highest so it will take precedence when
+                    # reported.
+                    return_code = max(return_code, 3)
+
                 # If the user triggered a KeyboardInterrupt, then we should stop.
                 if interrupted:
                     raise errors.UserInterrupt("Received interrupt from user")
 
                 if teardown_flag and teardown_flag.is_set():
-                    return_code = 2
+                    return_code = max(return_code, 2)
 
                 sb = []  # String builder.
                 self._suite.summarize_latest(sb)
                 self.logger.info("Summary of latest execution: %s", "\n    ".join(sb))
 
                 if not report.wasSuccessful():
-                    return_code = 1
+                    return_code = max(return_code, 1)
                     if self._suite.options.fail_fast:
                         break
 
@@ -159,10 +169,11 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
         finally:
             if not teardown_flag:
                 if not self._teardown_fixtures():
-                    return_code = 2
+                    return_code = max(return_code, 2)
+
             self._suite.return_code = return_code
 
-    def _run_tests(self, test_queue, setup_flag, teardown_flag):
+    def _run_tests(self, test_queue, setup_flag, teardown_flag, hook_failure_flag):
         """Start a thread for each Job instance and block until all of the tests are run.
 
         Returns a (combined report, user interrupted) pair, where the
@@ -178,7 +189,8 @@ class TestSuiteExecutor(object):  # pylint: disable=too-many-instance-attributes
             for job in self._jobs:
                 thr = threading.Thread(
                     target=job, args=(test_queue, interrupt_flag), kwargs=dict(
-                        setup_flag=setup_flag, teardown_flag=teardown_flag))
+                        setup_flag=setup_flag, teardown_flag=teardown_flag,
+                        hook_failure_flag=hook_failure_flag))
                 # Do not wait for tests to finish executing if interrupted by the user.
                 thr.daemon = True
                 thr.start()
diff --git a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py
index 0cc4870c3a7..2eb17d9c6a0 100644
--- a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py
+++ b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py
@@ -1,4 +1,4 @@
-"""Test hook for running the dummy replicator on two clusters."""
+"""Test implementation for running the dummy replicator on two clusters."""
 import copy
 import os.path
 
diff --git a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py
index 2e610d9c162..da35679f3af 100644
--- a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py
+++ b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py
@@ -56,9 +56,9 @@ class ClusterToClusterReplication(interface.Hook):  # pylint: disable=too-many-i
     def before_suite(self, test_report):
         """Before suite."""
         if not self._fixture:
-            raise ValueError("No ClusterToClusterReplication to run migrations on")
-        # The replicator must be called here to tell it to start replicating data.
-        self.logger.info("Starting the cluster to cluster replicator.")
+            raise ValueError("No ClusterToClusterFixture to run migrations on")
+
+        self.logger.info("Setting up cluster to cluster test data.")
 
         # Set up the initial replication direction.
         clusters = self._fixture.get_independent_clusters()
@@ -78,35 +78,37 @@ class ClusterToClusterReplication(interface.Hook):  # pylint: disable=too-many-i
     def after_suite(self, test_report, teardown_flag=None):
         """After suite."""
         # If the total number of tests was not an exact multiple of the number run per cycle, then
-        # run the data consistency check again.
+        # pause the replicator first.
         if self._test_num % self._tests_per_cycle != 0:
             # Pause the dummy replicator first.
             self.logger.info("Pausing replicator before stopping.")
-            self._replicator.pause()
+            self._run_replicator_action(test_report, self._replicator.pause)
             self.logger.info("Finished pausing the replicator.")
 
-            self.logger.info("Stopping the cluster to cluster replicator.")
-            stop_options = {
-                "test": self._last_test, "test_report": test_report,
-                "shell_options": self._shell_options
-            }
-            self._replicator.stop(stop_options)
-            self.logger.info("Stopped the cluster to cluster replicator.")
+        self.logger.info("Stopping the cluster to cluster replicator.")
+        stop_options = {
+            "test": self._last_test, "test_report": test_report,
+            "shell_options": self._shell_options
+        }
+        self._run_replicator_action(test_report, self._replicator.stop, stop_options)
+        self.logger.info("Stopped the cluster to cluster replicator.")
 
-            self._run_data_consistency_check(self._last_test, test_report)
-            self._run_check_repl_db_hash(self._last_test, test_report)
+        self._run_data_consistency_check(self._last_test, test_report)
+        self._run_check_repl_db_hash(self._last_test, test_report)
 
     def before_test(self, test, test_report):
         """Before test."""
         if self._test_num == 0:
             self.logger.info("Starting the replicator.")
-            self._replicator.start()
+            self._run_replicator_action(test_report, self._replicator.start)
+            self.logger.info("Started the replicator.")
             return
 
         if self._test_num % self._tests_per_cycle == 0:
             # The replicator should be told to start running once again.
             self.logger.info("Resuming the cluster to cluster replicator.")
-            self._replicator.resume()
+            self._run_replicator_action(test_report, self._replicator.resume)
+            self.logger.info("Resumed the cluster to cluster replicator.")
 
     def after_test(self, test, test_report):
         """After test."""
@@ -117,7 +119,7 @@ class ClusterToClusterReplication(interface.Hook):  # pylint: disable=too-many-i
         # data across the clusters, so that a data consistency check can be performed.
         if self._test_num % self._tests_per_cycle == 0:
             self.logger.info("Pausing the cluster to cluster replicator.")
-            self._replicator.pause()
+            self._run_replicator_action(test_report, self._replicator.pause)
             self.logger.info("Paused the cluster to cluster replicator.")
 
             self._run_data_consistency_check(test, test_report)
@@ -139,3 +141,28 @@ class ClusterToClusterReplication(interface.Hook):  # pylint: disable=too-many-i
         check_db_hash.before_test(test, test_report)
         check_db_hash.after_test(test, test_report)
         check_db_hash.after_suite(test_report)
+
+    def _run_replicator_action(self, test_report, action, action_options=None):
+        replicator_action_case = _ReplicatorActionTestCase(self.logger, self._last_test, self,
+                                                           action, action_options)
+        replicator_action_case.run_dynamic_test(test_report)
+
+
+class _ReplicatorActionTestCase(interface.DynamicTestCase):
+    """_ReplicatorActionTestCase class, to run a replicator action as a test."""
+
+    def __init__(  # pylint: disable=too-many-arguments
+            self, logger, base_test_name, hook, action, action_options):
+        """Initialize _ReplicatorActionTestCase."""
+        interface.DynamicTestCase.__init__(self, logger, "replicator_action",
+                                           "Run a replicator action.", base_test_name, hook)
+        self._action = action
+        self._action_options = action_options
+
+    def run_test(self):
+        try:
+            self._action(self._action_options)
+        except:
+            self.logger.exception("Failed to run replicator action '%s' with options '%s'",
+                                  self._action, self._action_options)
+            raise
diff --git a/buildscripts/resmokelib/testing/job.py b/buildscripts/resmokelib/testing/job.py
index 3a2ac3dbedf..6b40cbdbb92 100644
--- a/buildscripts/resmokelib/testing/job.py
+++ b/buildscripts/resmokelib/testing/job.py
@@ -48,7 +48,8 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
         # Drain the queue to unblock the main thread.
         Job._drain_queue(queue)
 
-    def __call__(self, queue, interrupt_flag, setup_flag=None, teardown_flag=None):
+    def __call__(self, queue, interrupt_flag, setup_flag=None, teardown_flag=None,
+                 hook_failure_flag=None):
         """Continuously execute tests from 'queue' and records their details in 'report'.
 
         If 'setup_flag' is not None, then a test to set up the fixture will be run
@@ -81,7 +82,7 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
 
         if setup_succeeded:
             try:
-                self._run(queue, interrupt_flag, teardown_flag)
+                self._run(queue, interrupt_flag, teardown_flag, hook_failure_flag)
             except errors.StopExecution as err:
                 # Stop running tests immediately.
                 self.logger.error("Received a StopExecution exception: %s.", err)
@@ -117,26 +118,24 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
         """Get current time to aid in the unit testing of the _run method."""
         return time.time()
 
-    def _run(self, queue, interrupt_flag, teardown_flag=None):
+    def _run(self, queue, interrupt_flag, teardown_flag=None, hook_failure_flag=None):
         """Call the before/after suite hooks and continuously execute tests from 'queue'."""
 
-        for hook in self.hooks:
-            hook.before_suite(self.report)
+        self._run_hooks_before_suite(hook_failure_flag)
 
         while not queue.empty() and not interrupt_flag.is_set():
             queue_elem = queue.get_nowait()
             test_time_start = self._get_time()
             try:
                 test = queue_elem.testcase
-                self._execute_test(test)
+                self._execute_test(test, hook_failure_flag)
             finally:
                 queue_elem.job_completed(self._get_time() - test_time_start)
                 queue.task_done()
 
             self._requeue_test(queue, queue_elem, interrupt_flag)
 
-        for hook in self.hooks:
-            hook.after_suite(self.report, teardown_flag)
+        self._run_hooks_after_suite(teardown_flag, hook_failure_flag)
 
     def _log_requeue_test(self, queue_elem):
         """Log the requeue of a test."""
@@ -165,11 +164,11 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
             self._log_requeue_test(queue_elem)
             queue.put(queue_elem)
 
-    def _execute_test(self, test):
+    def _execute_test(self, test, hook_failure_flag):
         """Call the before/after test hooks and execute 'test'."""
 
         test.configure(self.fixture, config.NUM_CLIENTS_PER_FIXTURE)
-        self._run_hooks_before_tests(test)
+        self._run_hooks_before_tests(test, hook_failure_flag)
         self.report.logging_prefix = create_fixture_table(self.fixture)
 
         test(self.report)
@@ -199,26 +198,51 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
 
             # Stop background hooks first since they can interfere with fixture startup and teardown
             # done as part of archival.
-            self._run_hooks_after_tests(test, background=True)
+            self._run_hooks_after_tests(test, hook_failure_flag, background=True)
 
             if self.archival:
                 result = TestResult(test=test, hook=None, success=success)
                 self.archival.archive(self.logger, result, self.manager)
 
-            self._run_hooks_after_tests(test, background=False)
+            self._run_hooks_after_tests(test, hook_failure_flag, background=False)
 
-    def _run_hook(self, hook, hook_function, test):
+    def _run_hook(self, hook, hook_function, test, hook_failure_flag):
         """Provide helper to run hook and archival."""
         try:
             success = False
             hook_function(test, self.report)
             success = True
         finally:
+            if not success and hook_failure_flag is not None:
+                hook_failure_flag.set()
+
             if self.archival:
                 result = TestResult(test=test, hook=hook, success=success)
                 self.archival.archive(self.logger, result, self.manager)
 
-    def _run_hooks_before_tests(self, test):
+    def _run_hooks_before_suite(self, hook_failure_flag):
+        """Run the before_suite method on each of the hooks."""
+        hooks_failed = True
+        try:
+            for hook in self.hooks:
+                hook.before_suite(self.report)
+            hooks_failed = False
+        finally:
+            if hooks_failed and hook_failure_flag is not None:
+                hook_failure_flag.set()
+
+    def _run_hooks_after_suite(self, teardown_flag, hook_failure_flag):
+        """Run the after_suite method on each of the hooks."""
+        hooks_failed = True
+        try:
+            for hook in self.hooks:
+                hook.after_suite(self.report, teardown_flag)
+            hooks_failed = False
+        finally:
+            if hooks_failed and hook_failure_flag is not None:
+                hook_failure_flag.set()
+
+    def _run_hooks_before_tests(self, test, hook_failure_flag):
         """Run the before_test method on each of the hooks.
 
         Swallows any TestFailure exceptions if set to continue on
@@ -226,7 +250,7 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
         """
         try:
             for hook in self.hooks:
-                self._run_hook(hook, hook.before_test, test)
+                self._run_hook(hook, hook.before_test, test, hook_failure_flag)
 
         except errors.StopExecution:
             raise
@@ -251,7 +275,7 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
             self.report.stopTest(test)
             raise
 
-    def _run_hooks_after_tests(self, test, background=False):
+    def _run_hooks_after_tests(self, test, hook_failure_flag, background=False):
         """Run the after_test method on each of the hooks.
 
         Swallows any TestFailure exceptions if set to continue on
@@ -263,7 +287,7 @@ class Job(object):  # pylint: disable=too-many-instance-attributes
         try:
             for hook in self.hooks:
                 if hook.IS_BACKGROUND == background:
-                    self._run_hook(hook, hook.after_test, test)
+                    self._run_hook(hook, hook.after_test, test, hook_failure_flag)
 
         except errors.StopExecution:
             raise
diff --git a/buildscripts/tests/resmokelib/testing/test_job.py b/buildscripts/tests/resmokelib/testing/test_job.py
index e2d130c3fc8..699f19bd133 100644
--- a/buildscripts/tests/resmokelib/testing/test_job.py
+++ b/buildscripts/tests/resmokelib/testing/test_job.py
@@ -198,7 +198,7 @@ class UnitJob(job.Job):  # pylint: disable=too-many-instance-attributes
         self.total_test_num = 0
         self.tests = {}
 
-    def _execute_test(self, test):
+    def _execute_test(self, test, hook_failure_flag=None):
         self.total_test_num += 1
         if test.test_name not in self.tests:
             self.tests[test.test_name] = 0
author	Vishnu Kaushik <vishnu.kaushik@mongodb.com>	2021-11-03 14:33:46 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-11-03 15:52:49 +0000
commit	0d73b2dc01f302a73f5805cce0fab5c3a86aeb41 (patch)
tree	8da409061221e28ef23d3ab2bb38a44deb8ce8bb
parent	75242734fdbec90c6a55a57165295b62c173b179 (diff)
download	mongo-0d73b2dc01f302a73f5805cce0fab5c3a86aeb41.tar.gz