summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Vishnu Kaushik <vishnu.kaushik@mongodb.com>, 2021-11-03 14:33:46 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com>, 2021-11-03 15:52:49 +0000
commit: 0d73b2dc01f302a73f5805cce0fab5c3a86aeb41 (patch)
tree: 8da409061221e28ef23d3ab2bb38a44deb8ce8bb
parent: 75242734fdbec90c6a55a57165295b62c173b179 (diff)
download: mongo-0d73b2dc01f302a73f5805cce0fab5c3a86aeb41.tar.gz
SERVER-61080 Avoid failing replicator tasks for test failures unrelated to the replicator
-rw-r--r-- buildscripts/resmokelib/testing/executor.py | 28
-rw-r--r-- buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py | 2
-rw-r--r-- buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py | 61
-rw-r--r-- buildscripts/resmokelib/testing/job.py | 58
-rw-r--r-- buildscripts/tests/resmokelib/testing/test_job.py | 2
5 files changed, 107 insertions, 44 deletions
diff --git a/buildscripts/resmokelib/testing/executor.py b/buildscripts/resmokelib/testing/executor.py
index 4fd76ea7093..65f04dd18a2 100644
--- a/buildscripts/resmokelib/testing/executor.py
+++ b/buildscripts/resmokelib/testing/executor.py
@@ -88,7 +88,7 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
n_jobs_to_start = self._num_jobs_to_start(self._suite, num_tests)
return [self._make_job(job_num) for job_num in range(n_jobs_to_start)]
- def run(self):
+ def run(self): # pylint: disable=too-many-branches
"""Execute the test suite.
Any exceptions that occur during setting up or tearing down a
@@ -104,6 +104,7 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
# a test suite run earlier can be reused during this current test suite.
network.PortAllocator.reset()
teardown_flag = None
+ hook_failure_flag = None
try:
num_repeat_suites = self._suite.options.num_repeat_suites
while num_repeat_suites > 0:
@@ -117,30 +118,39 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
# still running if an Evergreen task were to time out from a hang/deadlock being
# triggered.
teardown_flag = threading.Event() if num_repeat_suites == 1 else None
- (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag)
+ # We use the 'hook_failure_flag' to distinguish hook failures from other failures,
+ # so that we can return a separate return code when a hook has failed.
+ hook_failure_flag = threading.Event()
+ (report, interrupted) = self._run_tests(test_queue, setup_flag, teardown_flag,
+ hook_failure_flag)
self._suite.record_test_end(report)
if setup_flag and setup_flag.is_set():
self.logger.error("Setup of one of the job fixtures failed")
- return_code = 2
+ return_code = max(return_code, 2)
return
# Remove the setup flag once the first suite ran.
setup_flag = None
+ if hook_failure_flag.is_set():
+ # The hook failure return code is highest so it will take precedence when
+ # reported.
+ return_code = max(return_code, 3)
+
# If the user triggered a KeyboardInterrupt, then we should stop.
if interrupted:
raise errors.UserInterrupt("Received interrupt from user")
if teardown_flag and teardown_flag.is_set():
- return_code = 2
+ return_code = max(return_code, 2)
sb = [] # String builder.
self._suite.summarize_latest(sb)
self.logger.info("Summary of latest execution: %s", "\n ".join(sb))
if not report.wasSuccessful():
- return_code = 1
+ return_code = max(return_code, 1)
if self._suite.options.fail_fast:
break
@@ -159,10 +169,11 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
finally:
if not teardown_flag:
if not self._teardown_fixtures():
- return_code = 2
+ return_code = max(return_code, 2)
+
self._suite.return_code = return_code
- def _run_tests(self, test_queue, setup_flag, teardown_flag):
+ def _run_tests(self, test_queue, setup_flag, teardown_flag, hook_failure_flag):
"""Start a thread for each Job instance and block until all of the tests are run.
Returns a (combined report, user interrupted) pair, where the
@@ -178,7 +189,8 @@ class TestSuiteExecutor(object): # pylint: disable=too-many-instance-attributes
for job in self._jobs:
thr = threading.Thread(
target=job, args=(test_queue, interrupt_flag), kwargs=dict(
- setup_flag=setup_flag, teardown_flag=teardown_flag))
+ setup_flag=setup_flag, teardown_flag=teardown_flag,
+ hook_failure_flag=hook_failure_flag))
# Do not wait for tests to finish executing if interrupted by the user.
thr.daemon = True
thr.start()
diff --git a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py
index 0cc4870c3a7..2eb17d9c6a0 100644
--- a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py
+++ b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_dummy_replicator.py
@@ -1,4 +1,4 @@
-"""Test hook for running the dummy replicator on two clusters."""
+"""Test implementation for running the dummy replicator on two clusters."""
import copy
import os.path
diff --git a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py
index 2e610d9c162..da35679f3af 100644
--- a/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py
+++ b/buildscripts/resmokelib/testing/hooks/cluster_to_cluster_replication.py
@@ -56,9 +56,9 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i
def before_suite(self, test_report):
"""Before suite."""
if not self._fixture:
- raise ValueError("No ClusterToClusterReplication to run migrations on")
- # The replicator must be called here to tell it to start replicating data.
- self.logger.info("Starting the cluster to cluster replicator.")
+ raise ValueError("No ClusterToClusterFixture to run migrations on")
+
+ self.logger.info("Setting up cluster to cluster test data.")
# Set up the initial replication direction.
clusters = self._fixture.get_independent_clusters()
@@ -78,35 +78,37 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i
def after_suite(self, test_report, teardown_flag=None):
"""After suite."""
# If the total number of tests was not an exact multiple of the number run per cycle, then
- # run the data consistency check again.
+ # pause the replicator first.
if self._test_num % self._tests_per_cycle != 0:
# Pause the dummy replicator first.
self.logger.info("Pausing replicator before stopping.")
- self._replicator.pause()
+ self._run_replicator_action(test_report, self._replicator.pause)
self.logger.info("Finished pausing the replicator.")
- self.logger.info("Stopping the cluster to cluster replicator.")
- stop_options = {
- "test": self._last_test, "test_report": test_report,
- "shell_options": self._shell_options
- }
- self._replicator.stop(stop_options)
- self.logger.info("Stopped the cluster to cluster replicator.")
+ self.logger.info("Stopping the cluster to cluster replicator.")
+ stop_options = {
+ "test": self._last_test, "test_report": test_report,
+ "shell_options": self._shell_options
+ }
+ self._run_replicator_action(test_report, self._replicator.stop, stop_options)
+ self.logger.info("Stopped the cluster to cluster replicator.")
- self._run_data_consistency_check(self._last_test, test_report)
- self._run_check_repl_db_hash(self._last_test, test_report)
+ self._run_data_consistency_check(self._last_test, test_report)
+ self._run_check_repl_db_hash(self._last_test, test_report)
def before_test(self, test, test_report):
"""Before test."""
if self._test_num == 0:
self.logger.info("Starting the replicator.")
- self._replicator.start()
+ self._run_replicator_action(test_report, self._replicator.start)
+ self.logger.info("Started the replicator.")
return
if self._test_num % self._tests_per_cycle == 0:
# The replicator should be told to start running once again.
self.logger.info("Resuming the cluster to cluster replicator.")
- self._replicator.resume()
+ self._run_replicator_action(test_report, self._replicator.resume)
+ self.logger.info("Resumed the cluster to cluster replicator.")
def after_test(self, test, test_report):
"""After test."""
@@ -117,7 +119,7 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i
# data across the clusters, so that a data consistency check can be performed.
if self._test_num % self._tests_per_cycle == 0:
self.logger.info("Pausing the cluster to cluster replicator.")
- self._replicator.pause()
+ self._run_replicator_action(test_report, self._replicator.pause)
self.logger.info("Paused the cluster to cluster replicator.")
self._run_data_consistency_check(test, test_report)
@@ -139,3 +141,28 @@ class ClusterToClusterReplication(interface.Hook): # pylint: disable=too-many-i
check_db_hash.before_test(test, test_report)
check_db_hash.after_test(test, test_report)
check_db_hash.after_suite(test_report)
+
+ def _run_replicator_action(self, test_report, action, action_options=None):
+ replicator_action_case = _ReplicatorActionTestCase(self.logger, self._last_test, self,
+ action, action_options)
+ replicator_action_case.run_dynamic_test(test_report)
+
+
+class _ReplicatorActionTestCase(interface.DynamicTestCase):
+ """_ReplicatorActionTestCase class, to run a replicator action as a test."""
+
+ def __init__( # pylint: disable=too-many-arguments
+ self, logger, base_test_name, hook, action, action_options):
+ """Initialize _ReplicatorActionTestCase."""
+ interface.DynamicTestCase.__init__(self, logger, "replicator_action",
+ "Run a replicator action.", base_test_name, hook)
+ self._action = action
+ self._action_options = action_options
+
+ def run_test(self):
+ try:
+ self._action(self._action_options)
+ except:
+ self.logger.exception("Failed to run replicator action '%s' with options '%s'",
+ self._action, self._action_options)
+ raise
diff --git a/buildscripts/resmokelib/testing/job.py b/buildscripts/resmokelib/testing/job.py
index 3a2ac3dbedf..6b40cbdbb92 100644
--- a/buildscripts/resmokelib/testing/job.py
+++ b/buildscripts/resmokelib/testing/job.py
@@ -48,7 +48,8 @@ class Job(object): # pylint: disable=too-many-instance-attributes
# Drain the queue to unblock the main thread.
Job._drain_queue(queue)
- def __call__(self, queue, interrupt_flag, setup_flag=None, teardown_flag=None):
+ def __call__(self, queue, interrupt_flag, setup_flag=None, teardown_flag=None,
+ hook_failure_flag=None):
"""Continuously execute tests from 'queue' and records their details in 'report'.
If 'setup_flag' is not None, then a test to set up the fixture will be run
@@ -81,7 +82,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes
if setup_succeeded:
try:
- self._run(queue, interrupt_flag, teardown_flag)
+ self._run(queue, interrupt_flag, teardown_flag, hook_failure_flag)
except errors.StopExecution as err:
# Stop running tests immediately.
self.logger.error("Received a StopExecution exception: %s.", err)
@@ -117,26 +118,24 @@ class Job(object): # pylint: disable=too-many-instance-attributes
"""Get current time to aid in the unit testing of the _run method."""
return time.time()
- def _run(self, queue, interrupt_flag, teardown_flag=None):
+ def _run(self, queue, interrupt_flag, teardown_flag=None, hook_failure_flag=None):
"""Call the before/after suite hooks and continuously execute tests from 'queue'."""
- for hook in self.hooks:
- hook.before_suite(self.report)
+ self._run_hooks_before_suite(hook_failure_flag)
while not queue.empty() and not interrupt_flag.is_set():
queue_elem = queue.get_nowait()
test_time_start = self._get_time()
try:
test = queue_elem.testcase
- self._execute_test(test)
+ self._execute_test(test, hook_failure_flag)
finally:
queue_elem.job_completed(self._get_time() - test_time_start)
queue.task_done()
self._requeue_test(queue, queue_elem, interrupt_flag)
- for hook in self.hooks:
- hook.after_suite(self.report, teardown_flag)
+ self._run_hooks_after_suite(teardown_flag, hook_failure_flag)
def _log_requeue_test(self, queue_elem):
"""Log the requeue of a test."""
@@ -165,11 +164,11 @@ class Job(object): # pylint: disable=too-many-instance-attributes
self._log_requeue_test(queue_elem)
queue.put(queue_elem)
- def _execute_test(self, test):
+ def _execute_test(self, test, hook_failure_flag):
"""Call the before/after test hooks and execute 'test'."""
test.configure(self.fixture, config.NUM_CLIENTS_PER_FIXTURE)
- self._run_hooks_before_tests(test)
+ self._run_hooks_before_tests(test, hook_failure_flag)
self.report.logging_prefix = create_fixture_table(self.fixture)
test(self.report)
@@ -199,26 +198,51 @@ class Job(object): # pylint: disable=too-many-instance-attributes
# Stop background hooks first since they can interfere with fixture startup and teardown
# done as part of archival.
- self._run_hooks_after_tests(test, background=True)
+ self._run_hooks_after_tests(test, hook_failure_flag, background=True)
if self.archival:
result = TestResult(test=test, hook=None, success=success)
self.archival.archive(self.logger, result, self.manager)
- self._run_hooks_after_tests(test, background=False)
+ self._run_hooks_after_tests(test, hook_failure_flag, background=False)
- def _run_hook(self, hook, hook_function, test):
+ def _run_hook(self, hook, hook_function, test, hook_failure_flag):
"""Provide helper to run hook and archival."""
try:
success = False
hook_function(test, self.report)
success = True
finally:
+ if not success and hook_failure_flag is not None:
+ hook_failure_flag.set()
+
if self.archival:
result = TestResult(test=test, hook=hook, success=success)
self.archival.archive(self.logger, result, self.manager)
- def _run_hooks_before_tests(self, test):
+ def _run_hooks_before_suite(self, hook_failure_flag):
+ """Run the before_suite method on each of the hooks."""
+ hooks_failed = True
+ try:
+ for hook in self.hooks:
+ hook.before_suite(self.report)
+ hooks_failed = False
+ finally:
+ if hooks_failed and hook_failure_flag is not None:
+ hook_failure_flag.set()
+
+ def _run_hooks_after_suite(self, teardown_flag, hook_failure_flag):
+ """Run the after_suite method on each of the hooks."""
+ hooks_failed = True
+ try:
+ for hook in self.hooks:
+ hook.after_suite(self.report, teardown_flag)
+ hooks_failed = False
+ finally:
+ if hooks_failed and hook_failure_flag is not None:
+ hook_failure_flag.set()
+
+ def _run_hooks_before_tests(self, test, hook_failure_flag):
"""Run the before_test method on each of the hooks.
Swallows any TestFailure exceptions if set to continue on
@@ -226,7 +250,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes
"""
try:
for hook in self.hooks:
- self._run_hook(hook, hook.before_test, test)
+ self._run_hook(hook, hook.before_test, test, hook_failure_flag)
except errors.StopExecution:
raise
@@ -251,7 +275,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes
self.report.stopTest(test)
raise
- def _run_hooks_after_tests(self, test, background=False):
+ def _run_hooks_after_tests(self, test, hook_failure_flag, background=False):
"""Run the after_test method on each of the hooks.
Swallows any TestFailure exceptions if set to continue on
@@ -263,7 +287,7 @@ class Job(object): # pylint: disable=too-many-instance-attributes
try:
for hook in self.hooks:
if hook.IS_BACKGROUND == background:
- self._run_hook(hook, hook.after_test, test)
+ self._run_hook(hook, hook.after_test, test, hook_failure_flag)
except errors.StopExecution:
raise
diff --git a/buildscripts/tests/resmokelib/testing/test_job.py b/buildscripts/tests/resmokelib/testing/test_job.py
index e2d130c3fc8..699f19bd133 100644
--- a/buildscripts/tests/resmokelib/testing/test_job.py
+++ b/buildscripts/tests/resmokelib/testing/test_job.py
@@ -198,7 +198,7 @@ class UnitJob(job.Job): # pylint: disable=too-many-instance-attributes
self.total_test_num = 0
self.tests = {}
- def _execute_test(self, test):
+ def _execute_test(self, test, hook_failure_flag=None):
self.total_test_num += 1
if test.test_name not in self.tests:
self.tests[test.test_name] = 0