From 963274b25452305779898d618102e3d1cacb562a Mon Sep 17 00:00:00 2001 From: Max Hirschhorn Date: Sat, 26 May 2018 01:12:41 -0400 Subject: SERVER-34779 Add hook for checking dbhash while a test is running. Enables the CheckReplDBHashInBackground hook in the replica_sets_jscore_passthrough.yml test suite. (cherry picked from commit 25f72cf5f1d0d894680ef855f646e27f234ce6a3) --- .../suites/replica_sets_jscore_passthrough.yml | 24 +++ .../resmokelib/testing/hooks/dbhash_background.py | 175 +++++++++++++++++++++ buildscripts/resmokelib/testing/hooks/interface.py | 2 +- buildscripts/resmokelib/testing/job.py | 7 +- .../resmokelib/testing/testcases/jstest.py | 2 +- 5 files changed, 207 insertions(+), 3 deletions(-) create mode 100644 buildscripts/resmokelib/testing/hooks/dbhash_background.py (limited to 'buildscripts') diff --git a/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml index 189a1655f63..d7222328cbe 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml @@ -9,11 +9,27 @@ selector: - jstests/core/opcounters_write_cmd.js - jstests/core/read_after_optime.js - jstests/core/capped_update.js + # These tests change the featureCompatibilityVersion which makes it so transactions aren't + # supported. + - jstests/core/txns/only_allow_transactions_on_FCV_40.js + - jstests/core/txns/abort_transactions_on_FCV_downgrade.js + # These tests change the transactionLifetimeLimitSeconds server parameter which conflicts with how + # the CheckReplDBHashInBackground hook doesn't want transactions to be reaped while it is running. + - jstests/core/txns/abort_expired_transaction.js + - jstests/core/txns/abort_transaction_thread_does_not_block_on_locks.js + # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} + # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" + # server parameter. + - jstests/core/set_param1.js + # TODO SERVER-35217: Attempting to kill a session while multiple transactions have been started on + # the server may lead to deadlock. + - jstests/core/txns/kill_sessions_kills_transaction.js executor: archive: hooks: - CheckReplDBHash + - CheckReplDBHashInBackground - CheckReplOplogs - ValidateCollections config: @@ -24,6 +40,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections @@ -35,4 +52,11 @@ executor: set_parameters: enableTestCommands: 1 numInitialSyncAttempts: 1 + # When running tests that intentionally trigger a DuplicateKeyError, we somehow end up + # performing an atClusterTime read in the CheckReplDBHashInBackground hook based on an + # operationTime that is greater than anything in the oplog. The periodic no-op writer must + # be enabled to ensure that we eventually reach the clusterTime we are waiting for. + # + # TODO SERVER-35156: Remove the following line to disable the periodic no-op writer. + writePeriodicNoops: 1 num_nodes: 2 diff --git a/buildscripts/resmokelib/testing/hooks/dbhash_background.py b/buildscripts/resmokelib/testing/hooks/dbhash_background.py new file mode 100644 index 00000000000..10d49ee8e3f --- /dev/null +++ b/buildscripts/resmokelib/testing/hooks/dbhash_background.py @@ -0,0 +1,175 @@ +"""Test hook for verifying data consistency across a replica set. + +Unlike dbhash.py, this version of the hook runs continously in a background thread while the test is +running. +""" + +from __future__ import absolute_import + +import os.path +import sys +import threading + +from buildscripts.resmokelib import errors +from buildscripts.resmokelib.testing.hooks import jsfile +from buildscripts.resmokelib.testing.testcases import interface as testcase + + +class CheckReplDBHashInBackground(jsfile.JSHook): + """A hook for comparing the dbhashes of all replica set members while a test is running.""" + + def __init__(self, hook_logger, fixture, shell_options=None): + """Initialize CheckReplDBHashInBackground.""" + description = "Check dbhashes of all replica set members while a test is running" + js_filename = os.path.join("jstests", "hooks", "run_check_repl_dbhash_background.js") + jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description, + shell_options=shell_options) + + self._background_job = None + + def before_suite(self, test_report): + """Start the background thread.""" + client = self.fixture.mongo_client() + server_status = client.admin.command("serverStatus") + if not server_status["storageEngine"].get("supportsSnapshotReadConcern", False): + self.logger.info("Not enabling the background thread because '%s' storage engine" + " doesn't support snapshot reads.", + server_status["storageEngine"]["name"]) + return + + self._background_job = _BackgroundJob() + self.logger.info("Starting the background thread.") + self._background_job.start() + + def after_suite(self, test_report): + """Signal the background thread to exit, and wait until it does.""" + if self._background_job is None: + return + + self.logger.info("Stopping the background thread.") + self._background_job.stop() + + def before_test(self, test, test_report): + """Instruct the background thread to run the dbhash check while 'test' is also running.""" + if self._background_job is None: + return + + hook_test_case = _ContinuousDynamicJSTestCase.create_before_test( + self.logger.test_case_logger, test, self, self._js_filename, self._shell_options) + hook_test_case.configure(self.fixture) + + self.logger.info("Resuming the background thread.") + self._background_job.resume(hook_test_case, test_report) + + def after_test(self, test, test_report): # noqa: D205,D400 + """Instruct the background thread to stop running the dbhash check now that 'test' has + finished running. + """ + if self._background_job is None: + return + + self.logger.info("Pausing the background thread.") + self._background_job.pause() + + if self._background_job.exc_info is not None: + if isinstance(self._background_job.exc_info[1], errors.TestFailure): + # If the mongo shell process running the JavaScript file exited with a non-zero + # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's + # test execution to stop. + raise errors.ServerFailure(self._background_job.exc_info[1].args[0]) + else: + self.logger.error("Encountered an error inside the background thread.", + exc_info=self._background_job.exc_info) + raise self._background_job.exc_info[1] + + +class _BackgroundJob(threading.Thread): # pylint: disable=too-many-instance-attributes + """A thread for running the dbhash check while a test is running.""" + + def __init__(self): + """Initialize _BackgroundJob.""" + threading.Thread.__init__(self, name="CheckReplDBHashInBackground") + self.daemon = True + + self._lock = threading.Lock() + self._cond = threading.Condition(self._lock) + + self._should_stop = False + self._should_resume = False + self._is_idle = True + + self._hook_test_case = None + self._test_report = None + self.exc_info = None + + def run(self): + while True: + with self._lock: + while not self._should_resume: + self._cond.wait() + + # We have the background thread set 'self._should_resume' back to false to ensure + # that 'self._hook_test_case.run_dynamic_test()' is only called once before the + # resume() method is called again. + self._should_resume = False + + if self._should_stop: + break + + # We are about to execute 'self._hook_test_case' so we mark the background thread as + # no longer being idle. + self._is_idle = False + + try: + self._hook_test_case.run_dynamic_test(self._test_report) + except: # pylint: disable=bare-except + self.exc_info = sys.exc_info() + finally: + with self._lock: + self._is_idle = True + self._cond.notify_all() + + def stop(self): + """Signal the background thread to exit, and wait until it does.""" + with self._lock: + self._should_stop = True + self.resume(hook_test_case=None, test_report=None) + self.join() + + def pause(self): # noqa: D205,D400 + """Signal the background thread that it should stop executing 'self._hook_test_case', and + wait until the current execution has finished. + """ + self._hook_test_case.signal_stop_test() + with self._lock: + while not self._is_idle: + self._cond.wait() + + def resume(self, hook_test_case, test_report): + """Instruct the background thread to start executing 'hook_test_case'.""" + self._hook_test_case = hook_test_case + self._test_report = test_report + self.exc_info = None + with self._lock: + self._should_resume = True + self._cond.notify_all() + + +class _ContinuousDynamicJSTestCase(jsfile.DynamicJSTestCase): + """A dynamic TestCase that runs a JavaScript file repeatedly.""" + + def __init__(self, *args, **kwargs): + """Initialize _ContinuousDynamicJSTestCase.""" + jsfile.DynamicJSTestCase.__init__(self, *args, **kwargs) + self._should_stop = threading.Event() + + def run_test(self): + """Execute the test repeatedly.""" + while not self._should_stop.is_set(): + jsfile.DynamicJSTestCase.run_test(self) + + def signal_stop_test(self): # noqa: D205,D400 + """Indicate to the thread executing the run_test() method that the current execution is the + last one. This method returns without waiting for the current execution to finish. + """ + self._should_stop.set() diff --git a/buildscripts/resmokelib/testing/hooks/interface.py b/buildscripts/resmokelib/testing/hooks/interface.py index 8e938355001..0c2f65077bc 100644 --- a/buildscripts/resmokelib/testing/hooks/interface.py +++ b/buildscripts/resmokelib/testing/hooks/interface.py @@ -78,7 +78,7 @@ class DynamicTestCase(testcase.TestCase): # pylint: disable=abstract-method self.return_code = 1 self.logger.exception("{0} failed".format(self.description)) test_report.addFailure(self, sys.exc_info()) - raise errors.TestFailure(err) + raise errors.TestFailure(err.args[0]) except: self.return_code = 2 test_report.addFailure(self, sys.exc_info()) diff --git a/buildscripts/resmokelib/testing/job.py b/buildscripts/resmokelib/testing/job.py index 41e80bbee1b..f5586d8dfc6 100644 --- a/buildscripts/resmokelib/testing/job.py +++ b/buildscripts/resmokelib/testing/job.py @@ -95,7 +95,12 @@ class Job(object): test(self.report) try: - if self.suite_options.fail_fast and not self.report.wasSuccessful(): + # We are intentionally only checking the individual 'test' status and not calling + # report.wasSuccessful() here. It is possible that a thread running in the background as + # part of a hook has added a failed test case to 'self.report'. Checking the individual + # 'test' status ensures self._run_hooks_after_tests() is called if it is a hook's test + # case that has failed and not 'test' that has failed. + if self.suite_options.fail_fast and self.report.find_test_info(test).status != "pass": self.logger.info("%s failed, so stopping..." % (test.short_description())) raise errors.StopExecution("%s failed" % (test.short_description())) diff --git a/buildscripts/resmokelib/testing/testcases/jstest.py b/buildscripts/resmokelib/testing/testcases/jstest.py index ae12ac34ed6..0d150930a44 100644 --- a/buildscripts/resmokelib/testing/testcases/jstest.py +++ b/buildscripts/resmokelib/testing/testcases/jstest.py @@ -224,7 +224,7 @@ class JSTestCase(interface.ProcessTestCase): self.logger.error( "Encountered an error inside thread %d running jstest %s.", thread_id, self.basename(), exc_info=thread.exc_info) - raise thread.exc_info + raise thread.exc_info[1] def run_test(self): """Execute the test.""" -- cgit v1.2.1