diff options
author | Tausif Rahman <tausif.rahman@mongodb.com> | 2023-05-16 14:16:31 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-05-16 15:39:06 +0000 |
commit | 0249b20d05c58ceb47375413e9eb47e028859489 (patch) | |
tree | eae88e13a98cc5357decbd23772de695914c8e39 | |
parent | c2853b882fb5130684a1e848248e26d4fa4961b9 (diff) | |
download | mongo-0249b20d05c58ceb47375413e9eb47e028859489.tar.gz |
SERVER-76739 Archive data missing on evergreen task timeouts
-rw-r--r-- | buildscripts/resmokelib/flags.py | 5 | ||||
-rwxr-xr-x | buildscripts/resmokelib/hang_analyzer/hang_analyzer.py | 6 | ||||
-rw-r--r-- | buildscripts/resmokelib/sighandler.py | 29 | ||||
-rw-r--r-- | buildscripts/resmokelib/testing/hook_test_archival.py | 7 | ||||
-rw-r--r-- | buildscripts/resmokelib/testing/report.py | 6 | ||||
-rw-r--r-- | buildscripts/resmokelib/testing/symbolizer_service.py | 7 |
6 files changed, 51 insertions, 9 deletions
diff --git a/buildscripts/resmokelib/flags.py b/buildscripts/resmokelib/flags.py new file mode 100644 index 00000000000..6aff6961666 --- /dev/null +++ b/buildscripts/resmokelib/flags.py @@ -0,0 +1,5 @@ +"""Global flags used by resmoke.""" + +import threading + +HANG_ANALYZER_CALLED = threading.Event() diff --git a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py index d3f38e6374d..28dc4248467 100755 --- a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py +++ b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py @@ -68,6 +68,12 @@ class HangAnalyzer(Subcommand): self._configure_processes() self._setup_logging(logger) + def kill_rogue_processes(self): + """Kill any processes that are currently being analyzed.""" + processes = process_list.get_processes(self.process_ids, self.interesting_processes, + self.options.process_match, self.root_logger) + process.teardown_processes(self.root_logger, processes, dump_pids={}) + def execute(self): """ Execute hang analysis. diff --git a/buildscripts/resmokelib/sighandler.py b/buildscripts/resmokelib/sighandler.py index 5df67812d06..609ea8a6e3e 100644 --- a/buildscripts/resmokelib/sighandler.py +++ b/buildscripts/resmokelib/sighandler.py @@ -10,10 +10,10 @@ import traceback import psutil +from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED from buildscripts.resmokelib import reportfile from buildscripts.resmokelib import testing from buildscripts.resmokelib import config -from buildscripts.resmokelib.hang_analyzer import hang_analyzer from buildscripts.resmokelib import parser _IS_WINDOWS = (sys.platform == "win32") @@ -32,8 +32,8 @@ def register(logger, suites, start_time): log suite summaries. """ + HANG_ANALYZER_CALLED.set() header_msg = "Dumping stacks due to SIGUSR1 signal" - _dump_and_log(header_msg) def _handle_set_event(event_handle): @@ -53,6 +53,7 @@ def register(logger, suites, start_time): except win32event.error as err: logger.error("Exception from win32event.WaitForSingleObject with error: %s" % err) else: + HANG_ANALYZER_CALLED.set() header_msg = "Dumping stacks due to signal from win32event.SetEvent" _dump_and_log(header_msg) @@ -159,4 +160,26 @@ def _analyze_pids(logger, pids): if not os.getenv('ASAN_OPTIONS'): hang_analyzer_args.append('-c') _hang_analyzer = parser.parse_command_line(hang_analyzer_args, logger=logger) - _hang_analyzer.execute() + + # Evergreen has a 15 minute timeout for task timeout commands + # Limit the hang analyzer to 12 minutes so there is time for other tasks. + hang_analyzer_hard_timeout = None + if config.EVERGREEN_TASK_ID: + hang_analyzer_hard_timeout = 60 * 12 + logger.info( + "Limit the resmoke invoked hang analyzer to 12 minutes so there is time for resmoke to finish up." + ) + + hang_analyzer_thread = threading.Thread(target=_hang_analyzer.execute, daemon=True) + hang_analyzer_thread.start() + hang_analyzer_thread.join(hang_analyzer_hard_timeout) + + if hang_analyzer_thread.is_alive(): + logger.warning( + "Resmoke invoked hang analyzer thread did not finish, but will continue running in the background. The thread may be disruputed and may show extraneous output." + ) + logger.warning("Cleaning up resmoke child processes so that resmoke can fail gracefully.") + _hang_analyzer.kill_rogue_processes() + + else: + logger.info("Done running resmoke invoked hang analyzer thread.") diff --git a/buildscripts/resmokelib/testing/hook_test_archival.py b/buildscripts/resmokelib/testing/hook_test_archival.py index 38909056ce3..23ebfda8de3 100644 --- a/buildscripts/resmokelib/testing/hook_test_archival.py +++ b/buildscripts/resmokelib/testing/hook_test_archival.py @@ -6,6 +6,7 @@ import threading from buildscripts.resmokelib import config from buildscripts.resmokelib import errors from buildscripts.resmokelib import utils +from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED from buildscripts.resmokelib.utils import globstar @@ -105,5 +106,9 @@ class HookTestArchival(object): else: logger.info("Archive succeeded for %s: %s", test_name, message) - if not manager.setup_fixture(logger): + if HANG_ANALYZER_CALLED.is_set(): + logger.info("Hang Analyzer has been called. Fixtures will not be restarted.") + raise errors.StopExecution( + "Hang analyzer has been called. Stopping further execution of tests.") + elif not manager.setup_fixture(logger): raise errors.StopExecution("Error while restarting test fixtures after archiving.") diff --git a/buildscripts/resmokelib/testing/report.py b/buildscripts/resmokelib/testing/report.py index cc98664fabe..5dd2cbe870b 100644 --- a/buildscripts/resmokelib/testing/report.py +++ b/buildscripts/resmokelib/testing/report.py @@ -146,13 +146,9 @@ class TestReport(unittest.TestResult): try: # check if there are stacktrace files, if so, invoke the symbolizer here. - # If there are no stacktrace files for this job, we do not need to invoke the symbolizer at all. - # Take a lock to download the debug symbols if it hasn't already been downloaded. # log symbolized output to test.logger.info() - symbolizer = ResmokeSymbolizer() symbolizer.symbolize_test_logs(test) - # symbolization completed unittest.TestResult.stopTest(self, test) @@ -404,7 +400,7 @@ class TestInfo(object): self.evergreen_status = None self.return_code = None self.url_endpoint = None - self.exception_extractors = None + self.exception_extractors = [] self.error = None diff --git a/buildscripts/resmokelib/testing/symbolizer_service.py b/buildscripts/resmokelib/testing/symbolizer_service.py index 86d190640e0..b4468ef3175 100644 --- a/buildscripts/resmokelib/testing/symbolizer_service.py +++ b/buildscripts/resmokelib/testing/symbolizer_service.py @@ -11,6 +11,7 @@ from threading import Lock from typing import List, Optional, NamedTuple, Set from buildscripts.resmokelib import config as _config +from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED from buildscripts.resmokelib.testing.testcases.interface import TestCase # This lock prevents different resmoke jobs from symbolizing stacktraces concurrently, @@ -148,6 +149,12 @@ class ResmokeSymbolizer: test.logger.info("Running on MacOS, skipping symbolization") return False + if HANG_ANALYZER_CALLED.is_set(): + test.logger.info( + "Hang analyzer has been called, skipping symbolization to meet timeout constraints." + ) + return False + return True def get_stacktrace_dir(self, test: TestCase) -> Optional[str]: |