summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Tausif Rahman <tausif.rahman@mongodb.com> 2023-05-16 14:16:31 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-05-16 15:39:06 +0000
commit: 0249b20d05c58ceb47375413e9eb47e028859489 (patch)
tree: eae88e13a98cc5357decbd23772de695914c8e39
parent: c2853b882fb5130684a1e848248e26d4fa4961b9 (diff)
download: mongo-0249b20d05c58ceb47375413e9eb47e028859489.tar.gz
SERVER-76739 Archive data missing on evergreen task timeouts
-rw-r--r-- buildscripts/resmokelib/flags.py 5
-rwxr-xr-x buildscripts/resmokelib/hang_analyzer/hang_analyzer.py 6
-rw-r--r-- buildscripts/resmokelib/sighandler.py 29
-rw-r--r-- buildscripts/resmokelib/testing/hook_test_archival.py 7
-rw-r--r-- buildscripts/resmokelib/testing/report.py 6
-rw-r--r-- buildscripts/resmokelib/testing/symbolizer_service.py 7
6 files changed, 51 insertions, 9 deletions
diff --git a/buildscripts/resmokelib/flags.py b/buildscripts/resmokelib/flags.py
new file mode 100644
index 00000000000..6aff6961666
--- /dev/null
+++ b/buildscripts/resmokelib/flags.py
@@ -0,0 +1,5 @@
+"""Global flags used by resmoke."""
+
+import threading
+
+HANG_ANALYZER_CALLED = threading.Event()
diff --git a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
index d3f38e6374d..28dc4248467 100755
--- a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
+++ b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
@@ -68,6 +68,12 @@ class HangAnalyzer(Subcommand):
self._configure_processes()
self._setup_logging(logger)
+ def kill_rogue_processes(self):
+ """Kill the processes matched by this analyzer's process ids, interesting-process names and match option."""
+ processes = process_list.get_processes(self.process_ids, self.interesting_processes,
+ self.options.process_match, self.root_logger)
+ process.teardown_processes(self.root_logger, processes, dump_pids={})  # NOTE(review): empty dump_pids presumably means no core dumps are awaited here; confirm
+
def execute(self):
"""
Execute hang analysis.
diff --git a/buildscripts/resmokelib/sighandler.py b/buildscripts/resmokelib/sighandler.py
index 5df67812d06..609ea8a6e3e 100644
--- a/buildscripts/resmokelib/sighandler.py
+++ b/buildscripts/resmokelib/sighandler.py
@@ -10,10 +10,10 @@ import traceback
import psutil
+from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED
from buildscripts.resmokelib import reportfile
from buildscripts.resmokelib import testing
from buildscripts.resmokelib import config
-from buildscripts.resmokelib.hang_analyzer import hang_analyzer
from buildscripts.resmokelib import parser
_IS_WINDOWS = (sys.platform == "win32")
@@ -32,8 +32,8 @@ def register(logger, suites, start_time):
log suite summaries.
"""
+ HANG_ANALYZER_CALLED.set()
header_msg = "Dumping stacks due to SIGUSR1 signal"
-
_dump_and_log(header_msg)
def _handle_set_event(event_handle):
@@ -53,6 +53,7 @@ def register(logger, suites, start_time):
except win32event.error as err:
logger.error("Exception from win32event.WaitForSingleObject with error: %s" % err)
else:
+ HANG_ANALYZER_CALLED.set()
header_msg = "Dumping stacks due to signal from win32event.SetEvent"
_dump_and_log(header_msg)
@@ -159,4 +160,26 @@ def _analyze_pids(logger, pids):
if not os.getenv('ASAN_OPTIONS'):
hang_analyzer_args.append('-c')
_hang_analyzer = parser.parse_command_line(hang_analyzer_args, logger=logger)
- _hang_analyzer.execute()
+
+ # Evergreen allows 15 minutes for task timeout commands.
+ # Cap the hang analyzer at 12 minutes so resmoke has time left to finish up.
+ hang_analyzer_hard_timeout = None
+ if config.EVERGREEN_TASK_ID:
+ hang_analyzer_hard_timeout = 60 * 12
+ logger.info(
+ "Limit the resmoke invoked hang analyzer to 12 minutes so there is time for resmoke to finish up."
+ )
+
+ hang_analyzer_thread = threading.Thread(target=_hang_analyzer.execute, daemon=True)
+ hang_analyzer_thread.start()
+ hang_analyzer_thread.join(hang_analyzer_hard_timeout)
+
+ if hang_analyzer_thread.is_alive():
+ logger.warning(
+ "Resmoke invoked hang analyzer thread did not finish, but will continue running in the background. The thread may be disruputed and may show extraneous output."
+ )
+ logger.warning("Cleaning up resmoke child processes so that resmoke can fail gracefully.")
+ _hang_analyzer.kill_rogue_processes()
+
+ else:
+ logger.info("Done running resmoke invoked hang analyzer thread.")
diff --git a/buildscripts/resmokelib/testing/hook_test_archival.py b/buildscripts/resmokelib/testing/hook_test_archival.py
index 38909056ce3..23ebfda8de3 100644
--- a/buildscripts/resmokelib/testing/hook_test_archival.py
+++ b/buildscripts/resmokelib/testing/hook_test_archival.py
@@ -6,6 +6,7 @@ import threading
from buildscripts.resmokelib import config
from buildscripts.resmokelib import errors
from buildscripts.resmokelib import utils
+from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED
from buildscripts.resmokelib.utils import globstar
@@ -105,5 +106,9 @@ class HookTestArchival(object):
else:
logger.info("Archive succeeded for %s: %s", test_name, message)
- if not manager.setup_fixture(logger):
+ if HANG_ANALYZER_CALLED.is_set():
+ logger.info("Hang Analyzer has been called. Fixtures will not be restarted.")
+ raise errors.StopExecution(
+ "Hang analyzer has been called. Stopping further execution of tests.")
+ elif not manager.setup_fixture(logger):
raise errors.StopExecution("Error while restarting test fixtures after archiving.")
diff --git a/buildscripts/resmokelib/testing/report.py b/buildscripts/resmokelib/testing/report.py
index cc98664fabe..5dd2cbe870b 100644
--- a/buildscripts/resmokelib/testing/report.py
+++ b/buildscripts/resmokelib/testing/report.py
@@ -146,13 +146,9 @@ class TestReport(unittest.TestResult):
try:
# check if there are stacktrace files, if so, invoke the symbolizer here.
- # If there are no stacktrace files for this job, we do not need to invoke the symbolizer at all.
- # Take a lock to download the debug symbols if it hasn't already been downloaded.
# log symbolized output to test.logger.info()
-
symbolizer = ResmokeSymbolizer()
symbolizer.symbolize_test_logs(test)
- # symbolization completed
unittest.TestResult.stopTest(self, test)
@@ -404,7 +400,7 @@ class TestInfo(object):
self.evergreen_status = None
self.return_code = None
self.url_endpoint = None
- self.exception_extractors = None
+ self.exception_extractors = []
self.error = None
diff --git a/buildscripts/resmokelib/testing/symbolizer_service.py b/buildscripts/resmokelib/testing/symbolizer_service.py
index 86d190640e0..b4468ef3175 100644
--- a/buildscripts/resmokelib/testing/symbolizer_service.py
+++ b/buildscripts/resmokelib/testing/symbolizer_service.py
@@ -11,6 +11,7 @@ from threading import Lock
from typing import List, Optional, NamedTuple, Set
from buildscripts.resmokelib import config as _config
+from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED
from buildscripts.resmokelib.testing.testcases.interface import TestCase
# This lock prevents different resmoke jobs from symbolizing stacktraces concurrently,
@@ -148,6 +149,12 @@ class ResmokeSymbolizer:
test.logger.info("Running on MacOS, skipping symbolization")
return False
+ if HANG_ANALYZER_CALLED.is_set():
+ test.logger.info(
+ "Hang analyzer has been called, skipping symbolization to meet timeout constraints."
+ )
+ return False
+
return True
def get_stacktrace_dir(self, test: TestCase) -> Optional[str]: