diff options
author | Carl Raiden Worley <carl.worley@10gen.com> | 2020-05-04 12:02:26 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-04 19:26:55 +0000 |
commit | c1081f22237d6b92aeaeadefd85614ef83f9fc89 (patch) | |
tree | 538070fa9b664892102b000108d049dfd95e92fc /buildscripts | |
parent | 47b3bc5d5927225bb1fd7767613b1cf74157f7c9 (diff) | |
download | mongo-c1081f22237d6b92aeaeadefd85614ef83f9fc89.tar.gz |
SERVER-46684 Repackage the hang-analyzer as a resmoke subcommand
Diffstat (limited to 'buildscripts')
-rwxr-xr-x | buildscripts/hang_analyzer.py | 805 | ||||
-rw-r--r-- | buildscripts/resmokelib/commands/__init__.py | 1 | ||||
-rwxr-xr-x | buildscripts/resmokelib/commands/hang_analyzer.py | 187 | ||||
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/__init__.py | 6 | ||||
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/dumper.py | 350 | ||||
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/extractor.py | 53 | ||||
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/process.py | 104 | ||||
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/process_list.py | 158 | ||||
-rw-r--r-- | buildscripts/resmokelib/parser.py | 71 |
9 files changed, 915 insertions, 820 deletions
diff --git a/buildscripts/hang_analyzer.py b/buildscripts/hang_analyzer.py index 8e812e5f23f..30eba2206e4 100755 --- a/buildscripts/hang_analyzer.py +++ b/buildscripts/hang_analyzer.py @@ -1,804 +1,7 @@ #!/usr/bin/env python3 -"""Hang Analyzer module. - -A prototype hang analyzer for Evergreen integration to help investigate test timeouts - -1. Script supports taking dumps, and/or dumping a summary of useful information about a process -2. Script will iterate through a list of interesting processes, - and run the tools from step 1. The list of processes can be provided as an option. -3. Java processes will be dumped using jstack, if available. - -Supports Linux, MacOS X, Solaris, and Windows. -""" - -import io -import csv -import glob -import itertools -import logging -import os -import platform -import re -import signal -import subprocess -import sys -import tempfile -import traceback -import time -from distutils import spawn # pylint: disable=no-name-in-module -from optparse import OptionParser -_IS_WINDOWS = (sys.platform == "win32") - -if _IS_WINDOWS: - import win32event - import win32api - -# Get relative imports to work when the package is not installed on the PYTHONPATH. -if __name__ == "__main__" and __package__ is None: - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from buildscripts.resmokelib import core - - -def call(args, logger): - """Call subprocess on args list.""" - logger.info(str(args)) - - # Use a common pipe for stdout & stderr for logging. - process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - logger_pipe = core.pipe.LoggerPipe(logger, logging.INFO, process.stdout) - logger_pipe.wait_until_started() - - ret = process.wait() - logger_pipe.wait_until_finished() - - if ret != 0: - logger.error("Bad exit code %d", ret) - raise Exception("Bad exit code %d from %s" % (ret, " ".join(args))) - - -def callo(args, logger): - """Call subprocess on args string.""" - logger.info("%s", str(args)) - - return subprocess.check_output(args).decode('utf-8', 'replace') - - -def find_program(prog, paths): - """Find the specified program in env PATH, or tries a set of paths.""" - for loc in paths: - full_prog = os.path.join(loc, prog) - if os.path.exists(full_prog): - return full_prog - - return spawn.find_executable(prog) - - -def get_process_logger(debugger_output, pid, process_name): - """Return the process logger from options specified.""" - process_logger = logging.Logger("process", level=logging.DEBUG) - process_logger.mongo_process_filename = None - - if 'stdout' in debugger_output: - s_handler = logging.StreamHandler(sys.stdout) - s_handler.setFormatter(logging.Formatter(fmt="%(message)s")) - process_logger.addHandler(s_handler) - - if 'file' in debugger_output: - filename = "debugger_%s_%d.log" % (os.path.splitext(process_name)[0], pid) - process_logger.mongo_process_filename = filename - f_handler = logging.FileHandler(filename=filename, mode="w") - f_handler.setFormatter(logging.Formatter(fmt="%(message)s")) - process_logger.addHandler(f_handler) - - return process_logger - - -class WindowsDumper(object): - """WindowsDumper class.""" - - @staticmethod - def __find_debugger(logger, debugger): - """Find the installed debugger.""" - # We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64 - cdb = spawn.find_executable(debugger) - if cdb is not None: - return cdb - from win32com.shell import shell, shellcon - - # Cygwin via sshd does not expose the normal environment variables - # Use the shell api to get the variable instead - root_dir = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0) - - # Construct the debugger search paths in most-recent order - debugger_paths = [os.path.join(root_dir, "Windows Kits", "10", "Debuggers", "x64")] - for idx in reversed(range(0, 2)): - debugger_paths.append( - os.path.join(root_dir, "Windows Kits", "8." + str(idx), "Debuggers", "x64")) - - for dbg_path in debugger_paths: - logger.info("Checking for debugger in %s", dbg_path) - if os.path.exists(dbg_path): - return os.path.join(dbg_path, debugger) - - return None - - def dump_info( # pylint: disable=too-many-arguments - self, root_logger, logger, pid, process_name, take_dump): - """Dump useful information to the console.""" - debugger = "cdb.exe" - dbg = self.__find_debugger(root_logger, debugger) - - if dbg is None: - root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid) - return - - root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, process_name, pid) - - dump_command = "" - if take_dump: - # Dump to file, dump_<process name>.<pid>.mdmp - dump_file = "dump_%s.%d.%s" % (os.path.splitext(process_name)[0], pid, - self.get_dump_ext()) - dump_command = ".dump /ma %s" % dump_file - root_logger.info("Dumping core to %s", dump_file) - - cmds = [ - ".symfix", # Fixup symbol path - ".symopt +0x10", # Enable line loading (off by default in CDB, on by default in WinDBG) - ".reload", # Reload symbols - "!peb", # Dump current exe, & environment variables - "lm", # Dump loaded modules - dump_command, - "!uniqstack -pn", # Dump All unique Threads with function arguments - "!cs -l", # Dump all locked critical sections - ".detach", # Detach - "q" # Quit - ] - - call([dbg, '-c', ";".join(cmds), '-p', str(pid)], logger) - - root_logger.info("Done analyzing %s process with PID %d", process_name, pid) - - @staticmethod - def get_dump_ext(): - """Return the dump file extension.""" - return "mdmp" - - -class WindowsProcessList(object): - """WindowsProcessList class.""" - - @staticmethod - def __find_ps(): - """Find tasklist.""" - return os.path.join(os.environ["WINDIR"], "system32", "tasklist.exe") - - def dump_processes(self, logger): - """Get list of [Pid, Process Name].""" - ps = self.__find_ps() - - logger.info("Getting list of processes using %s", ps) - - ret = callo([ps, "/FO", "CSV"], logger) - - buff = io.StringIO(ret) - csv_reader = csv.reader(buff) - - return [[int(row[1]), row[0]] for row in csv_reader if row[1] != "PID"] - - -# LLDB dumper is for MacOS X -class LLDBDumper(object): - """LLDBDumper class.""" - - @staticmethod - def __find_debugger(debugger): - """Find the installed debugger.""" - return find_program(debugger, ['/usr/bin']) - - def dump_info( # pylint: disable=too-many-arguments,too-many-locals - self, root_logger, logger, pid, process_name, take_dump): - """Dump info.""" - debugger = "lldb" - dbg = self.__find_debugger(debugger) - - if dbg is None: - root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid) - return - - root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, process_name, pid) - - lldb_version = callo([dbg, "--version"], logger) - - logger.info(lldb_version) - - # Do we have the XCode or LLVM version of lldb? - # Old versions of lldb do not work well when taking commands via a file - # XCode (7.2): lldb-340.4.119 - # LLVM - lldb version 3.7.0 ( revision ) - - if 'version' not in lldb_version: - # We have XCode's lldb - lldb_version = lldb_version[lldb_version.index("lldb-"):] - lldb_version = lldb_version.replace('lldb-', '') - lldb_major_version = int(lldb_version[:lldb_version.index('.')]) - if lldb_major_version < 340: - logger.warning("Debugger lldb is too old, please upgrade to XCode 7.2") - return - - dump_command = "" - if take_dump: - # Dump to file, dump_<process name>.<pid>.core - dump_file = "dump_%s.%d.%s" % (process_name, pid, self.get_dump_ext()) - dump_command = "process save-core %s" % dump_file - root_logger.info("Dumping core to %s", dump_file) - - cmds = [ - "attach -p %d" % pid, - "target modules list", - "thread backtrace all", - dump_command, - "settings set interpreter.prompt-on-quit false", - "quit", - ] - - tf = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') - - for cmd in cmds: - tf.write(cmd + "\n") - - tf.flush() - - # Works on in MacOS 10.9 & later - #call([dbg] + list( itertools.chain.from_iterable([['-o', b] for b in cmds])), logger) - call(['cat', tf.name], logger) - call([dbg, '--source', tf.name], logger) - - root_logger.info("Done analyzing %s process with PID %d", process_name, pid) - - @staticmethod - def get_dump_ext(): - """Return the dump file extension.""" - return "core" - - -class DarwinProcessList(object): - """DarwinProcessList class.""" - - @staticmethod - def __find_ps(): - """Find ps.""" - return find_program('ps', ['/bin']) - - def dump_processes(self, logger): - """Get list of [Pid, Process Name].""" - ps = self.__find_ps() - - logger.info("Getting list of processes using %s", ps) - - ret = callo([ps, "-axco", "pid,comm"], logger) - - buff = io.StringIO(ret) - csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True) - - return [[int(row[0]), row[1]] for row in csv_reader if row[0] != "PID"] - - -# GDB dumper is for Linux & Solaris -class GDBDumper(object): - """GDBDumper class.""" - - @staticmethod - def __find_debugger(debugger): - """Find the installed debugger.""" - return find_program(debugger, ['/opt/mongodbtoolchain/gdb/bin', '/usr/bin']) - - def dump_info( # pylint: disable=too-many-arguments,too-many-locals - self, root_logger, logger, pid, process_name, take_dump): - """Dump info.""" - debugger = "gdb" - dbg = self.__find_debugger(debugger) - - if dbg is None: - logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid) - return - - root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, process_name, pid) - - dump_command = "" - if take_dump: - # Dump to file, dump_<process name>.<pid>.core - dump_file = "dump_%s.%d.%s" % (process_name, pid, self.get_dump_ext()) - dump_command = "gcore %s" % dump_file - root_logger.info("Dumping core to %s", dump_file) - - call([dbg, "--version"], logger) - - script_dir = os.path.dirname(os.path.abspath(__file__)) - root_logger.info("dir %s", script_dir) - gdb_dir = os.path.join(script_dir, "gdb") - mongo_script = os.path.join(gdb_dir, "mongo.py") - mongo_printers_script = os.path.join(gdb_dir, "mongo_printers.py") - mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py") - - source_mongo = "source %s" % mongo_script - source_mongo_printers = "source %s" % mongo_printers_script - source_mongo_lock = "source %s" % mongo_lock_script - mongodb_dump_locks = "mongodb-dump-locks" - mongodb_show_locks = "mongodb-show-locks" - mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active" - mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \ - (process_name, pid) - mongodb_javascript_stack = "mongodb-javascript-stack" - mongod_dump_sessions = "mongod-dump-sessions" - - # The following MongoDB python extensions do not run on Solaris. - if sys.platform.startswith("sunos"): - source_mongo_lock = "" - # SERVER-28234 - GDB frame information not available on Solaris for a templatized - # function - mongodb_dump_locks = "" - - # SERVER-28373 - GDB thread-local variables not available on Solaris - mongodb_show_locks = "" - mongodb_waitsfor_graph = "" - mongodb_javascript_stack = "" - mongod_dump_sessions = "" - - if not logger.mongo_process_filename: - raw_stacks_commands = [] - else: - base, ext = os.path.splitext(logger.mongo_process_filename) - raw_stacks_filename = base + '_raw_stacks' + ext - raw_stacks_commands = [ - 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename, - # This sends output to log file rather than stdout until we turn logging off. - 'set logging redirect on', - 'set logging file ' + raw_stacks_filename, - 'set logging on', - 'thread apply all bt', - 'set logging off', - ] - - cmds = [ - "set interactive-mode off", - "set print thread-events off", # Suppress GDB messages of threads starting/finishing. - "file %s" % process_name, # Solaris must load the process to read the symbols. - "attach %d" % pid, - "info sharedlibrary", - "info threads", # Dump a simple list of commands to get the thread name - "set python print-stack full", - ] + raw_stacks_commands + [ - source_mongo, - source_mongo_printers, - source_mongo_lock, - mongodb_uniqstack, - # Lock the scheduler, before running commands, which execute code in the attached process. - "set scheduler-locking on", - dump_command, - mongodb_dump_locks, - mongodb_show_locks, - mongodb_waitsfor_graph, - mongodb_javascript_stack, - mongod_dump_sessions, - "set confirm off", - "quit", - ] - - call([dbg, "--quiet", "--nx"] + list( - itertools.chain.from_iterable([['-ex', b] for b in cmds])), logger) - - root_logger.info("Done analyzing %s process with PID %d", process_name, pid) - - @staticmethod - def get_dump_ext(): - """Return the dump file extension.""" - return "core" - - @staticmethod - def _find_gcore(): - """Find the installed gcore.""" - dbg = "/usr/bin/gcore" - if os.path.exists(dbg): - return dbg - - return None - - -class LinuxProcessList(object): - """LinuxProcessList class.""" - - @staticmethod - def __find_ps(): - """Find ps.""" - return find_program('ps', ['/bin', '/usr/bin']) - - def dump_processes(self, logger): - """Get list of [Pid, Process Name].""" - ps = self.__find_ps() - - logger.info("Getting list of processes using %s", ps) - - call([ps, "--version"], logger) - - ret = callo([ps, "-eo", "pid,args"], logger) - - buff = io.StringIO(ret) - csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True) - - return [[int(row[0]), os.path.split(row[1])[1]] for row in csv_reader if row[0] != "PID"] - - -class SolarisProcessList(object): - """SolarisProcessList class.""" - - @staticmethod - def __find_ps(): - """Find ps.""" - return find_program('ps', ['/bin', '/usr/bin']) - - def dump_processes(self, logger): - """Get list of [Pid, Process Name].""" - ps = self.__find_ps() - - logger.info("Getting list of processes using %s", ps) - - ret = callo([ps, "-eo", "pid,args"], logger) - - buff = io.StringIO(ret) - csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True) - - return [[int(row[0]), os.path.split(row[1])[1]] for row in csv_reader if row[0] != "PID"] - - -# jstack is a JDK utility -class JstackDumper(object): - """JstackDumper class.""" - - @staticmethod - def __find_debugger(debugger): - """Find the installed jstack debugger.""" - return find_program(debugger, ['/usr/bin']) - - def dump_info(self, root_logger, logger, pid, process_name): - """Dump java thread stack traces to the console.""" - debugger = "jstack" - jstack = self.__find_debugger(debugger) - - if jstack is None: - logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid) - return - - root_logger.info("Debugger %s, analyzing %s process with PID %d", jstack, process_name, pid) - - call([jstack, "-l", str(pid)], logger) - - root_logger.info("Done analyzing %s process with PID %d", process_name, pid) - - -# jstack is a JDK utility -class JstackWindowsDumper(object): - """JstackWindowsDumper class.""" - - @staticmethod - def dump_info(root_logger, pid): - """Dump java thread stack traces to the logger.""" - - root_logger.warning("Debugger jstack not supported, skipping dumping of %d", pid) - - -class DebugExtractor(object): - """Extracts `mongo-debugsymbols.tgz`.""" - - @staticmethod - def extract_debug_symbols(root_logger): - """ - Extract debug symbols. Idempotent. - - :param root_logger: logger to use - :return: None - """ - path = os.path.join(os.getcwd(), 'mongo-debugsymbols.tgz') - root_logger.debug('Starting: Extract debug-symbols from %s.', path) - if not os.path.exists(path): - root_logger.info('Debug-symbols archive-file does not exist. ' - 'Hang-Analyzer may not complete successfully, ' - 'or debug-symbols may already be extracted.') - return - try: - DebugExtractor._exxtract_tar(path, root_logger) - root_logger.debug('Finished: Extract debug-symbols from %s.', path) - # We never want this to cause the whole task to fail. - # The rest of hang_analyzer.py will continue to work without the - # symbols it just won't be quite as helpful. - # pylint: disable=broad-except - except Exception as exception: - root_logger.warning('Error when extracting %s: %s', path, exception) - - @staticmethod - def _exxtract_tar(path, root_logger): - import shutil - # The file name is always .tgz but it's "secretly" a zip file on Windows :( - compressed_format = 'zip' if _IS_WINDOWS else 'gztar' - shutil.unpack_archive(path, format=compressed_format) - for (src, dest) in DebugExtractor._extracted_files_to_copy(): - if os.path.exists(dest): - root_logger.debug('Debug symbol %s already exists, not copying from %s.', dest, src) - continue - shutil.copy(src, dest) - root_logger.debug('Copied debug symbol %s.', dest) - - @staticmethod - def _extracted_files_to_copy(): - out = [] - for ext in ['debug', 'dSYM', 'pdb']: - for file in ['mongo', 'mongod', 'mongos']: - haystack = os.path.join('dist-test', 'bin', '{file}.{ext}'.format( - file=file, ext=ext)) - for needle in glob.glob(haystack): - out.append((needle, os.path.join(os.getcwd(), os.path.basename(needle)))) - return out - - -def get_hang_analyzers(): - """Return hang analyzers.""" - - dbg = None - jstack = None - ps = None - if sys.platform.startswith("linux"): - dbg = GDBDumper() - jstack = JstackDumper() - ps = LinuxProcessList() - elif sys.platform.startswith("sunos"): - dbg = GDBDumper() - jstack = JstackDumper() - ps = SolarisProcessList() - elif _IS_WINDOWS or sys.platform == "cygwin": - dbg = WindowsDumper() - jstack = JstackWindowsDumper() - ps = WindowsProcessList() - elif sys.platform == "darwin": - dbg = LLDBDumper() - jstack = JstackDumper() - ps = DarwinProcessList() - - return [ps, dbg, jstack] - - -def check_dump_quota(quota, ext): - """Check if sum of the files with ext is within the specified quota in megabytes.""" - - files = glob.glob("*." + ext) - - size_sum = 0 - for file_name in files: - size_sum += os.path.getsize(file_name) - - return size_sum <= quota - - -def signal_event_object(logger, pid): - """Signal the Windows event object.""" - - # Use unique event_name created. - event_name = "Global\\Mongo_Python_" + str(pid) - - try: - desired_access = win32event.EVENT_MODIFY_STATE - inherit_handle = False - task_timeout_handle = win32event.OpenEvent(desired_access, inherit_handle, event_name) - except win32event.error as err: - logger.info("Exception from win32event.OpenEvent with error: %s", err) - return - - try: - win32event.SetEvent(task_timeout_handle) - except win32event.error as err: - logger.info("Exception from win32event.SetEvent with error: %s", err) - finally: - win32api.CloseHandle(task_timeout_handle) - - logger.info("Waiting for process to report") - time.sleep(5) - - -def signal_process(logger, pid, signalnum): - """Signal process with signal, N/A on Windows.""" - try: - os.kill(pid, signalnum) - - logger.info("Waiting for process to report") - time.sleep(5) - except OSError as err: - logger.error("Hit OS error trying to signal process: %s", err) - - except AttributeError: - logger.error("Cannot send signal to a process on Windows") - - -def pname_match(match_type, pname, interesting_processes): - """Return True if the pname matches in interesting_processes.""" - pname = os.path.splitext(pname)[0] - for ip in interesting_processes: - if match_type == 'exact' and pname == ip or match_type == 'contains' and ip in pname: - return True - return False - - -# Basic procedure -# -# 1. Get a list of interesting processes -# 2. Dump useful information or take dumps -def main(): # pylint: disable=too-many-branches,too-many-locals,too-many-statements - """Execute Main program.""" - root_logger = logging.Logger("hang_analyzer", level=logging.DEBUG) - - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(logging.Formatter(fmt="%(message)s")) - root_logger.addHandler(handler) - - root_logger.info("Python Version: %s", sys.version) - root_logger.info("OS: %s", platform.platform()) - - try: - if _IS_WINDOWS or sys.platform == "cygwin": - distro = platform.win32_ver() - root_logger.info("Windows Distribution: %s", distro) - else: - distro = platform.linux_distribution() - root_logger.info("Linux Distribution: %s", distro) - - except AttributeError: - root_logger.warning("Cannot determine Linux distro since Python is too old") - - try: - uid = os.getuid() - root_logger.info("Current User: %s", uid) - current_login = os.getlogin() - root_logger.info("Current Login: %s", current_login) - except OSError: - root_logger.warning("Cannot determine Unix Current Login") - except AttributeError: - root_logger.warning("Cannot determine Unix Current Login, not supported on Windows") - - DebugExtractor.extract_debug_symbols(root_logger) - - interesting_processes = ["mongo", "mongod", "mongos", "_test", "dbtest", "python", "java"] - go_processes = [] - process_ids = [] - - parser = OptionParser(description=__doc__) - parser.add_option( - '-m', '--process-match', dest='process_match', choices=['contains', 'exact'], - default='contains', help="Type of match for process names (-p & -g), specify 'contains', or" - " 'exact'. Note that the process name match performs the following" - " conversions: change all process names to lowecase, strip off the file" - " extension, like '.exe' on Windows. Default is 'contains'.") - parser.add_option('-p', '--process-names', dest='process_names', - help='Comma separated list of process names to analyze') - parser.add_option('-g', '--go-process-names', dest='go_process_names', - help='Comma separated list of go process names to analyze') - parser.add_option( - '-d', '--process-ids', dest='process_ids', default=None, - help='Comma separated list of process ids (PID) to analyze, overrides -p &' - ' -g') - parser.add_option('-c', '--dump-core', dest='dump_core', action="store_true", default=False, - help='Dump core file for each analyzed process') - parser.add_option('-s', '--max-core-dumps-size', dest='max_core_dumps_size', default=10000, - help='Maximum total size of core dumps to keep in megabytes') - parser.add_option( - '-o', '--debugger-output', dest='debugger_output', action="append", - choices=['file', 'stdout'], default=None, - help="If 'stdout', then the debugger's output is written to the Python" - " process's stdout. If 'file', then the debugger's output is written" - " to a file named debugger_<process>_<pid>.log for each process it" - " attaches to. This option can be specified multiple times on the" - " command line to have the debugger's output written to multiple" - " locations. By default, the debugger's output is written only to the" - " Python process's stdout.") - - (options, _) = parser.parse_args() - - if options.debugger_output is None: - options.debugger_output = ['stdout'] - - if options.process_ids is not None: - # process_ids is an int list of PIDs - process_ids = [int(pid) for pid in options.process_ids.split(',')] - - if options.process_names is not None: - interesting_processes = options.process_names.split(',') - - if options.go_process_names is not None: - go_processes = options.go_process_names.split(',') - interesting_processes += go_processes - - [ps, dbg, jstack] = get_hang_analyzers() - - if ps is None or (dbg is None and jstack is None): - root_logger.warning("hang_analyzer.py: Unsupported platform: %s", sys.platform) - exit(1) - - all_processes = ps.dump_processes(root_logger) - - # Canonicalize the process names to lowercase to handle cases where the name of the Python - # process is /System/Library/.../Python on OS X and -p python is specified to hang_analyzer.py. - all_processes = [(pid, process_name.lower()) for (pid, process_name) in all_processes] - - # Find all running interesting processes: - # If a list of process_ids is supplied, match on that. - # Otherwise, do a substring match on interesting_processes. - if process_ids: - processes = [(pid, pname) for (pid, pname) in all_processes - if pid in process_ids and pid != os.getpid()] - - running_pids = {pid for (pid, pname) in all_processes} - missing_pids = set(process_ids) - running_pids - if missing_pids: - root_logger.warning("The following requested process ids are not running %s", - list(missing_pids)) - else: - processes = [(pid, pname) for (pid, pname) in all_processes - if pname_match(options.process_match, pname, interesting_processes) - and pid != os.getpid()] - - root_logger.info("Found %d interesting processes %s", len(processes), processes) - - max_dump_size_bytes = int(options.max_core_dumps_size) * 1024 * 1024 - - # Dump python processes by signalling them. The resmoke.py process will generate - # the report.json, when signalled, so we do this before attaching to other processes. - for (pid, process_name) in [(p, pn) for (p, pn) in processes if pn.startswith("python")]: - # On Windows, we set up an event object to wait on a signal. For Cygwin, we register - # a signal handler to wait for the signal since it supports POSIX signals. - if _IS_WINDOWS: - root_logger.info("Calling SetEvent to signal python process %s with PID %d", - process_name, pid) - signal_event_object(root_logger, pid) - else: - root_logger.info("Sending signal SIGUSR1 to python process %s with PID %d", - process_name, pid) - signal_process(root_logger, pid, signal.SIGUSR1) - - trapped_exceptions = [] - - # Dump all processes, except python & java. - for (pid, - process_name) in [(p, pn) for (p, pn) in processes if not re.match("^(java|python)", pn)]: - process_logger = get_process_logger(options.debugger_output, pid, process_name) - try: - dbg.dump_info( - root_logger, process_logger, pid, process_name, options.dump_core - and check_dump_quota(max_dump_size_bytes, dbg.get_dump_ext())) - except Exception as err: # pylint: disable=broad-except - root_logger.info("Error encountered when invoking debugger %s", err) - trapped_exceptions.append(traceback.format_exc()) - - # Dump java processes using jstack. - for (pid, process_name) in [(p, pn) for (p, pn) in processes if pn.startswith("java")]: - process_logger = get_process_logger(options.debugger_output, pid, process_name) - try: - jstack.dump_info(root_logger, pid) - except Exception as err: # pylint: disable=broad-except - root_logger.info("Error encountered when invoking debugger %s", err) - trapped_exceptions.append(traceback.format_exc()) - - # Signal go processes to ensure they print out stack traces, and die on POSIX OSes. - # On Windows, this will simply kill the process since python emulates SIGABRT as - # TerminateProcess. - # Note: The stacktrace output may be captured elsewhere (i.e. resmoke). - for (pid, process_name) in [(p, pn) for (p, pn) in processes if pn in go_processes]: - root_logger.info("Sending signal SIGABRT to go process %s with PID %d", process_name, pid) - signal_process(root_logger, pid, signal.SIGABRT) - - root_logger.info("Done analyzing all processes for hangs") - - for exception in trapped_exceptions: - root_logger.info(exception) - if trapped_exceptions: - sys.exit(1) - +"""Stub file pointing users to the new invocation.""" if __name__ == "__main__": - main() + print("Hello! It seems you've executed 'buildscripts/hang_analyzer.py'. We have recently\n" + "repackaged the hang analyzer as a subcommand of resmoke. It can now be invoked as\n" + "'./buildscripts/resmoke.py hang-analyzer' with all of the same arguments as before.") diff --git a/buildscripts/resmokelib/commands/__init__.py b/buildscripts/resmokelib/commands/__init__.py index 1a3235631a7..641e3625cfd 100644 --- a/buildscripts/resmokelib/commands/__init__.py +++ b/buildscripts/resmokelib/commands/__init__.py @@ -2,3 +2,4 @@ from . import interface from . import run +from . import hang_analyzer diff --git a/buildscripts/resmokelib/commands/hang_analyzer.py b/buildscripts/resmokelib/commands/hang_analyzer.py new file mode 100755 index 00000000000..c4d82f65da4 --- /dev/null +++ b/buildscripts/resmokelib/commands/hang_analyzer.py @@ -0,0 +1,187 @@ +"""Hang Analyzer module. + +A prototype hang analyzer for Evergreen integration to help investigate test timeouts + +1. Script supports taking dumps, and/or dumping a summary of useful information about a process +2. Script will iterate through a list of interesting processes, + and run the tools from step 1. The list of processes can be provided as an option. +3. Java processes will be dumped using jstack, if available. + +Supports Linux, MacOS X, and Windows. +""" +import re +import os +import sys +import glob +import signal +import logging +import platform +import traceback + +from buildscripts.resmokelib.commands import interface + +from buildscripts.resmokelib.hang_analyzer import extractor +from buildscripts.resmokelib.hang_analyzer import dumper +from buildscripts.resmokelib.hang_analyzer import process_list +from buildscripts.resmokelib.hang_analyzer.process import signal_process, signal_python + + +class HangAnalyzer(interface.Subcommand): + """Main class for the hang analyzer subcommand.""" + + def __init__(self, options): + """ + Configure processe lists based on options. + + :param options: Options as parsed by parser.py + """ + self.options = options + self.root_logger = None + self.interesting_processes = [ + "mongo", "mongod", "mongos", "_test", "dbtest", "python", "java" + ] + self.go_processes = [] + self.process_ids = [] + + self._configure_processes() + + def execute(self): # pylint: disable=too-many-branches,too-many-locals,too-many-statements + """ + Execute hang analysis. + + 1. Get a list of interesting processes + 2. Dump useful information or take core dumps + """ + self._setup_logging() + self._log_system_info() + + extractor.extract_debug_symbols(self.root_logger) + dumpers = dumper.get_dumpers() + + processes = process_list.get_processes(self.process_ids, self.interesting_processes, + self.options.process_match, self.root_logger) + + max_dump_size_bytes = int(self.options.max_core_dumps_size) * 1024 * 1024 + + # Dump python processes by signalling them. The resmoke.py process will generate + # the report.json, when signalled, so we do this before attaching to other processes. + for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("python")]: + signal_python(self.root_logger, pinfo) + + trapped_exceptions = [] + + # Dump all processes, except python & java. + for pinfo in [pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)]: + process_logger = self._get_process_logger(pinfo) + try: + dumpers.dbg.dump_info( + self.root_logger, process_logger, pinfo, self.options.dump_core + and _check_dump_quota(max_dump_size_bytes, dumpers.dbg.get_dump_ext())) + except Exception as err: # pylint: disable=broad-except + self.root_logger.info("Error encountered when invoking debugger %s", err) + trapped_exceptions.append(traceback.format_exc()) + + # Dump java processes using jstack. + for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("java")]: + process_logger = self._get_process_logger(pinfo) + try: + dumpers.jstack.dump_info(self.root_logger, pinfo.pid) + except Exception as err: # pylint: disable=broad-except + self.root_logger.info("Error encountered when invoking debugger %s", err) + trapped_exceptions.append(traceback.format_exc()) + + # Signal go processes to ensure they print out stack traces, and die on POSIX OSes. + # On Windows, this will simply kill the process since python emulates SIGABRT as + # TerminateProcess. + # Note: The stacktrace output may be captured elsewhere (i.e. resmoke). + for pinfo in [pinfo for pinfo in processes if pinfo.name in self.go_processes]: + self.root_logger.info("Sending signal SIGABRT to go process %s with PID %d", pinfo.name, + pinfo.pid) + signal_process(self.root_logger, pinfo.pid, signal.SIGABRT) + + self.root_logger.info("Done analyzing all processes for hangs") + + for exception in trapped_exceptions: + self.root_logger.info(exception) + if trapped_exceptions: + raise RuntimeError( + "Exceptions were thrown while dumping. There may still be some valid dumps.") + + def _configure_processes(self): + if self.options.debugger_output is None: + self.options.debugger_output = ['stdout'] + + if self.options.process_ids is not None: + # self.process_ids is an int list of PIDs + self.process_ids = [int(pid) for pid in self.options.process_ids.split(',')] + + if self.options.process_names is not None: + self.interesting_processes = self.options.process_names.split(',') + + if self.options.go_process_names is not None: + self.go_processes = self.options.go_process_names.split(',') + self.interesting_processes += self.go_processes + + def _setup_logging(self): + self.root_logger = logging.Logger("hang_analyzer", level=logging.DEBUG) + + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(logging.Formatter(fmt="%(message)s")) + self.root_logger.addHandler(handler) + + self.root_logger.info("Python Version: %s", sys.version) + self.root_logger.info("OS: %s", platform.platform()) + + def _log_system_info(self): + try: + if sys.platform == "win32" or sys.platform == "cygwin": + distro = platform.win32_ver() + self.root_logger.info("Windows Distribution: %s", distro) + else: + distro = platform.linux_distribution() + self.root_logger.info("Linux Distribution: %s", distro) + + except AttributeError: + self.root_logger.warning("Cannot determine Linux distro since Python is too old") + + try: + uid = os.getuid() + self.root_logger.info("Current User: %s", uid) + current_login = os.getlogin() + self.root_logger.info("Current Login: %s", current_login) + except OSError: + self.root_logger.warning("Cannot determine Unix Current Login") + except AttributeError: + self.root_logger.warning( + "Cannot determine Unix Current Login, not supported on Windows") + + def _get_process_logger(self, pinfo): + """Return the process logger from options specified.""" + process_logger = logging.Logger("process", level=logging.DEBUG) + process_logger.mongo_process_filename = None + + if 'stdout' in self.options.debugger_output: + s_handler = logging.StreamHandler(sys.stdout) + s_handler.setFormatter(logging.Formatter(fmt="%(message)s")) + process_logger.addHandler(s_handler) + + if 'file' in self.options.debugger_output: + filename = "debugger_%s_%d.log" % (os.path.splitext(pinfo.name)[0], pinfo.pid) + process_logger.mongo_process_filename = filename + f_handler = logging.FileHandler(filename=filename, mode="w") + f_handler.setFormatter(logging.Formatter(fmt="%(message)s")) + process_logger.addHandler(f_handler) + + return process_logger + + +def _check_dump_quota(quota, ext): + """Check if sum of the files with ext is within the specified quota in megabytes.""" + + files = glob.glob("*." + ext) + + size_sum = 0 + for file_name in files: + size_sum += os.path.getsize(file_name) + + return size_sum <= quota diff --git a/buildscripts/resmokelib/hang_analyzer/__init__.py b/buildscripts/resmokelib/hang_analyzer/__init__.py new file mode 100644 index 00000000000..2c36bf4f130 --- /dev/null +++ b/buildscripts/resmokelib/hang_analyzer/__init__.py @@ -0,0 +1,6 @@ +"""Utilities for the hang analyzer subcommand.""" + +from . import process_list +from . import dumper +from . import process +from . import extractor diff --git a/buildscripts/resmokelib/hang_analyzer/dumper.py b/buildscripts/resmokelib/hang_analyzer/dumper.py new file mode 100644 index 00000000000..0a88ef455cc --- /dev/null +++ b/buildscripts/resmokelib/hang_analyzer/dumper.py @@ -0,0 +1,350 @@ +"""Tools to dump debug info for each OS.""" + +import os +import sys +import tempfile +import itertools +from distutils import spawn # pylint: disable=no-name-in-module +from collections import namedtuple + +from buildscripts.resmokelib.hang_analyzer.process import call, callo, find_program + +Dumpers = namedtuple('Dumpers', ['dbg', 'jstack']) + + +def get_dumpers(): + """Return OS-appropriate dumpers.""" + + dbg = None + jstack = None + if sys.platform.startswith("linux"): + dbg = GDBDumper() + jstack = JstackDumper() + elif sys.platform == "win32" or sys.platform == "cygwin": + dbg = WindowsDumper() + jstack = JstackWindowsDumper() + elif sys.platform == "darwin": + dbg = LLDBDumper() + jstack = JstackDumper() + + return Dumpers(dbg=dbg, jstack=jstack) + + +class Dumper(object): + """Abstract base class for OS-specific dumpers.""" + + def dump_info( # pylint: disable=too-many-arguments,too-many-locals + self, root_logger, logger, pinfo, take_dump): + """ + Perform dump for a process. + + :param root_logger: Top-level logger + :param logger: Logger to output dump info to + :param pinfo: A Pinfo describing the process + :param take_dump: Whether to take a core dump + """ + raise NotImplementedError("dump_info must be implemented in OS-specific subclasses") + + @staticmethod + def get_dump_ext(): + """Return the dump file extension.""" + raise NotImplementedError("get_dump_ext must be implemented in OS-specific subclasses") + + +class WindowsDumper(Dumper): + """WindowsDumper class.""" + + @staticmethod + def __find_debugger(logger, debugger): + """Find the installed debugger.""" + # We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64 + cdb = spawn.find_executable(debugger) + if cdb is not None: + return cdb + from win32com.shell import shell, shellcon + + # Cygwin via sshd does not expose the normal environment variables + # Use the shell api to get the variable instead + root_dir = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0) + + # Construct the debugger search paths in most-recent order + debugger_paths = [os.path.join(root_dir, "Windows Kits", "10", "Debuggers", "x64")] + for idx in reversed(range(0, 2)): + debugger_paths.append( + os.path.join(root_dir, "Windows Kits", "8." + str(idx), "Debuggers", "x64")) + + for dbg_path in debugger_paths: + logger.info("Checking for debugger in %s", dbg_path) + if os.path.exists(dbg_path): + return os.path.join(dbg_path, debugger) + + return None + + def dump_info( # pylint: disable=too-many-arguments + self, root_logger, logger, pinfo, take_dump): + """Dump useful information to the console.""" + debugger = "cdb.exe" + dbg = self.__find_debugger(root_logger, debugger) + + if dbg is None: + root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger, + pinfo.pid) + return + + root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, pinfo.name, + pinfo.pid) + + dump_command = "" + if take_dump: + # Dump to file, dump_<process name>.<pid>.mdmp + dump_file = "dump_%s.%d.%s" % (os.path.splitext(pinfo.name)[0], pinfo.pid, + self.get_dump_ext()) + dump_command = ".dump /ma %s" % dump_file + root_logger.info("Dumping core to %s", dump_file) + + cmds = [ + ".symfix", # Fixup symbol path + ".symopt +0x10", # Enable line loading (off by default in CDB, on by default in WinDBG) + ".reload", # Reload symbols + "!peb", # Dump current exe, & environment variables + "lm", # Dump loaded modules + dump_command, + "!uniqstack -pn", # Dump All unique Threads with function arguments + "!cs -l", # Dump all locked critical sections + ".detach", # Detach + "q" # Quit + ] + + call([dbg, '-c', ";".join(cmds), '-p', str(pinfo.pid)], logger) + + root_logger.info("Done analyzing %s process with PID %d", pinfo.name, pinfo.pid) + + @staticmethod + def get_dump_ext(): + """Return the dump file extension.""" + return "mdmp" + + +# LLDB dumper is for MacOS X +class LLDBDumper(Dumper): + """LLDBDumper class.""" + + @staticmethod + def __find_debugger(debugger): + """Find the installed debugger.""" + return find_program(debugger, ['/usr/bin']) + + def dump_info( # pylint: disable=too-many-arguments,too-many-locals + self, root_logger, logger, pinfo, take_dump): + """Dump info.""" + debugger = "lldb" + dbg = self.__find_debugger(debugger) + + if dbg is None: + root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger, + pinfo.pid) + return + + root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, pinfo.name, + pinfo.pid) + + lldb_version = callo([dbg, "--version"], logger) + + logger.info(lldb_version) + + # Do we have the XCode or LLVM version of lldb? + # Old versions of lldb do not work well when taking commands via a file + # XCode (7.2): lldb-340.4.119 + # LLVM - lldb version 3.7.0 ( revision ) + + if 'version' not in lldb_version: + # We have XCode's lldb + lldb_version = lldb_version[lldb_version.index("lldb-"):] + lldb_version = lldb_version.replace('lldb-', '') + lldb_major_version = int(lldb_version[:lldb_version.index('.')]) + if lldb_major_version < 340: + logger.warning("Debugger lldb is too old, please upgrade to XCode 7.2") + return + + dump_command = "" + if take_dump: + # Dump to file, dump_<process name>.<pid>.core + dump_file = "dump_%s.%d.%s" % (pinfo.name, pinfo.pid, self.get_dump_ext()) + dump_command = "process save-core %s" % dump_file + root_logger.info("Dumping core to %s", dump_file) + + cmds = [ + "attach -p %d" % pinfo.pid, + "target modules list", + "thread backtrace all", + dump_command, + "settings set interpreter.prompt-on-quit false", + "quit", + ] + + tf = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') + + for cmd in cmds: + tf.write(cmd + "\n") + + tf.flush() + + # Works on in MacOS 10.9 & later + #call([dbg] + list( itertools.chain.from_iterable([['-o', b] for b in cmds])), logger) + call(['cat', tf.name], logger) + call([dbg, '--source', tf.name], logger) + + root_logger.info("Done analyzing %s process with PID %d", pinfo.name, pinfo.pid) + + @staticmethod + def get_dump_ext(): + """Return the dump file extension.""" + return "core" + + +# GDB dumper is for Linux +class GDBDumper(Dumper): + """GDBDumper class.""" + + @staticmethod + def __find_debugger(debugger): + """Find the installed debugger.""" + return find_program(debugger, ['/opt/mongodbtoolchain/gdb/bin', '/usr/bin']) + + def dump_info( # pylint: disable=too-many-arguments,too-many-locals + self, root_logger, logger, pinfo, take_dump): + """Dump info.""" + debugger = "gdb" + dbg = self.__find_debugger(debugger) + + if dbg is None: + logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pinfo.pid) + return + + root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, pinfo.name, + pinfo.pid) + + dump_command = "" + if take_dump: + # Dump to file, dump_<process name>.<pid>.core + dump_file = "dump_%s.%d.%s" % (pinfo.name, pinfo.pid, self.get_dump_ext()) + dump_command = "gcore %s" % dump_file + root_logger.info("Dumping core to %s", dump_file) + + call([dbg, "--version"], logger) + + script_dir = "buildscripts" + root_logger.info("dir %s", script_dir) + gdb_dir = os.path.join(script_dir, "gdb") + mongo_script = os.path.join(gdb_dir, "mongo.py") + mongo_printers_script = os.path.join(gdb_dir, "mongo_printers.py") + mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py") + + source_mongo = "source %s" % mongo_script + source_mongo_printers = "source %s" % mongo_printers_script + source_mongo_lock = "source %s" % mongo_lock_script + mongodb_dump_locks = "mongodb-dump-locks" + mongodb_show_locks = "mongodb-show-locks" + mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active" + mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \ + (pinfo.name, pinfo.pid) + mongodb_javascript_stack = "mongodb-javascript-stack" + mongod_dump_sessions = "mongod-dump-sessions" + mongodb_dump_mutexes = "mongodb-dump-mutexes" + mongodb_dump_recovery_units = "mongodb-dump-recovery-units" + + if not logger.mongo_process_filename: + raw_stacks_commands = [] + else: + base, ext = os.path.splitext(logger.mongo_process_filename) + raw_stacks_filename = base + '_raw_stacks' + ext + raw_stacks_commands = [ + 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename, + # This sends output to log file rather than stdout until we turn logging off. + 'set logging redirect on', + 'set logging file ' + raw_stacks_filename, + 'set logging on', + 'thread apply all bt', + 'set logging off', + ] + + cmds = [ + "set interactive-mode off", + "set print thread-events off", # Suppress GDB messages of threads starting/finishing. + "attach %d" % pinfo.pid, + "info sharedlibrary", + "info threads", # Dump a simple list of commands to get the thread name + "set python print-stack full", + ] + raw_stacks_commands + [ + source_mongo, + source_mongo_printers, + source_mongo_lock, + mongodb_uniqstack, + # Lock the scheduler, before running commands, which execute code in the attached process. + "set scheduler-locking on", + dump_command, + mongodb_dump_locks, + mongodb_show_locks, + mongodb_waitsfor_graph, + mongodb_javascript_stack, + mongod_dump_sessions, + mongodb_dump_mutexes, + mongodb_dump_recovery_units, + "set confirm off", + "quit", + ] + + call([dbg, "--quiet", "--nx"] + list( + itertools.chain.from_iterable([['-ex', b] for b in cmds])), logger) + + root_logger.info("Done analyzing %s process with PID %d", pinfo.name, pinfo.pid) + + @staticmethod + def get_dump_ext(): + """Return the dump file extension.""" + return "core" + + @staticmethod + def _find_gcore(): + """Find the installed gcore.""" + dbg = "/usr/bin/gcore" + if os.path.exists(dbg): + return dbg + + return None + + +# jstack is a JDK utility +class JstackDumper(object): + """JstackDumper class.""" + + @staticmethod + def __find_debugger(debugger): + """Find the installed jstack debugger.""" + return find_program(debugger, ['/usr/bin']) + + def dump_info(self, root_logger, logger, pid, process_name): + """Dump java thread stack traces to the console.""" + debugger = "jstack" + jstack = self.__find_debugger(debugger) + + if jstack is None: + logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid) + return + + root_logger.info("Debugger %s, analyzing %s process with PID %d", jstack, process_name, pid) + + call([jstack, "-l", str(pid)], logger) + + root_logger.info("Done analyzing %s process with PID %d", process_name, pid) + + +# jstack is a JDK utility +class JstackWindowsDumper(object): + """JstackWindowsDumper class.""" + + @staticmethod + def dump_info(root_logger, pid): + """Dump java thread stack traces to the logger.""" + + root_logger.warning("Debugger jstack not supported, skipping dumping of %d", pid) diff --git a/buildscripts/resmokelib/hang_analyzer/extractor.py b/buildscripts/resmokelib/hang_analyzer/extractor.py new file mode 100644 index 00000000000..7ea38d5ad3a --- /dev/null +++ b/buildscripts/resmokelib/hang_analyzer/extractor.py @@ -0,0 +1,53 @@ +"""Extracts `mongo-debugsymbols.tgz`.""" + +import os +import sys +import glob + + +def extract_debug_symbols(root_logger): + """ + Extract debug symbols. Idempotent. + + :param root_logger: logger to use + :return: None + """ + path = os.path.join(os.getcwd(), 'mongo-debugsymbols.tgz') + root_logger.debug('Starting: Extract debug-symbols from %s.', path) + if not os.path.exists(path): + root_logger.info('Debug-symbols archive-file does not exist. ' + 'Hang-Analyzer may not complete successfully, ' + 'or debug-symbols may already be extracted.') + return + try: + _extract_tar(path, root_logger) + root_logger.debug('Finished: Extract debug-symbols from %s.', path) + # We never want this to cause the whole task to fail. + # The rest of the hang analyzer will continue to work without the + # symbols it just won't be quite as helpful. + # pylint: disable=broad-except + except Exception as exception: + root_logger.warning('Error when extracting %s: %s', path, exception) + + +def _extract_tar(path, root_logger): + import shutil + # The file name is always .tgz but it's "secretly" a zip file on Windows :( + compressed_format = 'zip' if sys.platform == "win32" else 'gztar' + shutil.unpack_archive(path, format=compressed_format) + for (src, dest) in _extracted_files_to_copy(): + if os.path.exists(dest): + root_logger.debug('Debug symbol %s already exists, not copying from %s.', dest, src) + continue + shutil.copy(src, dest) + root_logger.debug('Copied debug symbol %s.', dest) + + +def _extracted_files_to_copy(): + out = [] + for ext in ['debug', 'dSYM', 'pdb']: + for file in ['mongo', 'mongod', 'mongos']: + haystack = os.path.join('dist-test', 'bin', '{file}.{ext}'.format(file=file, ext=ext)) + for needle in glob.glob(haystack): + out.append((needle, os.path.join(os.getcwd(), os.path.basename(needle)))) + return out diff --git a/buildscripts/resmokelib/hang_analyzer/process.py b/buildscripts/resmokelib/hang_analyzer/process.py new file mode 100644 index 00000000000..0869b5ffcd7 --- /dev/null +++ b/buildscripts/resmokelib/hang_analyzer/process.py @@ -0,0 +1,104 @@ +"""Miscellaneous utility functions used by the hang analyzer.""" + +import os +import sys +import time +import signal +import logging +import subprocess +from distutils import spawn # pylint: disable=no-name-in-module + +from buildscripts.resmokelib import core + +_IS_WINDOWS = (sys.platform == "win32") + +if _IS_WINDOWS: + import win32event + import win32api + + +def call(args, logger): + """Call subprocess on args list.""" + logger.info(str(args)) + + # Use a common pipe for stdout & stderr for logging. + process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + logger_pipe = core.pipe.LoggerPipe(logger, logging.INFO, process.stdout) + logger_pipe.wait_until_started() + + ret = process.wait() + logger_pipe.wait_until_finished() + + if ret != 0: + logger.error("Bad exit code %d", ret) + raise Exception("Bad exit code %d from %s" % (ret, " ".join(args))) + + +def find_program(prog, paths): + """Find the specified program in env PATH, or tries a set of paths.""" + for loc in paths: + full_prog = os.path.join(loc, prog) + if os.path.exists(full_prog): + return full_prog + + return spawn.find_executable(prog) + + +def callo(args, logger): + """Call subprocess on args string.""" + logger.info("%s", str(args)) + return subprocess.check_output(args).decode('utf-8', 'replace') + + +def signal_python(logger, pinfo): + """Send appropriate dumping signal to python processes.""" + + # On Windows, we set up an event object to wait on a signal. For Cygwin, we register + # a signal handler to wait for the signal since it supports POSIX signals. + if _IS_WINDOWS: + logger.info("Calling SetEvent to signal python process %s with PID %d", pinfo.name, + pinfo.pid) + signal_event_object(logger, pinfo.pid) + else: + logger.info("Sending signal SIGUSR1 to python process %s with PID %d", pinfo.name, + pinfo.pid) + signal_process(logger, pinfo.pid, signal.SIGUSR1) + + +def signal_event_object(logger, pid): + """Signal the Windows event object.""" + + # Use unique event_name created. + event_name = "Global\\Mongo_Python_" + str(pid) + + try: + desired_access = win32event.EVENT_MODIFY_STATE + inherit_handle = False + task_timeout_handle = win32event.OpenEvent(desired_access, inherit_handle, event_name) + except win32event.error as err: + logger.info("Exception from win32event.OpenEvent with error: %s", err) + return + + try: + win32event.SetEvent(task_timeout_handle) + except win32event.error as err: + logger.info("Exception from win32event.SetEvent with error: %s", err) + finally: + win32api.CloseHandle(task_timeout_handle) + + logger.info("Waiting for process to report") + time.sleep(5) + + +def signal_process(logger, pid, signalnum): + """Signal process with signal, N/A on Windows.""" + try: + os.kill(pid, signalnum) + + logger.info("Waiting for process to report") + time.sleep(5) + except OSError as err: + logger.error("Hit OS error trying to signal process: %s", err) + + except AttributeError: + logger.error("Cannot send signal to a process on Windows") diff --git a/buildscripts/resmokelib/hang_analyzer/process_list.py b/buildscripts/resmokelib/hang_analyzer/process_list.py new file mode 100644 index 00000000000..3f60a54dd08 --- /dev/null +++ b/buildscripts/resmokelib/hang_analyzer/process_list.py @@ -0,0 +1,158 @@ +"""Functions to list processes in each OS and search for interesting processes.""" + +import os +import io +import sys +import csv +from collections import namedtuple + +from buildscripts.resmokelib.hang_analyzer.process import call, callo, find_program + +Pinfo = namedtuple('Pinfo', ['pid', 'name']) + + +def get_processes(process_ids, interesting_processes, process_match, logger): + """ + Find all running interesting processes. + + If a list of process_ids is supplied, match on that. + Otherwise, do a substring match on interesting_processes. + + :param process_ids: List of PIDs to match on. + :param interesting_processes: List of process names to match on. + :param process_match: String describing the process match to use. + :param logger: Where to log output. + + :return: A list Pinfo objects for matched processes. + """ + ps = _get_lister() + + all_processes = ps.dump_processes(logger) + + # Canonicalize the process names to lowercase to handle cases where the name of the Python + # process is /System/Library/.../Python on OS X and -p python is specified to the hang analyzer. + all_processes = [(pid, process_name.lower()) for (pid, process_name) in all_processes] + + if process_ids: + processes = [ + Pinfo(pid=pid, name=pname) for (pid, pname) in all_processes + if pid in process_ids and pid != os.getpid() + ] + + running_pids = {pid for (pid, pname) in all_processes} + missing_pids = set(process_ids) - running_pids + if missing_pids: + logger.warning("The following requested process ids are not running %s", + list(missing_pids)) + else: + processes = [ + Pinfo(pid=pid, name=pname) for (pid, pname) in all_processes + if _pname_match(process_match, pname, interesting_processes) and pid != os.getpid() + ] + + logger.info("Found %d interesting processes %s", len(processes), processes) + return processes + + +def _get_lister(): + """Return _ProcessList object for OS.""" + if sys.platform.startswith("linux"): + ps = _LinuxProcessList() + elif sys.platform == "win32" or sys.platform == "cygwin": + ps = _WindowsProcessList() + elif sys.platform == "darwin": + ps = _DarwinProcessList() + else: + raise OSError("Hang analyzer: Unsupported platform: {}".format(sys.platform)) + + return ps + + +class _ProcessList(object): + """Abstract base class for all process listers.""" + + def dump_processes(self, logger): + """ + Find all processes. + + :param logger: Where to log output. + :return: A list of process names. + """ + raise NotImplementedError("dump_process must be implemented in OS-specific subclasses") + + +class _WindowsProcessList(_ProcessList): + """_WindowsProcessList class.""" + + @staticmethod + def __find_ps(): + """Find tasklist.""" + return os.path.join(os.environ["WINDIR"], "system32", "tasklist.exe") + + def dump_processes(self, logger): + """Get list of [Pid, Process Name].""" + ps = self.__find_ps() + + logger.info("Getting list of processes using %s", ps) + + ret = callo([ps, "/FO", "CSV"], logger) + + buff = io.StringIO(ret) + csv_reader = csv.reader(buff) + + return [[int(row[1]), row[0]] for row in csv_reader if row[1] != "PID"] + + +class _DarwinProcessList(_ProcessList): + """_DarwinProcessList class.""" + + @staticmethod + def __find_ps(): + """Find ps.""" + return find_program('ps', ['/bin']) + + def dump_processes(self, logger): + """Get list of [Pid, Process Name].""" + ps = self.__find_ps() + + logger.info("Getting list of processes using %s", ps) + + ret = callo([ps, "-axco", "pid,comm"], logger) + + buff = io.StringIO(ret) + csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True) + + return [[int(row[0]), row[1]] for row in csv_reader if row[0] != "PID"] + + +class _LinuxProcessList(_ProcessList): + """_LinuxProcessList class.""" + + @staticmethod + def __find_ps(): + """Find ps.""" + return find_program('ps', ['/bin', '/usr/bin']) + + def dump_processes(self, logger): + """Get list of [Pid, Process Name].""" + ps = self.__find_ps() + + logger.info("Getting list of processes using %s", ps) + + call([ps, "--version"], logger) + + ret = callo([ps, "-eo", "pid,args"], logger) + + buff = io.StringIO(ret) + csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True) + + return [[int(row[0]), os.path.split(row[1])[1]] for row in csv_reader if row[0] != "PID"] + + +def _pname_match(match_type, pname, interesting_processes): + """Return True if the pname matches an interesting_processes.""" + pname = os.path.splitext(pname)[0] + for ip in interesting_processes: + if match_type == 'exact' and pname == ip or match_type == 'contains' and ip in pname: + return True + return False diff --git a/buildscripts/resmokelib/parser.py b/buildscripts/resmokelib/parser.py index ca2c56f17e9..58743545d94 100644 --- a/buildscripts/resmokelib/parser.py +++ b/buildscripts/resmokelib/parser.py @@ -22,6 +22,7 @@ def _make_parser(): _add_run(subparsers) _add_list_suites(subparsers) _add_find_suites(subparsers) + _add_hang_analyzer(subparsers) return parser @@ -443,6 +444,41 @@ def _add_find_suites(subparsers): " configurations.")) +def _add_hang_analyzer(subparsers): + """Create and add the parser for the hang analyzer subcommand.""" + + parser = subparsers.add_parser("hang-analyzer", help=commands.hang_analyzer.__doc__) + + parser.add_argument( + '-m', '--process-match', dest='process_match', choices=('contains', 'exact'), + default='contains', help="Type of match for process names (-p & -g), specify 'contains', or" + " 'exact'. Note that the process name match performs the following" + " conversions: change all process names to lowecase, strip off the file" + " extension, like '.exe' on Windows. Default is 'contains'.") + parser.add_argument('-p', '--process-names', dest='process_names', + help='Comma separated list of process names to analyze') + parser.add_argument('-g', '--go-process-names', dest='go_process_names', + help='Comma separated list of go process names to analyze') + parser.add_argument( + '-d', '--process-ids', dest='process_ids', default=None, + help='Comma separated list of process ids (PID) to analyze, overrides -p &' + ' -g') + parser.add_argument('-c', '--dump-core', dest='dump_core', action="store_true", default=False, + help='Dump core file for each analyzed process') + parser.add_argument('-s', '--max-core-dumps-size', dest='max_core_dumps_size', default=10000, + help='Maximum total size of core dumps to keep in megabytes') + parser.add_argument( + '-o', '--debugger-output', dest='debugger_output', action="append", choices=('file', + 'stdout'), + default=None, help="If 'stdout', then the debugger's output is written to the Python" + " process's stdout. If 'file', then the debugger's output is written" + " to a file named debugger_<process>_<pid>.log for each process it" + " attaches to. This option can be specified multiple times on the" + " command line to have the debugger's output written to multiple" + " locations. By default, the debugger's output is written only to the" + " Python process's stdout.") + + # def to_local_args(args=None): # pylint: disable=too-many-branches,too-many-locals # """ # Return a command line invocation for resmoke.py suitable for being run outside of Evergreen. @@ -547,25 +583,22 @@ def parse_command_line(sys_args, **kwargs): """Parse the command line arguments passed to resmoke.py and return the subcommand object to execute.""" parser, parsed_args = _parse(sys_args) - def create_subcommand(parser, parsed_args, **kwargs): - """Create a subcommand object based on args passed into resmoke.py.""" - - subcommand = parsed_args.command - subcommand_obj = None - if subcommand in ('find-suites', 'list-suites', 'run'): - configure_resmoke.validate_and_update_config(parser, parsed_args) - if _config.EVERGREEN_TASK_ID is not None: - subcommand_obj = commands.run.TestRunnerEvg(subcommand, **kwargs) - else: - subcommand_obj = commands.run.TestRunner(subcommand, **kwargs) - - if subcommand_obj is None: - raise RuntimeError( - f"Resmoke configuration has invalid subcommand: {subcommand}. Try '--help'") - - return subcommand_obj - - return create_subcommand(parser, parsed_args, **kwargs) + subcommand = parsed_args.command + subcommand_obj = None + if subcommand in ('find-suites', 'list-suites', 'run'): + configure_resmoke.validate_and_update_config(parser, parsed_args) + if _config.EVERGREEN_TASK_ID is not None: + subcommand_obj = commands.run.TestRunnerEvg(subcommand, **kwargs) + else: + subcommand_obj = commands.run.TestRunner(subcommand, **kwargs) + elif subcommand == 'hang-analyzer': + subcommand_obj = commands.hang_analyzer.HangAnalyzer(parsed_args) + + if subcommand_obj is None: + raise RuntimeError( + f"Resmoke configuration has invalid subcommand: {subcommand}. Try '--help'") + + return subcommand_obj def set_run_options(argstr=''): |