summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarl Raiden Worley <carl.worley@10gen.com>2020-05-04 12:02:26 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-05-05 17:28:34 +0000
commitaee854c10ec766b0af71f02675ce6ce31bd19d47 (patch)
tree175c39d75b43668bbf0505a33d2471aec190e2bb
parent424b2cd6118a919f3e780c37ed59c570e525e8fe (diff)
downloadmongo-aee854c10ec766b0af71f02675ce6ce31bd19d47.tar.gz
SERVER-46684 Repackage the hang-analyzer as a resmoke subcommand
-rwxr-xr-xbuildscripts/hang_analyzer.py755
-rw-r--r--buildscripts/resmokelib/commands/__init__.py1
-rwxr-xr-xbuildscripts/resmokelib/commands/hang_analyzer.py185
-rw-r--r--buildscripts/resmokelib/hang_analyzer/__init__.py5
-rw-r--r--buildscripts/resmokelib/hang_analyzer/dumper.py346
-rw-r--r--buildscripts/resmokelib/hang_analyzer/process.py109
-rw-r--r--buildscripts/resmokelib/hang_analyzer/process_list.py158
-rw-r--r--buildscripts/resmokelib/parser.py71
-rw-r--r--etc/evergreen.yml8
9 files changed, 864 insertions, 774 deletions
diff --git a/buildscripts/hang_analyzer.py b/buildscripts/hang_analyzer.py
index 2e0f7821f56..30eba2206e4 100755
--- a/buildscripts/hang_analyzer.py
+++ b/buildscripts/hang_analyzer.py
@@ -1,754 +1,7 @@
#!/usr/bin/env python3
-"""Hang Analyzer module.
-
-A prototype hang analyzer for Evergreen integration to help investigate test timeouts
-
-1. Script supports taking dumps, and/or dumping a summary of useful information about a process
-2. Script will iterate through a list of interesting processes,
- and run the tools from step 1. The list of processes can be provided as an option.
-3. Java processes will be dumped using jstack, if available.
-
-Supports Linux, MacOS X, Solaris, and Windows.
-"""
-
-import io
-import csv
-import glob
-import itertools
-import logging
-import os
-import platform
-import re
-import signal
-import subprocess
-import sys
-import tempfile
-import traceback
-import time
-from distutils import spawn # pylint: disable=no-name-in-module
-from optparse import OptionParser
-_IS_WINDOWS = (sys.platform == "win32")
-
-if _IS_WINDOWS:
- import win32event
- import win32api
-
-# Get relative imports to work when the package is not installed on the PYTHONPATH.
-if __name__ == "__main__" and __package__ is None:
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
- from buildscripts.resmokelib import core
-
-
-def call(args, logger):
- """Call subprocess on args list."""
- logger.info(str(args))
-
- # Use a common pipe for stdout & stderr for logging.
- process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
- logger_pipe = core.pipe.LoggerPipe(logger, logging.INFO, process.stdout)
- logger_pipe.wait_until_started()
-
- ret = process.wait()
- logger_pipe.wait_until_finished()
-
- if ret != 0:
- logger.error("Bad exit code %d", ret)
- raise Exception("Bad exit code %d from %s" % (ret, " ".join(args)))
-
-
-def callo(args, logger):
- """Call subprocess on args string."""
- logger.info("%s", str(args))
-
- return subprocess.check_output(args).decode('utf-8', 'replace')
-
-
-def find_program(prog, paths):
- """Find the specified program in env PATH, or tries a set of paths."""
- loc = spawn.find_executable(prog)
-
- if loc is not None:
- return loc
-
- for loc in paths:
- full_prog = os.path.join(loc, prog)
- if os.path.exists(full_prog):
- return full_prog
-
- return None
-
-
-def get_process_logger(debugger_output, pid, process_name):
- """Return the process logger from options specified."""
- process_logger = logging.Logger("process", level=logging.DEBUG)
- process_logger.mongo_process_filename = None
-
- if 'stdout' in debugger_output:
- s_handler = logging.StreamHandler(sys.stdout)
- s_handler.setFormatter(logging.Formatter(fmt="%(message)s"))
- process_logger.addHandler(s_handler)
-
- if 'file' in debugger_output:
- filename = "debugger_%s_%d.log" % (os.path.splitext(process_name)[0], pid)
- process_logger.mongo_process_filename = filename
- f_handler = logging.FileHandler(filename=filename, mode="w")
- f_handler.setFormatter(logging.Formatter(fmt="%(message)s"))
- process_logger.addHandler(f_handler)
-
- return process_logger
-
-
-class WindowsDumper(object):
- """WindowsDumper class."""
-
- @staticmethod
- def __find_debugger(logger, debugger):
- """Find the installed debugger."""
- # We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64
- cdb = spawn.find_executable(debugger)
- if cdb is not None:
- return cdb
- from win32com.shell import shell, shellcon
-
- # Cygwin via sshd does not expose the normal environment variables
- # Use the shell api to get the variable instead
- root_dir = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0)
-
- # Construct the debugger search paths in most-recent order
- debugger_paths = [os.path.join(root_dir, "Windows Kits", "10", "Debuggers", "x64")]
- for idx in reversed(range(0, 2)):
- debugger_paths.append(
- os.path.join(root_dir, "Windows Kits", "8." + str(idx), "Debuggers", "x64"))
-
- for dbg_path in debugger_paths:
- logger.info("Checking for debugger in %s", dbg_path)
- if os.path.exists(dbg_path):
- return os.path.join(dbg_path, debugger)
-
- return None
-
- def dump_info( # pylint: disable=too-many-arguments
- self, root_logger, logger, pid, process_name, take_dump):
- """Dump useful information to the console."""
- debugger = "cdb.exe"
- dbg = self.__find_debugger(root_logger, debugger)
-
- if dbg is None:
- root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid)
- return
-
- root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, process_name, pid)
-
- dump_command = ""
- if take_dump:
- # Dump to file, dump_<process name>.<pid>.mdmp
- dump_file = "dump_%s.%d.%s" % (os.path.splitext(process_name)[0], pid,
- self.get_dump_ext())
- dump_command = ".dump /ma %s" % dump_file
- root_logger.info("Dumping core to %s", dump_file)
-
- cmds = [
- ".symfix", # Fixup symbol path
- ".symopt +0x10", # Enable line loading (off by default in CDB, on by default in WinDBG)
- ".reload", # Reload symbols
- "!peb", # Dump current exe, & environment variables
- "lm", # Dump loaded modules
- dump_command,
- "!uniqstack -pn", # Dump All unique Threads with function arguments
- "!cs -l", # Dump all locked critical sections
- ".detach", # Detach
- "q" # Quit
- ]
-
- call([dbg, '-c', ";".join(cmds), '-p', str(pid)], logger)
-
- root_logger.info("Done analyzing %s process with PID %d", process_name, pid)
-
- @staticmethod
- def get_dump_ext():
- """Return the dump file extension."""
- return "mdmp"
-
-
-class WindowsProcessList(object):
- """WindowsProcessList class."""
-
- @staticmethod
- def __find_ps():
- """Find tasklist."""
- return os.path.join(os.environ["WINDIR"], "system32", "tasklist.exe")
-
- def dump_processes(self, logger):
- """Get list of [Pid, Process Name]."""
- ps = self.__find_ps()
-
- logger.info("Getting list of processes using %s", ps)
-
- ret = callo([ps, "/FO", "CSV"], logger)
-
- buff = io.StringIO(ret)
- csv_reader = csv.reader(buff)
-
- return [[int(row[1]), row[0]] for row in csv_reader if row[1] != "PID"]
-
-
-# LLDB dumper is for MacOS X
-class LLDBDumper(object):
- """LLDBDumper class."""
-
- @staticmethod
- def __find_debugger(debugger):
- """Find the installed debugger."""
- return find_program(debugger, ['/usr/bin'])
-
- def dump_info( # pylint: disable=too-many-arguments,too-many-locals
- self, root_logger, logger, pid, process_name, take_dump):
- """Dump info."""
- debugger = "lldb"
- dbg = self.__find_debugger(debugger)
-
- if dbg is None:
- root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid)
- return
-
- root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, process_name, pid)
-
- lldb_version = callo([dbg, "--version"], logger)
-
- logger.info(lldb_version)
-
- # Do we have the XCode or LLVM version of lldb?
- # Old versions of lldb do not work well when taking commands via a file
- # XCode (7.2): lldb-340.4.119
- # LLVM - lldb version 3.7.0 ( revision )
-
- if 'version' not in lldb_version:
- # We have XCode's lldb
- lldb_version = lldb_version[lldb_version.index("lldb-"):]
- lldb_version = lldb_version.replace('lldb-', '')
- lldb_major_version = int(lldb_version[:lldb_version.index('.')])
- if lldb_major_version < 340:
- logger.warning("Debugger lldb is too old, please upgrade to XCode 7.2")
- return
-
- dump_command = ""
- if take_dump:
- # Dump to file, dump_<process name>.<pid>.core
- dump_file = "dump_%s.%d.%s" % (process_name, pid, self.get_dump_ext())
- dump_command = "process save-core %s" % dump_file
- root_logger.info("Dumping core to %s", dump_file)
-
- cmds = [
- "attach -p %d" % pid,
- "target modules list",
- "thread backtrace all",
- dump_command,
- "settings set interpreter.prompt-on-quit false",
- "quit",
- ]
-
- tf = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8')
-
- for cmd in cmds:
- tf.write(cmd + "\n")
-
- tf.flush()
-
- # Works on in MacOS 10.9 & later
- #call([dbg] + list( itertools.chain.from_iterable([['-o', b] for b in cmds])), logger)
- call(['cat', tf.name], logger)
- call([dbg, '--source', tf.name], logger)
-
- root_logger.info("Done analyzing %s process with PID %d", process_name, pid)
-
- @staticmethod
- def get_dump_ext():
- """Return the dump file extension."""
- return "core"
-
-
-class DarwinProcessList(object):
- """DarwinProcessList class."""
-
- @staticmethod
- def __find_ps():
- """Find ps."""
- return find_program('ps', ['/bin'])
-
- def dump_processes(self, logger):
- """Get list of [Pid, Process Name]."""
- ps = self.__find_ps()
-
- logger.info("Getting list of processes using %s", ps)
-
- ret = callo([ps, "-axco", "pid,comm"], logger)
-
- buff = io.StringIO(ret)
- csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
-
- return [[int(row[0]), row[1]] for row in csv_reader if row[0] != "PID"]
-
-
-# GDB dumper is for Linux & Solaris
-class GDBDumper(object):
- """GDBDumper class."""
-
- @staticmethod
- def __find_debugger(debugger):
- """Find the installed debugger."""
- return find_program(debugger, ['/opt/mongodbtoolchain/gdb/bin', '/usr/bin'])
-
- def dump_info( # pylint: disable=too-many-arguments,too-many-locals
- self, root_logger, logger, pid, process_name, take_dump):
- """Dump info."""
- debugger = "gdb"
- dbg = self.__find_debugger(debugger)
-
- if dbg is None:
- logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid)
- return
-
- root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, process_name, pid)
-
- dump_command = ""
- if take_dump:
- # Dump to file, dump_<process name>.<pid>.core
- dump_file = "dump_%s.%d.%s" % (process_name, pid, self.get_dump_ext())
- dump_command = "gcore %s" % dump_file
- root_logger.info("Dumping core to %s", dump_file)
-
- call([dbg, "--version"], logger)
-
- script_dir = os.path.dirname(os.path.abspath(__file__))
- root_logger.info("dir %s", script_dir)
- gdb_dir = os.path.join(script_dir, "gdb")
- mongo_script = os.path.join(gdb_dir, "mongo.py")
- mongo_printers_script = os.path.join(gdb_dir, "mongo_printers.py")
- mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py")
-
- source_mongo = "source %s" % mongo_script
- source_mongo_printers = "source %s" % mongo_printers_script
- source_mongo_lock = "source %s" % mongo_lock_script
- mongodb_dump_locks = "mongodb-dump-locks"
- mongodb_show_locks = "mongodb-show-locks"
- mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active"
- mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \
- (process_name, pid)
- mongodb_javascript_stack = "mongodb-javascript-stack"
- mongod_dump_sessions = "mongod-dump-sessions"
-
- # The following MongoDB python extensions do not run on Solaris.
- if sys.platform.startswith("sunos"):
- source_mongo_lock = ""
- # SERVER-28234 - GDB frame information not available on Solaris for a templatized
- # function
- mongodb_dump_locks = ""
-
- # SERVER-28373 - GDB thread-local variables not available on Solaris
- mongodb_show_locks = ""
- mongodb_waitsfor_graph = ""
- mongodb_javascript_stack = ""
- mongod_dump_sessions = ""
-
- if not logger.mongo_process_filename:
- raw_stacks_commands = []
- else:
- base, ext = os.path.splitext(logger.mongo_process_filename)
- raw_stacks_filename = base + '_raw_stacks' + ext
- raw_stacks_commands = [
- 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename,
- # This sends output to log file rather than stdout until we turn logging off.
- 'set logging redirect on',
- 'set logging file ' + raw_stacks_filename,
- 'set logging on',
- 'thread apply all bt',
- 'set logging off',
- ]
-
- cmds = [
- "set interactive-mode off",
- "set print thread-events off", # Suppress GDB messages of threads starting/finishing.
- "file %s" % process_name, # Solaris must load the process to read the symbols.
- "attach %d" % pid,
- "info sharedlibrary",
- "info threads", # Dump a simple list of commands to get the thread name
- "set python print-stack full",
- ] + raw_stacks_commands + [
- source_mongo,
- source_mongo_printers,
- source_mongo_lock,
- mongodb_uniqstack,
- # Lock the scheduler, before running commands, which execute code in the attached process.
- "set scheduler-locking on",
- dump_command,
- mongodb_dump_locks,
- mongodb_show_locks,
- mongodb_waitsfor_graph,
- mongodb_javascript_stack,
- mongod_dump_sessions,
- "set confirm off",
- "quit",
- ]
-
- call([dbg, "--quiet", "--nx"] + list(
- itertools.chain.from_iterable([['-ex', b] for b in cmds])), logger)
-
- root_logger.info("Done analyzing %s process with PID %d", process_name, pid)
-
- @staticmethod
- def get_dump_ext():
- """Return the dump file extension."""
- return "core"
-
- @staticmethod
- def _find_gcore():
- """Find the installed gcore."""
- dbg = "/usr/bin/gcore"
- if os.path.exists(dbg):
- return dbg
-
- return None
-
-
-class LinuxProcessList(object):
- """LinuxProcessList class."""
-
- @staticmethod
- def __find_ps():
- """Find ps."""
- return find_program('ps', ['/bin', '/usr/bin'])
-
- def dump_processes(self, logger):
- """Get list of [Pid, Process Name]."""
- ps = self.__find_ps()
-
- logger.info("Getting list of processes using %s", ps)
-
- call([ps, "--version"], logger)
-
- ret = callo([ps, "-eo", "pid,args"], logger)
-
- buff = io.StringIO(ret)
- csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
-
- return [[int(row[0]), os.path.split(row[1])[1]] for row in csv_reader if row[0] != "PID"]
-
-
-class SolarisProcessList(object):
- """SolarisProcessList class."""
-
- @staticmethod
- def __find_ps():
- """Find ps."""
- return find_program('ps', ['/bin', '/usr/bin'])
-
- def dump_processes(self, logger):
- """Get list of [Pid, Process Name]."""
- ps = self.__find_ps()
-
- logger.info("Getting list of processes using %s", ps)
-
- ret = callo([ps, "-eo", "pid,args"], logger)
-
- buff = io.StringIO(ret)
- csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
-
- return [[int(row[0]), os.path.split(row[1])[1]] for row in csv_reader if row[0] != "PID"]
-
-
-# jstack is a JDK utility
-class JstackDumper(object):
- """JstackDumper class."""
-
- @staticmethod
- def __find_debugger(debugger):
- """Find the installed jstack debugger."""
- return find_program(debugger, ['/usr/bin'])
-
- def dump_info(self, root_logger, logger, pid, process_name):
- """Dump java thread stack traces to the console."""
- debugger = "jstack"
- jstack = self.__find_debugger(debugger)
-
- if jstack is None:
- logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid)
- return
-
- root_logger.info("Debugger %s, analyzing %s process with PID %d", jstack, process_name, pid)
-
- call([jstack, "-l", str(pid)], logger)
-
- root_logger.info("Done analyzing %s process with PID %d", process_name, pid)
-
-
-# jstack is a JDK utility
-class JstackWindowsDumper(object):
- """JstackWindowsDumper class."""
-
- @staticmethod
- def dump_info(root_logger, pid):
- """Dump java thread stack traces to the logger."""
-
- root_logger.warning("Debugger jstack not supported, skipping dumping of %d", pid)
-
-
-def get_hang_analyzers():
- """Return hang analyzers."""
-
- dbg = None
- jstack = None
- ps = None
- if sys.platform.startswith("linux"):
- dbg = GDBDumper()
- jstack = JstackDumper()
- ps = LinuxProcessList()
- elif sys.platform.startswith("sunos"):
- dbg = GDBDumper()
- jstack = JstackDumper()
- ps = SolarisProcessList()
- elif _IS_WINDOWS or sys.platform == "cygwin":
- dbg = WindowsDumper()
- jstack = JstackWindowsDumper()
- ps = WindowsProcessList()
- elif sys.platform == "darwin":
- dbg = LLDBDumper()
- jstack = JstackDumper()
- ps = DarwinProcessList()
-
- return [ps, dbg, jstack]
-
-
-def check_dump_quota(quota, ext):
- """Check if sum of the files with ext is within the specified quota in megabytes."""
-
- files = glob.glob("*." + ext)
-
- size_sum = 0
- for file_name in files:
- size_sum += os.path.getsize(file_name)
-
- return size_sum <= quota
-
-
-def signal_event_object(logger, pid):
- """Signal the Windows event object."""
-
- # Use unique event_name created.
- event_name = "Global\\Mongo_Python_" + str(pid)
-
- try:
- desired_access = win32event.EVENT_MODIFY_STATE
- inherit_handle = False
- task_timeout_handle = win32event.OpenEvent(desired_access, inherit_handle, event_name)
- except win32event.error as err:
- logger.info("Exception from win32event.OpenEvent with error: %s", err)
- return
-
- try:
- win32event.SetEvent(task_timeout_handle)
- except win32event.error as err:
- logger.info("Exception from win32event.SetEvent with error: %s", err)
- finally:
- win32api.CloseHandle(task_timeout_handle)
-
- logger.info("Waiting for process to report")
- time.sleep(5)
-
-
-def signal_process(logger, pid, signalnum):
- """Signal process with signal, N/A on Windows."""
- try:
- os.kill(pid, signalnum)
-
- logger.info("Waiting for process to report")
- time.sleep(5)
- except OSError as err:
- logger.error("Hit OS error trying to signal process: %s", err)
-
- except AttributeError:
- logger.error("Cannot send signal to a process on Windows")
-
-
-def pname_match(match_type, pname, interesting_processes):
- """Return True if the pname matches in interesting_processes."""
- pname = os.path.splitext(pname)[0]
- for ip in interesting_processes:
- if match_type == 'exact' and pname == ip or match_type == 'contains' and ip in pname:
- return True
- return False
-
-
-# Basic procedure
-#
-# 1. Get a list of interesting processes
-# 2. Dump useful information or take dumps
-def main(): # pylint: disable=too-many-branches,too-many-locals,too-many-statements
- """Execute Main program."""
- root_logger = logging.Logger("hang_analyzer", level=logging.DEBUG)
-
- handler = logging.StreamHandler(sys.stdout)
- handler.setFormatter(logging.Formatter(fmt="%(message)s"))
- root_logger.addHandler(handler)
-
- root_logger.info("Python Version: %s", sys.version)
- root_logger.info("OS: %s", platform.platform())
-
- try:
- if _IS_WINDOWS or sys.platform == "cygwin":
- distro = platform.win32_ver()
- root_logger.info("Windows Distribution: %s", distro)
- else:
- distro = platform.linux_distribution()
- root_logger.info("Linux Distribution: %s", distro)
-
- except AttributeError:
- root_logger.warning("Cannot determine Linux distro since Python is too old")
-
- try:
- uid = os.getuid()
- root_logger.info("Current User: %s", uid)
- current_login = os.getlogin()
- root_logger.info("Current Login: %s", current_login)
- except OSError:
- root_logger.warning("Cannot determine Unix Current Login")
- except AttributeError:
- root_logger.warning("Cannot determine Unix Current Login, not supported on Windows")
-
- interesting_processes = ["mongo", "mongod", "mongos", "_test", "dbtest", "python", "java"]
- go_processes = []
- process_ids = []
-
- parser = OptionParser(description=__doc__)
- parser.add_option(
- '-m', '--process-match', dest='process_match', choices=['contains', 'exact'],
- default='contains', help="Type of match for process names (-p & -g), specify 'contains', or"
- " 'exact'. Note that the process name match performs the following"
- " conversions: change all process names to lowecase, strip off the file"
- " extension, like '.exe' on Windows. Default is 'contains'.")
- parser.add_option('-p', '--process-names', dest='process_names',
- help='Comma separated list of process names to analyze')
- parser.add_option('-g', '--go-process-names', dest='go_process_names',
- help='Comma separated list of go process names to analyze')
- parser.add_option(
- '-d', '--process-ids', dest='process_ids', default=None,
- help='Comma separated list of process ids (PID) to analyze, overrides -p &'
- ' -g')
- parser.add_option('-c', '--dump-core', dest='dump_core', action="store_true", default=False,
- help='Dump core file for each analyzed process')
- parser.add_option('-s', '--max-core-dumps-size', dest='max_core_dumps_size', default=10000,
- help='Maximum total size of core dumps to keep in megabytes')
- parser.add_option(
- '-o', '--debugger-output', dest='debugger_output', action="append",
- choices=['file', 'stdout'], default=None,
- help="If 'stdout', then the debugger's output is written to the Python"
- " process's stdout. If 'file', then the debugger's output is written"
- " to a file named debugger_<process>_<pid>.log for each process it"
- " attaches to. This option can be specified multiple times on the"
- " command line to have the debugger's output written to multiple"
- " locations. By default, the debugger's output is written only to the"
- " Python process's stdout.")
-
- (options, _) = parser.parse_args()
-
- if options.debugger_output is None:
- options.debugger_output = ['stdout']
-
- if options.process_ids is not None:
- # process_ids is an int list of PIDs
- process_ids = [int(pid) for pid in options.process_ids.split(',')]
-
- if options.process_names is not None:
- interesting_processes = options.process_names.split(',')
-
- if options.go_process_names is not None:
- go_processes = options.go_process_names.split(',')
- interesting_processes += go_processes
-
- [ps, dbg, jstack] = get_hang_analyzers()
-
- if ps is None or (dbg is None and jstack is None):
- root_logger.warning("hang_analyzer.py: Unsupported platform: %s", sys.platform)
- exit(1)
-
- all_processes = ps.dump_processes(root_logger)
-
- # Canonicalize the process names to lowercase to handle cases where the name of the Python
- # process is /System/Library/.../Python on OS X and -p python is specified to hang_analyzer.py.
- all_processes = [(pid, process_name.lower()) for (pid, process_name) in all_processes]
-
- # Find all running interesting processes:
- # If a list of process_ids is supplied, match on that.
- # Otherwise, do a substring match on interesting_processes.
- if process_ids:
- processes = [(pid, pname) for (pid, pname) in all_processes
- if pid in process_ids and pid != os.getpid()]
-
- running_pids = {pid for (pid, pname) in all_processes}
- missing_pids = set(process_ids) - running_pids
- if missing_pids:
- root_logger.warning("The following requested process ids are not running %s",
- list(missing_pids))
- else:
- processes = [(pid, pname) for (pid, pname) in all_processes
- if pname_match(options.process_match, pname, interesting_processes)
- and pid != os.getpid()]
-
- root_logger.info("Found %d interesting processes %s", len(processes), processes)
-
- max_dump_size_bytes = int(options.max_core_dumps_size) * 1024 * 1024
-
- # Dump python processes by signalling them. The resmoke.py process will generate
- # the report.json, when signalled, so we do this before attaching to other processes.
- for (pid, process_name) in [(p, pn) for (p, pn) in processes if pn.startswith("python")]:
- # On Windows, we set up an event object to wait on a signal. For Cygwin, we register
- # a signal handler to wait for the signal since it supports POSIX signals.
- if _IS_WINDOWS:
- root_logger.info("Calling SetEvent to signal python process %s with PID %d",
- process_name, pid)
- signal_event_object(root_logger, pid)
- else:
- root_logger.info("Sending signal SIGUSR1 to python process %s with PID %d",
- process_name, pid)
- signal_process(root_logger, pid, signal.SIGUSR1)
-
- trapped_exceptions = []
-
- # Dump all processes, except python & java.
- for (pid,
- process_name) in [(p, pn) for (p, pn) in processes if not re.match("^(java|python)", pn)]:
- process_logger = get_process_logger(options.debugger_output, pid, process_name)
- try:
- dbg.dump_info(
- root_logger, process_logger, pid, process_name, options.dump_core
- and check_dump_quota(max_dump_size_bytes, dbg.get_dump_ext()))
- except Exception as err: # pylint: disable=broad-except
- root_logger.info("Error encountered when invoking debugger %s", err)
- trapped_exceptions.append(traceback.format_exc())
-
- # Dump java processes using jstack.
- for (pid, process_name) in [(p, pn) for (p, pn) in processes if pn.startswith("java")]:
- process_logger = get_process_logger(options.debugger_output, pid, process_name)
- try:
- jstack.dump_info(root_logger, pid)
- except Exception as err: # pylint: disable=broad-except
- root_logger.info("Error encountered when invoking debugger %s", err)
- trapped_exceptions.append(traceback.format_exc())
-
- # Signal go processes to ensure they print out stack traces, and die on POSIX OSes.
- # On Windows, this will simply kill the process since python emulates SIGABRT as
- # TerminateProcess.
- # Note: The stacktrace output may be captured elsewhere (i.e. resmoke).
- for (pid, process_name) in [(p, pn) for (p, pn) in processes if pn in go_processes]:
- root_logger.info("Sending signal SIGABRT to go process %s with PID %d", process_name, pid)
- signal_process(root_logger, pid, signal.SIGABRT)
-
- root_logger.info("Done analyzing all processes for hangs")
-
- for exception in trapped_exceptions:
- root_logger.info(exception)
- if trapped_exceptions:
- sys.exit(1)
-
+"""Stub file pointing users to the new invocation."""
if __name__ == "__main__":
- main()
+ print("Hello! It seems you've executed 'buildscripts/hang_analyzer.py'. We have recently\n"
+ "repackaged the hang analyzer as a subcommand of resmoke. It can now be invoked as\n"
+ "'./buildscripts/resmoke.py hang-analyzer' with all of the same arguments as before.")
diff --git a/buildscripts/resmokelib/commands/__init__.py b/buildscripts/resmokelib/commands/__init__.py
index 1a3235631a7..641e3625cfd 100644
--- a/buildscripts/resmokelib/commands/__init__.py
+++ b/buildscripts/resmokelib/commands/__init__.py
@@ -2,3 +2,4 @@
from . import interface
from . import run
+from . import hang_analyzer
diff --git a/buildscripts/resmokelib/commands/hang_analyzer.py b/buildscripts/resmokelib/commands/hang_analyzer.py
new file mode 100755
index 00000000000..8df33b6c35e
--- /dev/null
+++ b/buildscripts/resmokelib/commands/hang_analyzer.py
@@ -0,0 +1,185 @@
+"""Hang Analyzer module.
+
+A prototype hang analyzer for Evergreen integration to help investigate test timeouts
+
+1. Script supports taking dumps, and/or dumping a summary of useful information about a process
+2. Script will iterate through a list of interesting processes,
+ and run the tools from step 1. The list of processes can be provided as an option.
+3. Java processes will be dumped using jstack, if available.
+
+Supports Linux, MacOS X, and Windows.
+"""
+import re
+import os
+import sys
+import glob
+import signal
+import logging
+import platform
+import traceback
+
+from buildscripts.resmokelib.commands import interface
+
+from buildscripts.resmokelib.hang_analyzer import dumper
+from buildscripts.resmokelib.hang_analyzer import process_list
+from buildscripts.resmokelib.hang_analyzer.process import signal_process, signal_python
+
+
+class HangAnalyzer(interface.Subcommand):
+ """Main class for the hang analyzer subcommand."""
+
+ def __init__(self, options):
+ """
+ Configure processe lists based on options.
+
+ :param options: Options as parsed by parser.py
+ """
+ self.options = options
+ self.root_logger = None
+ self.interesting_processes = [
+ "mongo", "mongod", "mongos", "_test", "dbtest", "python", "java"
+ ]
+ self.go_processes = []
+ self.process_ids = []
+
+ self._configure_processes()
+
+ def execute(self): # pylint: disable=too-many-branches,too-many-locals,too-many-statements
+ """
+ Execute hang analysis.
+
+ 1. Get a list of interesting processes
+ 2. Dump useful information or take core dumps
+ """
+ self._setup_logging()
+ self._log_system_info()
+
+ dumpers = dumper.get_dumpers()
+
+ processes = process_list.get_processes(self.process_ids, self.interesting_processes,
+ self.options.process_match, self.root_logger)
+
+ max_dump_size_bytes = int(self.options.max_core_dumps_size) * 1024 * 1024
+
+ # Dump python processes by signalling them. The resmoke.py process will generate
+ # the report.json, when signalled, so we do this before attaching to other processes.
+ for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("python")]:
+ signal_python(self.root_logger, pinfo)
+
+ trapped_exceptions = []
+
+ # Dump all processes, except python & java.
+ for pinfo in [pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)]:
+ process_logger = self._get_process_logger(pinfo)
+ try:
+ dumpers.dbg.dump_info(
+ self.root_logger, process_logger, pinfo, self.options.dump_core
+ and _check_dump_quota(max_dump_size_bytes, dumpers.dbg.get_dump_ext()))
+ except Exception as err: # pylint: disable=broad-except
+ self.root_logger.info("Error encountered when invoking debugger %s", err)
+ trapped_exceptions.append(traceback.format_exc())
+
+ # Dump java processes using jstack.
+ for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("java")]:
+ process_logger = self._get_process_logger(pinfo)
+ try:
+ dumpers.jstack.dump_info(self.root_logger, pinfo.pid)
+ except Exception as err: # pylint: disable=broad-except
+ self.root_logger.info("Error encountered when invoking debugger %s", err)
+ trapped_exceptions.append(traceback.format_exc())
+
+ # Signal go processes to ensure they print out stack traces, and die on POSIX OSes.
+ # On Windows, this will simply kill the process since python emulates SIGABRT as
+ # TerminateProcess.
+ # Note: The stacktrace output may be captured elsewhere (i.e. resmoke).
+ for pinfo in [pinfo for pinfo in processes if pinfo.name in self.go_processes]:
+ self.root_logger.info("Sending signal SIGABRT to go process %s with PID %d", pinfo.name,
+ pinfo.pid)
+ signal_process(self.root_logger, pinfo.pid, signal.SIGABRT)
+
+ self.root_logger.info("Done analyzing all processes for hangs")
+
+ for exception in trapped_exceptions:
+ self.root_logger.info(exception)
+ if trapped_exceptions:
+ raise RuntimeError(
+ "Exceptions were thrown while dumping. There may still be some valid dumps.")
+
+ def _configure_processes(self):
+ if self.options.debugger_output is None:
+ self.options.debugger_output = ['stdout']
+
+ if self.options.process_ids is not None:
+ # self.process_ids is an int list of PIDs
+ self.process_ids = [int(pid) for pid in self.options.process_ids.split(',')]
+
+ if self.options.process_names is not None:
+ self.interesting_processes = self.options.process_names.split(',')
+
+ if self.options.go_process_names is not None:
+ self.go_processes = self.options.go_process_names.split(',')
+ self.interesting_processes += self.go_processes
+
+ def _setup_logging(self):
+ self.root_logger = logging.Logger("hang_analyzer", level=logging.DEBUG)
+
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setFormatter(logging.Formatter(fmt="%(message)s"))
+ self.root_logger.addHandler(handler)
+
+ self.root_logger.info("Python Version: %s", sys.version)
+ self.root_logger.info("OS: %s", platform.platform())
+
+ def _log_system_info(self):
+ try:
+ if sys.platform == "win32" or sys.platform == "cygwin":
+ distro = platform.win32_ver()
+ self.root_logger.info("Windows Distribution: %s", distro)
+ else:
+ distro = platform.linux_distribution()
+ self.root_logger.info("Linux Distribution: %s", distro)
+
+ except AttributeError:
+ self.root_logger.warning("Cannot determine Linux distro since Python is too old")
+
+ try:
+ uid = os.getuid()
+ self.root_logger.info("Current User: %s", uid)
+ current_login = os.getlogin()
+ self.root_logger.info("Current Login: %s", current_login)
+ except OSError:
+ self.root_logger.warning("Cannot determine Unix Current Login")
+ except AttributeError:
+ self.root_logger.warning(
+ "Cannot determine Unix Current Login, not supported on Windows")
+
+ def _get_process_logger(self, pinfo):
+ """Return the process logger from options specified."""
+ process_logger = logging.Logger("process", level=logging.DEBUG)
+ process_logger.mongo_process_filename = None
+
+ if 'stdout' in self.options.debugger_output:
+ s_handler = logging.StreamHandler(sys.stdout)
+ s_handler.setFormatter(logging.Formatter(fmt="%(message)s"))
+ process_logger.addHandler(s_handler)
+
+ if 'file' in self.options.debugger_output:
+ filename = "debugger_%s_%d.log" % (os.path.splitext(pinfo.name)[0], pinfo.pid)
+ process_logger.mongo_process_filename = filename
+ f_handler = logging.FileHandler(filename=filename, mode="w")
+ f_handler.setFormatter(logging.Formatter(fmt="%(message)s"))
+ process_logger.addHandler(f_handler)
+
+ return process_logger
+
+
+def _check_dump_quota(quota, ext):
+ """Check if sum of the files with ext is within the specified quota in megabytes."""
+
+ files = glob.glob("*." + ext)
+
+ size_sum = 0
+ for file_name in files:
+ size_sum += os.path.getsize(file_name)
+
+ return size_sum <= quota
diff --git a/buildscripts/resmokelib/hang_analyzer/__init__.py b/buildscripts/resmokelib/hang_analyzer/__init__.py
new file mode 100644
index 00000000000..246385cdacb
--- /dev/null
+++ b/buildscripts/resmokelib/hang_analyzer/__init__.py
@@ -0,0 +1,5 @@
+"""Utilities for the hang analyzer subcommand."""
+
+from . import process_list
+from . import dumper
+from . import process
diff --git a/buildscripts/resmokelib/hang_analyzer/dumper.py b/buildscripts/resmokelib/hang_analyzer/dumper.py
new file mode 100644
index 00000000000..12036f4e470
--- /dev/null
+++ b/buildscripts/resmokelib/hang_analyzer/dumper.py
@@ -0,0 +1,346 @@
+"""Tools to dump debug info for each OS."""
+
+import os
+import sys
+import tempfile
+import itertools
+from distutils import spawn # pylint: disable=no-name-in-module
+from collections import namedtuple
+
+from buildscripts.resmokelib.hang_analyzer.process import call, callo, find_program
+
+Dumpers = namedtuple('Dumpers', ['dbg', 'jstack'])
+
+
+def get_dumpers():
+ """Return OS-appropriate dumpers."""
+
+ dbg = None
+ jstack = None
+ if sys.platform.startswith("linux"):
+ dbg = GDBDumper()
+ jstack = JstackDumper()
+ elif sys.platform == "win32" or sys.platform == "cygwin":
+ dbg = WindowsDumper()
+ jstack = JstackWindowsDumper()
+ elif sys.platform == "darwin":
+ dbg = LLDBDumper()
+ jstack = JstackDumper()
+
+ return Dumpers(dbg=dbg, jstack=jstack)
+
+
+class Dumper(object):
+ """Abstract base class for OS-specific dumpers."""
+
+ def dump_info( # pylint: disable=too-many-arguments,too-many-locals
+ self, root_logger, logger, pinfo, take_dump):
+ """
+ Perform dump for a process.
+
+ :param root_logger: Top-level logger
+ :param logger: Logger to output dump info to
+ :param pinfo: A Pinfo describing the process
+ :param take_dump: Whether to take a core dump
+ """
+ raise NotImplementedError("dump_info must be implemented in OS-specific subclasses")
+
+ @staticmethod
+ def get_dump_ext():
+ """Return the dump file extension."""
+ raise NotImplementedError("get_dump_ext must be implemented in OS-specific subclasses")
+
+
+class WindowsDumper(Dumper):
+ """WindowsDumper class."""
+
+ @staticmethod
+ def __find_debugger(logger, debugger):
+ """Find the installed debugger."""
+ # We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64
+ cdb = spawn.find_executable(debugger)
+ if cdb is not None:
+ return cdb
+ from win32com.shell import shell, shellcon
+
+ # Cygwin via sshd does not expose the normal environment variables
+ # Use the shell api to get the variable instead
+ root_dir = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0)
+
+ # Construct the debugger search paths in most-recent order
+ debugger_paths = [os.path.join(root_dir, "Windows Kits", "10", "Debuggers", "x64")]
+ for idx in reversed(range(0, 2)):
+ debugger_paths.append(
+ os.path.join(root_dir, "Windows Kits", "8." + str(idx), "Debuggers", "x64"))
+
+ for dbg_path in debugger_paths:
+ logger.info("Checking for debugger in %s", dbg_path)
+ if os.path.exists(dbg_path):
+ return os.path.join(dbg_path, debugger)
+
+ return None
+
+ def dump_info( # pylint: disable=too-many-arguments
+ self, root_logger, logger, pinfo, take_dump):
+ """Dump useful information to the console."""
+ debugger = "cdb.exe"
+ dbg = self.__find_debugger(root_logger, debugger)
+
+ if dbg is None:
+ root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger,
+ pinfo.pid)
+ return
+
+ root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, pinfo.name,
+ pinfo.pid)
+
+ dump_command = ""
+ if take_dump:
+ # Dump to file, dump_<process name>.<pid>.mdmp
+ dump_file = "dump_%s.%d.%s" % (os.path.splitext(pinfo.name)[0], pinfo.pid,
+ self.get_dump_ext())
+ dump_command = ".dump /ma %s" % dump_file
+ root_logger.info("Dumping core to %s", dump_file)
+
+ cmds = [
+ ".symfix", # Fixup symbol path
+ ".symopt +0x10", # Enable line loading (off by default in CDB, on by default in WinDBG)
+ ".reload", # Reload symbols
+ "!peb", # Dump current exe, & environment variables
+ "lm", # Dump loaded modules
+ dump_command,
+ "!uniqstack -pn", # Dump All unique Threads with function arguments
+ "!cs -l", # Dump all locked critical sections
+ ".detach", # Detach
+ "q" # Quit
+ ]
+
+ call([dbg, '-c', ";".join(cmds), '-p', str(pinfo.pid)], logger)
+
+ root_logger.info("Done analyzing %s process with PID %d", pinfo.name, pinfo.pid)
+
+ @staticmethod
+ def get_dump_ext():
+ """Return the dump file extension."""
+ return "mdmp"
+
+
+# LLDB dumper is for MacOS X
+class LLDBDumper(Dumper):
+ """LLDBDumper class."""
+
+ @staticmethod
+ def __find_debugger(debugger):
+ """Find the installed debugger."""
+ return find_program(debugger, ['/usr/bin'])
+
+ def dump_info( # pylint: disable=too-many-arguments,too-many-locals
+ self, root_logger, logger, pinfo, take_dump):
+ """Dump info."""
+ debugger = "lldb"
+ dbg = self.__find_debugger(debugger)
+
+ if dbg is None:
+ root_logger.warning("Debugger %s not found, skipping dumping of %d", debugger,
+ pinfo.pid)
+ return
+
+ root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, pinfo.name,
+ pinfo.pid)
+
+ lldb_version = callo([dbg, "--version"], logger)
+
+ logger.info(lldb_version)
+
+ # Do we have the XCode or LLVM version of lldb?
+ # Old versions of lldb do not work well when taking commands via a file
+ # XCode (7.2): lldb-340.4.119
+ # LLVM - lldb version 3.7.0 ( revision )
+
+ if 'version' not in lldb_version:
+ # We have XCode's lldb
+ lldb_version = lldb_version[lldb_version.index("lldb-"):]
+ lldb_version = lldb_version.replace('lldb-', '')
+ lldb_major_version = int(lldb_version[:lldb_version.index('.')])
+ if lldb_major_version < 340:
+ logger.warning("Debugger lldb is too old, please upgrade to XCode 7.2")
+ return
+
+ dump_command = ""
+ if take_dump:
+ # Dump to file, dump_<process name>.<pid>.core
+ dump_file = "dump_%s.%d.%s" % (pinfo.name, pinfo.pid, self.get_dump_ext())
+ dump_command = "process save-core %s" % dump_file
+ root_logger.info("Dumping core to %s", dump_file)
+
+ cmds = [
+ "attach -p %d" % pinfo.pid,
+ "target modules list",
+ "thread backtrace all",
+ dump_command,
+ "settings set interpreter.prompt-on-quit false",
+ "quit",
+ ]
+
+ tf = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8')
+
+ for cmd in cmds:
+ tf.write(cmd + "\n")
+
+ tf.flush()
+
+ # Works on in MacOS 10.9 & later
+ #call([dbg] + list( itertools.chain.from_iterable([['-o', b] for b in cmds])), logger)
+ call(['cat', tf.name], logger)
+ call([dbg, '--source', tf.name], logger)
+
+ root_logger.info("Done analyzing %s process with PID %d", pinfo.name, pinfo.pid)
+
+ @staticmethod
+ def get_dump_ext():
+ """Return the dump file extension."""
+ return "core"
+
+
+# GDB dumper is for Linux
+class GDBDumper(Dumper):
+ """GDBDumper class."""
+
+ @staticmethod
+ def __find_debugger(debugger):
+ """Find the installed debugger."""
+ return find_program(debugger, ['/opt/mongodbtoolchain/gdb/bin', '/usr/bin'])
+
+ def dump_info( # pylint: disable=too-many-arguments,too-many-locals
+ self, root_logger, logger, pinfo, take_dump):
+ """Dump info."""
+ debugger = "gdb"
+ dbg = self.__find_debugger(debugger)
+
+ if dbg is None:
+ logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pinfo.pid)
+ return
+
+ root_logger.info("Debugger %s, analyzing %s process with PID %d", dbg, pinfo.name,
+ pinfo.pid)
+
+ dump_command = ""
+ if take_dump:
+ # Dump to file, dump_<process name>.<pid>.core
+ dump_file = "dump_%s.%d.%s" % (pinfo.name, pinfo.pid, self.get_dump_ext())
+ dump_command = "gcore %s" % dump_file
+ root_logger.info("Dumping core to %s", dump_file)
+
+ call([dbg, "--version"], logger)
+
+ script_dir = "buildscripts"
+ root_logger.info("dir %s", script_dir)
+ gdb_dir = os.path.join(script_dir, "gdb")
+ mongo_script = os.path.join(gdb_dir, "mongo.py")
+ mongo_printers_script = os.path.join(gdb_dir, "mongo_printers.py")
+ mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py")
+
+ source_mongo = "source %s" % mongo_script
+ source_mongo_printers = "source %s" % mongo_printers_script
+ source_mongo_lock = "source %s" % mongo_lock_script
+ mongodb_dump_locks = "mongodb-dump-locks"
+ mongodb_show_locks = "mongodb-show-locks"
+ mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active"
+ mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \
+ (pinfo.name, pinfo.pid)
+ mongodb_javascript_stack = "mongodb-javascript-stack"
+ mongod_dump_sessions = "mongod-dump-sessions"
+
+ if not logger.mongo_process_filename:
+ raw_stacks_commands = []
+ else:
+ base, ext = os.path.splitext(logger.mongo_process_filename)
+ raw_stacks_filename = base + '_raw_stacks' + ext
+ raw_stacks_commands = [
+ 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename,
+ # This sends output to log file rather than stdout until we turn logging off.
+ 'set logging redirect on',
+ 'set logging file ' + raw_stacks_filename,
+ 'set logging on',
+ 'thread apply all bt',
+ 'set logging off',
+ ]
+
+ cmds = [
+ "set interactive-mode off",
+ "set print thread-events off", # Suppress GDB messages of threads starting/finishing.
+ "attach %d" % pinfo.pid,
+ "info sharedlibrary",
+ "info threads", # Dump a simple list of commands to get the thread name
+ "set python print-stack full",
+ ] + raw_stacks_commands + [
+ source_mongo,
+ source_mongo_printers,
+ source_mongo_lock,
+ mongodb_uniqstack,
+ # Lock the scheduler, before running commands, which execute code in the attached process.
+ "set scheduler-locking on",
+ dump_command,
+ mongodb_dump_locks,
+ mongodb_show_locks,
+ mongodb_waitsfor_graph,
+ mongodb_javascript_stack,
+ mongod_dump_sessions,
+ "set confirm off",
+ "quit",
+ ]
+
+ call([dbg, "--quiet", "--nx"] + list(
+ itertools.chain.from_iterable([['-ex', b] for b in cmds])), logger)
+
+ root_logger.info("Done analyzing %s process with PID %d", pinfo.name, pinfo.pid)
+
+ @staticmethod
+ def get_dump_ext():
+ """Return the dump file extension."""
+ return "core"
+
+ @staticmethod
+ def _find_gcore():
+ """Find the installed gcore."""
+ dbg = "/usr/bin/gcore"
+ if os.path.exists(dbg):
+ return dbg
+
+ return None
+
+
+# jstack is a JDK utility
+class JstackDumper(object):
+ """JstackDumper class."""
+
+ @staticmethod
+ def __find_debugger(debugger):
+ """Find the installed jstack debugger."""
+ return find_program(debugger, ['/usr/bin'])
+
+ def dump_info(self, root_logger, logger, pid, process_name):
+ """Dump java thread stack traces to the console."""
+ debugger = "jstack"
+ jstack = self.__find_debugger(debugger)
+
+ if jstack is None:
+ logger.warning("Debugger %s not found, skipping dumping of %d", debugger, pid)
+ return
+
+ root_logger.info("Debugger %s, analyzing %s process with PID %d", jstack, process_name, pid)
+
+ call([jstack, "-l", str(pid)], logger)
+
+ root_logger.info("Done analyzing %s process with PID %d", process_name, pid)
+
+
+# jstack is a JDK utility
+class JstackWindowsDumper(object):
+ """JstackWindowsDumper class."""
+
+ @staticmethod
+ def dump_info(root_logger, pid):
+ """Dump java thread stack traces to the logger."""
+
+ root_logger.warning("Debugger jstack not supported, skipping dumping of %d", pid)
diff --git a/buildscripts/resmokelib/hang_analyzer/process.py b/buildscripts/resmokelib/hang_analyzer/process.py
new file mode 100644
index 00000000000..33d11829bc1
--- /dev/null
+++ b/buildscripts/resmokelib/hang_analyzer/process.py
@@ -0,0 +1,109 @@
+"""Miscellaneous utility functions used by the hang analyzer."""
+
+import os
+import sys
+import time
+import signal
+import logging
+import subprocess
+from distutils import spawn # pylint: disable=no-name-in-module
+
+from buildscripts.resmokelib import core
+
+_IS_WINDOWS = (sys.platform == "win32")
+
+if _IS_WINDOWS:
+ import win32event
+ import win32api
+
+
+def call(args, logger):
+ """Call subprocess on args list."""
+ logger.info(str(args))
+
+ # Use a common pipe for stdout & stderr for logging.
+ process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ logger_pipe = core.pipe.LoggerPipe(logger, logging.INFO, process.stdout)
+ logger_pipe.wait_until_started()
+
+ ret = process.wait()
+ logger_pipe.wait_until_finished()
+
+ if ret != 0:
+ logger.error("Bad exit code %d", ret)
+ raise Exception("Bad exit code %d from %s" % (ret, " ".join(args)))
+
+
+def find_program(prog, paths):
+ """Find the specified program in env PATH, or tries a set of paths."""
+ loc = spawn.find_executable(prog)
+
+ if loc is not None:
+ return loc
+
+ for loc in paths:
+ full_prog = os.path.join(loc, prog)
+ if os.path.exists(full_prog):
+ return full_prog
+
+ return None
+
+
+def callo(args, logger):
+ """Call subprocess on args string."""
+ logger.info("%s", str(args))
+ return subprocess.check_output(args).decode('utf-8', 'replace')
+
+
+def signal_python(logger, pinfo):
+ """Send appropriate dumping signal to python processes."""
+
+ # On Windows, we set up an event object to wait on a signal. For Cygwin, we register
+ # a signal handler to wait for the signal since it supports POSIX signals.
+ if _IS_WINDOWS:
+ logger.info("Calling SetEvent to signal python process %s with PID %d", pinfo.name,
+ pinfo.pid)
+ signal_event_object(logger, pinfo.pid)
+ else:
+ logger.info("Sending signal SIGUSR1 to python process %s with PID %d", pinfo.name,
+ pinfo.pid)
+ signal_process(logger, pinfo.pid, signal.SIGUSR1)
+
+
+def signal_event_object(logger, pid):
+ """Signal the Windows event object."""
+
+ # Use unique event_name created.
+ event_name = "Global\\Mongo_Python_" + str(pid)
+
+ try:
+ desired_access = win32event.EVENT_MODIFY_STATE
+ inherit_handle = False
+ task_timeout_handle = win32event.OpenEvent(desired_access, inherit_handle, event_name)
+ except win32event.error as err:
+ logger.info("Exception from win32event.OpenEvent with error: %s", err)
+ return
+
+ try:
+ win32event.SetEvent(task_timeout_handle)
+ except win32event.error as err:
+ logger.info("Exception from win32event.SetEvent with error: %s", err)
+ finally:
+ win32api.CloseHandle(task_timeout_handle)
+
+ logger.info("Waiting for process to report")
+ time.sleep(5)
+
+
+def signal_process(logger, pid, signalnum):
+ """Signal process with signal, N/A on Windows."""
+ try:
+ os.kill(pid, signalnum)
+
+ logger.info("Waiting for process to report")
+ time.sleep(5)
+ except OSError as err:
+ logger.error("Hit OS error trying to signal process: %s", err)
+
+ except AttributeError:
+ logger.error("Cannot send signal to a process on Windows")
diff --git a/buildscripts/resmokelib/hang_analyzer/process_list.py b/buildscripts/resmokelib/hang_analyzer/process_list.py
new file mode 100644
index 00000000000..3f60a54dd08
--- /dev/null
+++ b/buildscripts/resmokelib/hang_analyzer/process_list.py
@@ -0,0 +1,158 @@
+"""Functions to list processes in each OS and search for interesting processes."""
+
+import os
+import io
+import sys
+import csv
+from collections import namedtuple
+
+from buildscripts.resmokelib.hang_analyzer.process import call, callo, find_program
+
+Pinfo = namedtuple('Pinfo', ['pid', 'name'])
+
+
+def get_processes(process_ids, interesting_processes, process_match, logger):
+ """
+ Find all running interesting processes.
+
+ If a list of process_ids is supplied, match on that.
+ Otherwise, do a substring match on interesting_processes.
+
+ :param process_ids: List of PIDs to match on.
+ :param interesting_processes: List of process names to match on.
+ :param process_match: String describing the process match to use.
+ :param logger: Where to log output.
+
+ :return: A list Pinfo objects for matched processes.
+ """
+ ps = _get_lister()
+
+ all_processes = ps.dump_processes(logger)
+
+ # Canonicalize the process names to lowercase to handle cases where the name of the Python
+ # process is /System/Library/.../Python on OS X and -p python is specified to the hang analyzer.
+ all_processes = [(pid, process_name.lower()) for (pid, process_name) in all_processes]
+
+ if process_ids:
+ processes = [
+ Pinfo(pid=pid, name=pname) for (pid, pname) in all_processes
+ if pid in process_ids and pid != os.getpid()
+ ]
+
+ running_pids = {pid for (pid, pname) in all_processes}
+ missing_pids = set(process_ids) - running_pids
+ if missing_pids:
+ logger.warning("The following requested process ids are not running %s",
+ list(missing_pids))
+ else:
+ processes = [
+ Pinfo(pid=pid, name=pname) for (pid, pname) in all_processes
+ if _pname_match(process_match, pname, interesting_processes) and pid != os.getpid()
+ ]
+
+ logger.info("Found %d interesting processes %s", len(processes), processes)
+ return processes
+
+
+def _get_lister():
+ """Return _ProcessList object for OS."""
+ if sys.platform.startswith("linux"):
+ ps = _LinuxProcessList()
+ elif sys.platform == "win32" or sys.platform == "cygwin":
+ ps = _WindowsProcessList()
+ elif sys.platform == "darwin":
+ ps = _DarwinProcessList()
+ else:
+ raise OSError("Hang analyzer: Unsupported platform: {}".format(sys.platform))
+
+ return ps
+
+
+class _ProcessList(object):
+ """Abstract base class for all process listers."""
+
+ def dump_processes(self, logger):
+ """
+ Find all processes.
+
+ :param logger: Where to log output.
+ :return: A list of process names.
+ """
+ raise NotImplementedError("dump_process must be implemented in OS-specific subclasses")
+
+
+class _WindowsProcessList(_ProcessList):
+ """_WindowsProcessList class."""
+
+ @staticmethod
+ def __find_ps():
+ """Find tasklist."""
+ return os.path.join(os.environ["WINDIR"], "system32", "tasklist.exe")
+
+ def dump_processes(self, logger):
+ """Get list of [Pid, Process Name]."""
+ ps = self.__find_ps()
+
+ logger.info("Getting list of processes using %s", ps)
+
+ ret = callo([ps, "/FO", "CSV"], logger)
+
+ buff = io.StringIO(ret)
+ csv_reader = csv.reader(buff)
+
+ return [[int(row[1]), row[0]] for row in csv_reader if row[1] != "PID"]
+
+
+class _DarwinProcessList(_ProcessList):
+ """_DarwinProcessList class."""
+
+ @staticmethod
+ def __find_ps():
+ """Find ps."""
+ return find_program('ps', ['/bin'])
+
+ def dump_processes(self, logger):
+ """Get list of [Pid, Process Name]."""
+ ps = self.__find_ps()
+
+ logger.info("Getting list of processes using %s", ps)
+
+ ret = callo([ps, "-axco", "pid,comm"], logger)
+
+ buff = io.StringIO(ret)
+ csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
+
+ return [[int(row[0]), row[1]] for row in csv_reader if row[0] != "PID"]
+
+
+class _LinuxProcessList(_ProcessList):
+ """_LinuxProcessList class."""
+
+ @staticmethod
+ def __find_ps():
+ """Find ps."""
+ return find_program('ps', ['/bin', '/usr/bin'])
+
+ def dump_processes(self, logger):
+ """Get list of [Pid, Process Name]."""
+ ps = self.__find_ps()
+
+ logger.info("Getting list of processes using %s", ps)
+
+ call([ps, "--version"], logger)
+
+ ret = callo([ps, "-eo", "pid,args"], logger)
+
+ buff = io.StringIO(ret)
+ csv_reader = csv.reader(buff, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
+
+ return [[int(row[0]), os.path.split(row[1])[1]] for row in csv_reader if row[0] != "PID"]
+
+
+def _pname_match(match_type, pname, interesting_processes):
+ """Return True if the pname matches an interesting_processes."""
+ pname = os.path.splitext(pname)[0]
+ for ip in interesting_processes:
+ if match_type == 'exact' and pname == ip or match_type == 'contains' and ip in pname:
+ return True
+ return False
diff --git a/buildscripts/resmokelib/parser.py b/buildscripts/resmokelib/parser.py
index 44345050704..63770525a55 100644
--- a/buildscripts/resmokelib/parser.py
+++ b/buildscripts/resmokelib/parser.py
@@ -21,6 +21,7 @@ def _make_parser():
_add_run(subparsers)
_add_list_suites(subparsers)
_add_find_suites(subparsers)
+ _add_hang_analyzer(subparsers)
return parser
@@ -409,6 +410,41 @@ def _add_find_suites(subparsers):
" configurations."))
+def _add_hang_analyzer(subparsers):
+ """Create and add the parser for the hang analyzer subcommand."""
+
+ parser = subparsers.add_parser("hang-analyzer", help=commands.hang_analyzer.__doc__)
+
+ parser.add_argument(
+ '-m', '--process-match', dest='process_match', choices=('contains', 'exact'),
+ default='contains', help="Type of match for process names (-p & -g), specify 'contains', or"
+ " 'exact'. Note that the process name match performs the following"
+ " conversions: change all process names to lowecase, strip off the file"
+ " extension, like '.exe' on Windows. Default is 'contains'.")
+ parser.add_argument('-p', '--process-names', dest='process_names',
+ help='Comma separated list of process names to analyze')
+ parser.add_argument('-g', '--go-process-names', dest='go_process_names',
+ help='Comma separated list of go process names to analyze')
+ parser.add_argument(
+ '-d', '--process-ids', dest='process_ids', default=None,
+ help='Comma separated list of process ids (PID) to analyze, overrides -p &'
+ ' -g')
+ parser.add_argument('-c', '--dump-core', dest='dump_core', action="store_true", default=False,
+ help='Dump core file for each analyzed process')
+ parser.add_argument('-s', '--max-core-dumps-size', dest='max_core_dumps_size', default=10000,
+ help='Maximum total size of core dumps to keep in megabytes')
+ parser.add_argument(
+ '-o', '--debugger-output', dest='debugger_output', action="append", choices=('file',
+ 'stdout'),
+ default=None, help="If 'stdout', then the debugger's output is written to the Python"
+ " process's stdout. If 'file', then the debugger's output is written"
+ " to a file named debugger_<process>_<pid>.log for each process it"
+ " attaches to. This option can be specified multiple times on the"
+ " command line to have the debugger's output written to multiple"
+ " locations. By default, the debugger's output is written only to the"
+ " Python process's stdout.")
+
+
# def to_local_args(args=None): # pylint: disable=too-many-branches,too-many-locals
# """
# Return a command line invocation for resmoke.py suitable for being run outside of Evergreen.
@@ -513,22 +549,19 @@ def parse_command_line(sys_args, **kwargs):
"""Parse the command line arguments passed to resmoke.py and return the subcommand object to execute."""
parser, parsed_args = _parse(sys_args)
- def create_subcommand(parser, parsed_args, **kwargs):
- """Create a subcommand object based on args passed into resmoke.py."""
-
- subcommand = parsed_args.command
- subcommand_obj = None
- if subcommand in ('find-suites', 'list-suites', 'run'):
- configure_resmoke.validate_and_update_config(parser, parsed_args)
- if _config.EVERGREEN_TASK_ID is not None:
- subcommand_obj = commands.run.TestRunnerEvg(subcommand, **kwargs)
- else:
- subcommand_obj = commands.run.TestRunner(subcommand, **kwargs)
-
- if subcommand_obj is None:
- raise RuntimeError(
- f"Resmoke configuration has invalid subcommand: {subcommand}. Try '--help'")
-
- return subcommand_obj
-
- return create_subcommand(parser, parsed_args, **kwargs)
+ subcommand = parsed_args.command
+ subcommand_obj = None
+ if subcommand in ('find-suites', 'list-suites', 'run'):
+ configure_resmoke.validate_and_update_config(parser, parsed_args)
+ if _config.EVERGREEN_TASK_ID is not None:
+ subcommand_obj = commands.run.TestRunnerEvg(subcommand, **kwargs)
+ else:
+ subcommand_obj = commands.run.TestRunner(subcommand, **kwargs)
+ elif subcommand == 'hang-analyzer':
+ subcommand_obj = commands.hang_analyzer.HangAnalyzer(parsed_args)
+
+ if subcommand_obj is None:
+ raise RuntimeError(
+ f"Resmoke configuration has invalid subcommand: {subcommand}. Try '--help'")
+
+ return subcommand_obj
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 1fd50b76467..89c22fefa5d 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -3080,10 +3080,10 @@ functions:
fi
${activate_virtualenv}
- echo "Calling the hang analyzer: PATH=\"/opt/mongodbtoolchain/gdb/bin:$PATH\" $python buildscripts/hang_analyzer.py $hang_analyzer_option"
- PATH="/opt/mongodbtoolchain/gdb/bin:$PATH" $python buildscripts/hang_analyzer.py $hang_analyzer_option
+ echo "Calling the hang analyzer: PATH=\"/opt/mongodbtoolchain/gdb/bin:$PATH\" $python buildscripts/resmoke.py hang-analyzer $hang_analyzer_option"
+ PATH="/opt/mongodbtoolchain/gdb/bin:$PATH" $python buildscripts/resmoke.py hang-analyzer $hang_analyzer_option
- # Call hang_analyzer.py script for tasks that are running remote mongo processes
+ # Call hang analyzer for tasks that are running remote mongo processes
if [ -n "${private_ip_address}" ]; then
core_ext=core
if [ "Windows_NT" = "$OS" ]; then
@@ -3118,7 +3118,7 @@ functions:
# we don't want to evaluate the local python variable, but instead pass the python string
# so the remote host will use the right python when the virtualenv is sourced.
cmds="$cmds; cd ${remote_dir}"
- cmds="$cmds; PATH=\"/opt/mongodbtoolchain/gdb/bin:\$PATH\" python buildscripts/hang_analyzer.py $hang_analyzer_option"
+ cmds="$cmds; PATH=\"/opt/mongodbtoolchain/gdb/bin:\$PATH\" python buildscripts/resmoke.py hang-analyzer $hang_analyzer_option"
$python buildscripts/remote_operations.py \
--verbose \
--userHost $USER@${private_ip_address} \