summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Mikhail Shchatko <mikhail.shchatko@mongodb.com> 2021-06-04 17:00:56 +0300
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-13 19:34:01 +0000
commit 4b2608fc881f8443c0945a060129b92282674f3d (patch)
tree 4574be5bf3b6a463a54f614cf7161f07d3e0cb69
parent 13ed17c8e001c07ab84233ebb48f5c3f18499833 (diff)
download mongo-4b2608fc881f8443c0945a060129b92282674f3d.tar.gz
SERVER-56167 Guarantee hang analyzer collects core dumps for sharded clusters
(cherry picked from commit 940116b555f9b5e624b13d4465ebfa83a00ee049)
-rw-r--r-- buildscripts/resmokelib/hang_analyzer/dumper.py 159
-rw-r--r-- buildscripts/resmokelib/hang_analyzer/extractor.py 5
-rwxr-xr-x buildscripts/resmokelib/hang_analyzer/hang_analyzer.py 51
3 files changed, 120 insertions, 95 deletions
diff --git a/buildscripts/resmokelib/hang_analyzer/dumper.py b/buildscripts/resmokelib/hang_analyzer/dumper.py
index fb9dbef9969..f1e619e2917 100644
--- a/buildscripts/resmokelib/hang_analyzer/dumper.py
+++ b/buildscripts/resmokelib/hang_analyzer/dumper.py
@@ -144,7 +144,6 @@ class WindowsDumper(Dumper):
"""Return the commands that attach to each process, dump info and detach."""
assert isinstance(pinfo.pidv, int)
- dump_command = ""
if take_dump:
# Dump to file, dump_<process name>.<pid>.mdmp
dump_file = "dump_%s.%d.%s" % (os.path.splitext(pinfo.name)[0], pinfo.pidv,
@@ -152,14 +151,18 @@ class WindowsDumper(Dumper):
dump_command = ".dump /ma %s" % dump_file
self._root_logger.info("Dumping core to %s", dump_file)
- cmds = [
- "!peb", # Dump current exe, & environment variables
- "lm", # Dump loaded modules
- dump_command,
- "!uniqstack -pn", # Dump All unique Threads with function arguments
- "!cs -l", # Dump all locked critical sections
- ".detach", # Detach
- ]
+ cmds = [
+ dump_command,
+ ".detach", # Detach
+ ]
+ else:
+ cmds = [
+ "!peb", # Dump current exe, & environment variables
+ "lm", # Dump loaded modules
+ "!uniqstack -pn", # Dump All unique Threads with function arguments
+ "!cs -l", # Dump all locked critical sections
+ ".detach", # Detach
+ ]
return cmds
@@ -216,24 +219,32 @@ class LLDBDumper(Dumper):
def _process_specific(self, pinfo, take_dump, logger=None):
"""Return the commands that attach to each process, dump info and detach."""
cmds = []
- dump_files = self._dump_files(pinfo)
- for pid in pinfo.pidv:
- dump_command = ""
- if take_dump:
+
+ if take_dump:
+ dump_files = self._dump_files(pinfo)
+ for pid in pinfo.pidv:
# Dump to file, dump_<process name>.<pid>.core
dump_file = dump_files[pid]
dump_command = "process save-core %s" % dump_file
self._root_logger.info("Dumping core to %s", dump_file)
- cmds += [
- "platform shell kill -CONT %d" % pid,
- "attach -p %d" % pid,
- "target modules list",
- "thread backtrace all",
- dump_command,
- "process detach",
- "platform shell kill -STOP %d" % pid,
- ]
+ cmds += [
+ "platform shell kill -CONT %d" % pid,
+ "attach -p %d" % pid,
+ dump_command,
+ "process detach",
+ "platform shell kill -STOP %d" % pid,
+ ]
+ else:
+ for pid in pinfo.pidv:
+ cmds += [
+ "platform shell kill -CONT %d" % pid,
+ "attach -p %d" % pid,
+ "target modules list",
+ "thread backtrace all",
+ "process detach",
+ "platform shell kill -STOP %d" % pid,
+ ]
return cmds
@@ -349,60 +360,68 @@ class GDBDumper(Dumper):
def _process_specific( # pylint: disable=too-many-locals
self, pinfo, take_dump, logger=None):
"""Return the commands that attach to each process, dump info and detach."""
- mongodb_dump_locks = "mongodb-dump-locks"
- mongodb_show_locks = "mongodb-show-locks"
- mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active"
- mongodb_javascript_stack = "mongodb-javascript-stack"
- mongod_dump_sessions = "mongod-dump-sessions"
- mongodb_dump_mutexes = "mongodb-dump-mutexes"
- mongodb_dump_recovery_units = "mongodb-dump-recovery-units"
-
cmds = []
- for pid in pinfo.pidv:
- if not logger.mongo_process_filename:
- raw_stacks_commands = []
- else:
- base, ext = os.path.splitext(logger.mongo_process_filename)
- raw_stacks_filename = "%s_%d_raw_stacks%s" % (base, pid, ext)
- raw_stacks_commands = [
- 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename,
- # This sends output to log file rather than stdout until we turn logging off.
- 'set logging redirect on',
- 'set logging file ' + raw_stacks_filename,
- 'set logging on',
- 'thread apply all bt',
- 'set logging off',
- ]
- dump_command = ""
- if take_dump:
+ if take_dump:
+ for pid in pinfo.pidv:
# Dump to file, dump_<process name>.<pid>.core
dump_file = "dump_%s.%d.%s" % (pinfo.name, pid, self.get_dump_ext())
dump_command = "gcore %s" % dump_file
self._root_logger.info("Dumping core to %s", dump_file)
-
- mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \
- (pinfo.name, pid)
-
- cmds += [
- "attach %d" % pid,
- "handle SIGSTOP ignore noprint",
- "info sharedlibrary",
- "info threads", # Dump a simple list of commands to get the thread name
- ] + raw_stacks_commands + [
- mongodb_uniqstack,
- # Lock the scheduler, before running commands, which execute code in the attached process.
- "set scheduler-locking on",
- dump_command,
- mongodb_dump_locks,
- mongodb_show_locks,
- mongodb_waitsfor_graph,
- mongodb_javascript_stack,
- mongod_dump_sessions,
- mongodb_dump_mutexes,
- mongodb_dump_recovery_units,
- "detach",
- ]
+ cmds += [
+ "attach %d" % pid,
+ "handle SIGSTOP ignore noprint",
+ # Lock the scheduler, before running commands, which execute code in the attached process.
+ "set scheduler-locking on",
+ dump_command,
+ "detach",
+ ]
+ else:
+ mongodb_dump_locks = "mongodb-dump-locks"
+ mongodb_show_locks = "mongodb-show-locks"
+ mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active"
+ mongodb_javascript_stack = "mongodb-javascript-stack"
+ mongod_dump_sessions = "mongod-dump-sessions"
+ mongodb_dump_mutexes = "mongodb-dump-mutexes"
+ mongodb_dump_recovery_units = "mongodb-dump-recovery-units"
+
+ for pid in pinfo.pidv:
+ if not logger.mongo_process_filename:
+ raw_stacks_commands = []
+ else:
+ base, ext = os.path.splitext(logger.mongo_process_filename)
+ raw_stacks_filename = "%s_%d_raw_stacks%s" % (base, pid, ext)
+ raw_stacks_commands = [
+ 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename,
+ # This sends output to log file rather than stdout until we turn logging off.
+ 'set logging redirect on',
+ 'set logging file ' + raw_stacks_filename,
+ 'set logging on',
+ 'thread apply all bt',
+ 'set logging off',
+ ]
+
+ mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \
+ (pinfo.name, pid)
+
+ cmds += [
+ "attach %d" % pid,
+ "handle SIGSTOP ignore noprint",
+ "info sharedlibrary",
+ "info threads", # Dump a simple list of commands to get the thread name
+ ] + raw_stacks_commands + [
+ mongodb_uniqstack,
+ # Lock the scheduler, before running commands, which execute code in the attached process.
+ "set scheduler-locking on",
+ mongodb_dump_locks,
+ mongodb_show_locks,
+ mongodb_waitsfor_graph,
+ mongodb_javascript_stack,
+ mongod_dump_sessions,
+ mongodb_dump_mutexes,
+ mongodb_dump_recovery_units,
+ "detach",
+ ]
return cmds
diff --git a/buildscripts/resmokelib/hang_analyzer/extractor.py b/buildscripts/resmokelib/hang_analyzer/extractor.py
index cdb7206e8dc..01e1fd54fb5 100644
--- a/buildscripts/resmokelib/hang_analyzer/extractor.py
+++ b/buildscripts/resmokelib/hang_analyzer/extractor.py
@@ -53,7 +53,10 @@ def _extract_tar(root_logger):
if os.path.exists(dest):
root_logger.debug('Debug symbol %s already exists, not copying from %s.', dest, src)
continue
- shutil.copy(src, dest)
+ if os.path.isdir(src):
+ shutil.copytree(src, dest)
+ else:
+ shutil.copy(src, dest)
root_logger.debug('Copied debug symbol %s.', dest)
return sym_files
diff --git a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
index dbf2ca23bc9..39a505be44b 100755
--- a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
+++ b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
@@ -9,7 +9,6 @@ A prototype hang analyzer for Evergreen integration to help investigate test tim
Supports Linux, MacOS X, and Windows.
"""
-import glob
import logging
import os
import platform
@@ -75,8 +74,6 @@ class HangAnalyzer(Subcommand):
processes = process_list.get_processes(self.process_ids, self.interesting_processes,
self.options.process_match, self.root_logger)
- max_dump_size_bytes = int(self.options.max_core_dumps_size) * 1024 * 1024
-
# Suspending all processes, except python, to prevent them from getting unstuck when
# the hang analyzer attaches to them.
for pinfo in [pinfo for pinfo in processes if not pinfo.name.startswith("python")]:
@@ -92,15 +89,29 @@ class HangAnalyzer(Subcommand):
trapped_exceptions = []
dump_pids = {}
- # Dump all processes, except python & java.
+ # Dump core files of all processes, except python & java.
+ if self.options.dump_core:
+ for pinfo in [
+ pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)
+ ]:
+ if self._check_enough_free_space():
+ try:
+ dumpers.dbg.dump_info(pinfo, take_dump=True)
+ except dumper.DumpError as err:
+ self.root_logger.error(err.message)
+ dump_pids = {**err.dump_pids, **dump_pids}
+ except Exception as err: # pylint: disable=broad-except
+ self.root_logger.info("Error encountered when invoking debugger %s", err)
+ trapped_exceptions.append(traceback.format_exc())
+ else:
+ self.root_logger.info(
+ "Not enough space for a core dump, skipping %s processes with PIDs %s",
+ pinfo.name, str(pinfo.pidv))
+
+ # Dump info of all processes, except python & java.
for pinfo in [pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)]:
try:
- dumpers.dbg.dump_info(
- pinfo, self.options.dump_core
- and _check_dump_quota(max_dump_size_bytes, dumpers.dbg.get_dump_ext()))
- except dumper.DumpError as err:
- self.root_logger.error(err.message)
- dump_pids = {**err.dump_pids, **dump_pids}
+ dumpers.dbg.dump_info(pinfo, take_dump=False)
except Exception as err: # pylint: disable=broad-except
self.root_logger.info("Error encountered when invoking debugger %s", err)
trapped_exceptions.append(traceback.format_exc())
@@ -200,17 +211,10 @@ class HangAnalyzer(Subcommand):
self.root_logger.warning(
"Cannot determine Unix Current Login, not supported on Windows")
-
-def _check_dump_quota(quota, ext):
- """Check if sum of the files with ext is within the specified quota in megabytes."""
-
- files = glob.glob("*." + ext)
-
- size_sum = 0
- for file_name in files:
- size_sum += os.path.getsize(file_name)
-
- return size_sum <= quota
+ def _check_enough_free_space(self):
+ usage_percent = psutil.disk_usage(".").percent
+ self.root_logger.info("Current disk usage percent: %s", usage_percent)
+ return usage_percent < self.options.max_disk_usage_percent
class HangAnalyzerPlugin(PluginInterface):
@@ -243,9 +247,8 @@ class HangAnalyzerPlugin(PluginInterface):
' -g')
parser.add_argument('-c', '--dump-core', dest='dump_core', action="store_true",
default=False, help='Dump core file for each analyzed process')
- parser.add_argument('-s', '--max-core-dumps-size', dest='max_core_dumps_size',
- default=10000,
- help='Maximum total size of core dumps to keep in megabytes')
+ parser.add_argument('-s', '--max-disk-usage-percent', dest='max_disk_usage_percent',
+ default=90, help='Maximum disk usage percent for a core dump')
parser.add_argument(
'-o', '--debugger-output', dest='debugger_output', action="append", choices=('file',
'stdout'),