diff options
author | Mikhail Shchatko <mikhail.shchatko@mongodb.com> | 2021-06-04 17:00:56 +0300 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-13 19:34:01 +0000 |
commit | 4b2608fc881f8443c0945a060129b92282674f3d (patch) | |
tree | 4574be5bf3b6a463a54f614cf7161f07d3e0cb69 | |
parent | 13ed17c8e001c07ab84233ebb48f5c3f18499833 (diff) | |
download | mongo-4b2608fc881f8443c0945a060129b92282674f3d.tar.gz |
SERVER-56167 Guarantee hang analyzer collects core dumps for sharded clusters
(cherry picked from commit 940116b555f9b5e624b13d4465ebfa83a00ee049)
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/dumper.py | 159 | ||||
-rw-r--r-- | buildscripts/resmokelib/hang_analyzer/extractor.py | 5 | ||||
-rwxr-xr-x | buildscripts/resmokelib/hang_analyzer/hang_analyzer.py | 51 |
3 files changed, 120 insertions, 95 deletions
diff --git a/buildscripts/resmokelib/hang_analyzer/dumper.py b/buildscripts/resmokelib/hang_analyzer/dumper.py index fb9dbef9969..f1e619e2917 100644 --- a/buildscripts/resmokelib/hang_analyzer/dumper.py +++ b/buildscripts/resmokelib/hang_analyzer/dumper.py @@ -144,7 +144,6 @@ class WindowsDumper(Dumper): """Return the commands that attach to each process, dump info and detach.""" assert isinstance(pinfo.pidv, int) - dump_command = "" if take_dump: # Dump to file, dump_<process name>.<pid>.mdmp dump_file = "dump_%s.%d.%s" % (os.path.splitext(pinfo.name)[0], pinfo.pidv, @@ -152,14 +151,18 @@ class WindowsDumper(Dumper): dump_command = ".dump /ma %s" % dump_file self._root_logger.info("Dumping core to %s", dump_file) - cmds = [ - "!peb", # Dump current exe, & environment variables - "lm", # Dump loaded modules - dump_command, - "!uniqstack -pn", # Dump All unique Threads with function arguments - "!cs -l", # Dump all locked critical sections - ".detach", # Detach - ] + cmds = [ + dump_command, + ".detach", # Detach + ] + else: + cmds = [ + "!peb", # Dump current exe, & environment variables + "lm", # Dump loaded modules + "!uniqstack -pn", # Dump All unique Threads with function arguments + "!cs -l", # Dump all locked critical sections + ".detach", # Detach + ] return cmds @@ -216,24 +219,32 @@ class LLDBDumper(Dumper): def _process_specific(self, pinfo, take_dump, logger=None): """Return the commands that attach to each process, dump info and detach.""" cmds = [] - dump_files = self._dump_files(pinfo) - for pid in pinfo.pidv: - dump_command = "" - if take_dump: + + if take_dump: + dump_files = self._dump_files(pinfo) + for pid in pinfo.pidv: # Dump to file, dump_<process name>.<pid>.core dump_file = dump_files[pid] dump_command = "process save-core %s" % dump_file self._root_logger.info("Dumping core to %s", dump_file) - cmds += [ - "platform shell kill -CONT %d" % pid, - "attach -p %d" % pid, - "target modules list", - "thread backtrace all", - dump_command, - "process detach", - "platform shell kill -STOP %d" % pid, - ] + cmds += [ + "platform shell kill -CONT %d" % pid, + "attach -p %d" % pid, + dump_command, + "process detach", + "platform shell kill -STOP %d" % pid, + ] + else: + for pid in pinfo.pidv: + cmds += [ + "platform shell kill -CONT %d" % pid, + "attach -p %d" % pid, + "target modules list", + "thread backtrace all", + "process detach", + "platform shell kill -STOP %d" % pid, + ] return cmds @@ -349,60 +360,68 @@ class GDBDumper(Dumper): def _process_specific( # pylint: disable=too-many-locals self, pinfo, take_dump, logger=None): """Return the commands that attach to each process, dump info and detach.""" - mongodb_dump_locks = "mongodb-dump-locks" - mongodb_show_locks = "mongodb-show-locks" - mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active" - mongodb_javascript_stack = "mongodb-javascript-stack" - mongod_dump_sessions = "mongod-dump-sessions" - mongodb_dump_mutexes = "mongodb-dump-mutexes" - mongodb_dump_recovery_units = "mongodb-dump-recovery-units" - cmds = [] - for pid in pinfo.pidv: - if not logger.mongo_process_filename: - raw_stacks_commands = [] - else: - base, ext = os.path.splitext(logger.mongo_process_filename) - raw_stacks_filename = "%s_%d_raw_stacks%s" % (base, pid, ext) - raw_stacks_commands = [ - 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename, - # This sends output to log file rather than stdout until we turn logging off. - 'set logging redirect on', - 'set logging file ' + raw_stacks_filename, - 'set logging on', - 'thread apply all bt', - 'set logging off', - ] - dump_command = "" - if take_dump: + if take_dump: + for pid in pinfo.pidv: # Dump to file, dump_<process name>.<pid>.core dump_file = "dump_%s.%d.%s" % (pinfo.name, pid, self.get_dump_ext()) dump_command = "gcore %s" % dump_file self._root_logger.info("Dumping core to %s", dump_file) - - mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \ - (pinfo.name, pid) - - cmds += [ - "attach %d" % pid, - "handle SIGSTOP ignore noprint", - "info sharedlibrary", - "info threads", # Dump a simple list of commands to get the thread name - ] + raw_stacks_commands + [ - mongodb_uniqstack, - # Lock the scheduler, before running commands, which execute code in the attached process. - "set scheduler-locking on", - dump_command, - mongodb_dump_locks, - mongodb_show_locks, - mongodb_waitsfor_graph, - mongodb_javascript_stack, - mongod_dump_sessions, - mongodb_dump_mutexes, - mongodb_dump_recovery_units, - "detach", - ] + cmds += [ + "attach %d" % pid, + "handle SIGSTOP ignore noprint", + # Lock the scheduler, before running commands, which execute code in the attached process. + "set scheduler-locking on", + dump_command, + "detach", + ] + else: + mongodb_dump_locks = "mongodb-dump-locks" + mongodb_show_locks = "mongodb-show-locks" + mongodb_uniqstack = "mongodb-uniqstack mongodb-bt-if-active" + mongodb_javascript_stack = "mongodb-javascript-stack" + mongod_dump_sessions = "mongod-dump-sessions" + mongodb_dump_mutexes = "mongodb-dump-mutexes" + mongodb_dump_recovery_units = "mongodb-dump-recovery-units" + + for pid in pinfo.pidv: + if not logger.mongo_process_filename: + raw_stacks_commands = [] + else: + base, ext = os.path.splitext(logger.mongo_process_filename) + raw_stacks_filename = "%s_%d_raw_stacks%s" % (base, pid, ext) + raw_stacks_commands = [ + 'echo \\nWriting raw stacks to %s.\\n' % raw_stacks_filename, + # This sends output to log file rather than stdout until we turn logging off. + 'set logging redirect on', + 'set logging file ' + raw_stacks_filename, + 'set logging on', + 'thread apply all bt', + 'set logging off', + ] + + mongodb_waitsfor_graph = "mongodb-waitsfor-graph debugger_waitsfor_%s_%d.gv" % \ + (pinfo.name, pid) + + cmds += [ + "attach %d" % pid, + "handle SIGSTOP ignore noprint", + "info sharedlibrary", + "info threads", # Dump a simple list of commands to get the thread name + ] + raw_stacks_commands + [ + mongodb_uniqstack, + # Lock the scheduler, before running commands, which execute code in the attached process. + "set scheduler-locking on", + mongodb_dump_locks, + mongodb_show_locks, + mongodb_waitsfor_graph, + mongodb_javascript_stack, + mongod_dump_sessions, + mongodb_dump_mutexes, + mongodb_dump_recovery_units, + "detach", + ] return cmds diff --git a/buildscripts/resmokelib/hang_analyzer/extractor.py b/buildscripts/resmokelib/hang_analyzer/extractor.py index cdb7206e8dc..01e1fd54fb5 100644 --- a/buildscripts/resmokelib/hang_analyzer/extractor.py +++ b/buildscripts/resmokelib/hang_analyzer/extractor.py @@ -53,7 +53,10 @@ def _extract_tar(root_logger): if os.path.exists(dest): root_logger.debug('Debug symbol %s already exists, not copying from %s.', dest, src) continue - shutil.copy(src, dest) + if os.path.isdir(src): + shutil.copytree(src, dest) + else: + shutil.copy(src, dest) root_logger.debug('Copied debug symbol %s.', dest) return sym_files diff --git a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py index dbf2ca23bc9..39a505be44b 100755 --- a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py +++ b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py @@ -9,7 +9,6 @@ A prototype hang analyzer for Evergreen integration to help investigate test tim Supports Linux, MacOS X, and Windows. """ -import glob import logging import os import platform @@ -75,8 +74,6 @@ class HangAnalyzer(Subcommand): processes = process_list.get_processes(self.process_ids, self.interesting_processes, self.options.process_match, self.root_logger) - max_dump_size_bytes = int(self.options.max_core_dumps_size) * 1024 * 1024 - # Suspending all processes, except python, to prevent them from getting unstuck when # the hang analyzer attaches to them. for pinfo in [pinfo for pinfo in processes if not pinfo.name.startswith("python")]: @@ -92,15 +89,29 @@ class HangAnalyzer(Subcommand): trapped_exceptions = [] dump_pids = {} - # Dump all processes, except python & java. + # Dump core files of all processes, except python & java. + if self.options.dump_core: + for pinfo in [ + pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name) + ]: + if self._check_enough_free_space(): + try: + dumpers.dbg.dump_info(pinfo, take_dump=True) + except dumper.DumpError as err: + self.root_logger.error(err.message) + dump_pids = {**err.dump_pids, **dump_pids} + except Exception as err: # pylint: disable=broad-except + self.root_logger.info("Error encountered when invoking debugger %s", err) + trapped_exceptions.append(traceback.format_exc()) + else: + self.root_logger.info( + "Not enough space for a core dump, skipping %s processes with PIDs %s", + pinfo.name, str(pinfo.pidv)) + + # Dump info of all processes, except python & java. for pinfo in [pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)]: try: - dumpers.dbg.dump_info( - pinfo, self.options.dump_core - and _check_dump_quota(max_dump_size_bytes, dumpers.dbg.get_dump_ext())) - except dumper.DumpError as err: - self.root_logger.error(err.message) - dump_pids = {**err.dump_pids, **dump_pids} + dumpers.dbg.dump_info(pinfo, take_dump=False) except Exception as err: # pylint: disable=broad-except self.root_logger.info("Error encountered when invoking debugger %s", err) trapped_exceptions.append(traceback.format_exc()) @@ -200,17 +211,10 @@ class HangAnalyzer(Subcommand): self.root_logger.warning( "Cannot determine Unix Current Login, not supported on Windows") - -def _check_dump_quota(quota, ext): - """Check if sum of the files with ext is within the specified quota in megabytes.""" - - files = glob.glob("*." + ext) - - size_sum = 0 - for file_name in files: - size_sum += os.path.getsize(file_name) - - return size_sum <= quota + def _check_enough_free_space(self): + usage_percent = psutil.disk_usage(".").percent + self.root_logger.info("Current disk usage percent: %s", usage_percent) + return usage_percent < self.options.max_disk_usage_percent class HangAnalyzerPlugin(PluginInterface): @@ -243,9 +247,8 @@ class HangAnalyzerPlugin(PluginInterface): ' -g') parser.add_argument('-c', '--dump-core', dest='dump_core', action="store_true", default=False, help='Dump core file for each analyzed process') - parser.add_argument('-s', '--max-core-dumps-size', dest='max_core_dumps_size', - default=10000, - help='Maximum total size of core dumps to keep in megabytes') + parser.add_argument('-s', '--max-disk-usage-percent', dest='max_disk_usage_percent', + default=90, help='Maximum disk usage percent for a core dump') parser.add_argument( '-o', '--debugger-output', dest='debugger_output', action="append", choices=('file', 'stdout'), |