diff options
author | Jonathan Abrahams <jonathan@mongodb.com> | 2017-03-16 14:39:09 -0400 |
---|---|---|
committer | Jonathan Abrahams <jonathan@mongodb.com> | 2017-03-16 14:39:09 -0400 |
commit | 5fe822f53e4bb28e15af2541c0ca931fa05a0e20 (patch) | |
tree | 2a6645be6030f990bfd946afc14c65ae773ff68b | |
parent | 38a2af996ae67f3c65194ea38a2d6e3a64682839 (diff) | |
download | mongo-5fe822f53e4bb28e15af2541c0ca931fa05a0e20.tar.gz |
SERVER-27874 Display locks and generate digraph for threads using LockManager locks and/or pthread_mutexes
-rw-r--r-- | .gdbinit | 4 | ||||
-rw-r--r-- | buildscripts/gdb/mongo.py | 45 | ||||
-rw-r--r-- | buildscripts/gdb/mongo_lock.py | 311 | ||||
-rwxr-xr-x | buildscripts/hang_analyzer.py | 21 | ||||
-rw-r--r-- | etc/evergreen.yml | 2 |
5 files changed, 365 insertions, 18 deletions
@@ -2,5 +2,7 @@ set python print-stack full # Load the mongodb pretty printers -# source buildscripts/gdb/mongo.py + +# Load the mongodb lock analysis +source buildscripts/gdb/mongo_lock.py diff --git a/buildscripts/gdb/mongo.py b/buildscripts/gdb/mongo.py index ecf2f1469fe..ce0835c4fbd 100644 --- a/buildscripts/gdb/mongo.py +++ b/buildscripts/gdb/mongo.py @@ -4,6 +4,7 @@ from __future__ import print_function import gdb.printing import os +import re import sys try: @@ -29,6 +30,27 @@ def get_process_name(): return os.path.splitext(os.path.basename(main_binary_name))[0] +def get_thread_id(): + """Returns the thread_id of the current GDB thread""" + # GDB thread example: + # RHEL + # [Current thread is 1 (Thread 0x7f072426cca0 (LWP 12867))] + thread_info = gdb.execute("thread", from_tty=False, to_string=True) + + if sys.platform.startswith("linux"): + match = re.search(r"Thread (?P<pthread_id>0x[0-9a-f]+)", thread_info) + if match: + return int(match.group("pthread_id"), 16) + elif sys.platform.startswith("sunos"): + match = re.search(r"Thread (?P<pthread_id>[0-9]+)", thread_info) + if match: + return int(match.group("pthread_id"), 10) + lwpid = gdb.selected_thread().ptid[1] + if lwpid != 0: + return lwpid + raise ValueError("Failed to find thread id in {}".format(thread_info)) + + ################################################################################################### # # Pretty-Printers @@ -336,35 +358,34 @@ class DumpGlobalServiceContext(gdb.Command): DumpGlobalServiceContext() -class MongoDBAnalyze(gdb.Command): - """Analyze MongoDB process""" +class MongoDBDumpLocks(gdb.Command): + """Dump locks in mongod process""" def __init__(self): - register_mongo_command(self, "mongodb-analyze", gdb.COMMAND_STATUS) + register_mongo_command(self, "mongodb-dump-locks", gdb.COMMAND_DATA) def invoke(self, arg, _from_tty): print("Running Hang Analyzer Supplement") main_binary_name = get_process_name() - if main_binary_name.endswith('mongod'): - self.analyze_mongod() + if main_binary_name == 'mongod': + self.dump_mongod_locks() else: - print("No process specific analysis done for: %s" % (main_binary_name)) + print("Not invoking mongod lock dump for: %s" % (main_binary_name)) - def analyze_mongod(self): + def dump_mongod_locks(self): """GDB in-process python supplement""" try: - # Call into mongod, and dump the state of lock manager # Note that output will go to mongod's standard output, not the debugger output window gdb.execute("call ('mongo::(anonymous namespace)::globalLockManager').dump()", from_tty=False, to_string=False) except gdb.error as gdberr: - print("Ignoring error '%s'" % str(gdberr)) + print("Ignoring error '%s' in dump_mongod_locks" % str(gdberr)) # Register command -MongoDBAnalyze() +MongoDBDumpLocks() class MongoDBUniqueStack(gdb.Command): @@ -394,7 +415,7 @@ class MongoDBUniqueStack(gdb.Command): thread_info = {} # thread dict to hold per thread data thread_info['frames'] = [] # the complete backtrace per thread from gdb thread_info['functions'] = [] # list of function names from frames - thread_info['pthread'] = hex(int(gdb.parse_and_eval("(uint64_t)pthread_self()"))) + thread_info['pthread'] = hex(get_thread_id()) frame = gdb.newest_frame() while frame: @@ -480,7 +501,7 @@ class MongoDBHelp(gdb.Command): """Dump list of mongodb commands""" def __init__(self): - gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_DATA) + gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_SUPPORT) def invoke(self, arg, _from_tty): print("Command - Description") diff --git a/buildscripts/gdb/mongo_lock.py b/buildscripts/gdb/mongo_lock.py new file mode 100644 index 00000000000..5fbde9c3e8f --- /dev/null +++ b/buildscripts/gdb/mongo_lock.py @@ -0,0 +1,311 @@ +from __future__ import print_function + +import gdb +import gdb.printing +import re +import sys + +if sys.version_info[0] >= 3: + # GDB only permits converting a gdb.Value instance to its numerical address when using the + # long() constructor in Python 2 and not when using the int() constructor. We define the + # 'long' class as an alias for the 'int' class in Python 3 for compatibility. + long = int + + +class Thread(object): + def __init__(self, thread_id, lwpid): + self.thread_id = thread_id + self.lwpid = lwpid + + def __eq__(self, other): + if isinstance(other, Thread): + return self.thread_id == other.thread_id + return NotImplemented + + def __ne__(self, other): + return not self == other + + def __str__(self): + return "Thread 0x{:012x} (LWP {})".format(self.thread_id, self.lwpid) + + def key(self): + return "Thread 0x{:012x}".format(self.thread_id) + + +class Lock(object): + def __init__(self, addr, resource): + self.addr = addr + self.resource = resource + + def __eq__(self, other): + if isinstance(other, Lock): + return self.addr == other.addr + return NotImplemented + + def __ne__(self, other): + return not self == other + + def __str__(self): + return "Lock 0x{:012x} ({})".format(self.addr, self.resource) + + def key(self): + return "Lock 0x{:012x}".format(self.addr) + + +class Graph(object): + # The Graph is a dict with the following structure: + # {'node_key': {'node': {id: val}, 'next_nodes': [node_key_1, ...]}} + # Example graph: + # { + # 'Lock 1': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 1']}, + # 'Lock 2': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 2']}, + # 'Thread 1': {'node': {1: 123}, 'next_nodes': ['Lock 2']}, + # 'Thread 2': {'node': {2: 456}, 'next_nodes': ['Lock 1']} + # } + def __init__(self): + self.nodes = {} + + def add_node(self, node): + if not self.find_node(node): + self.nodes[node.key()] = {'node': node, 'next_nodes': []} + + def find_node(self, node): + if node.key() in self.nodes: + return self.nodes[node.key()] + return None + + def find_from_node(self, from_node): + for node_key in self.nodes: + node = self.nodes[node_key] + for next_node in node['next_nodes']: + if next_node == from_node['node'].key(): + return node + return None + + def remove_nodes_without_edge(self): + # Rebuild graph by removing any nodes which do not have any incoming or outgoing any edges. + temp_nodes = {} + for node_key in self.nodes: + node = self.nodes[node_key] + if node['next_nodes'] or self.find_from_node(node) is not None: + temp_nodes[node_key] = self.nodes[node_key] + self.nodes = temp_nodes + + def add_edge(self, from_node, to_node): + f = self.find_node(from_node) + if f is None: + self.add_node(from_node) + f = self.nodes[from_node.key()] + + t = self.find_node(to_node) + if t is None: + self.add_node(to_node) + t = self.nodes[to_node.key()] + + for n in f['next_nodes']: + if n == to_node.key(): + return + self.nodes[from_node.key()]['next_nodes'].append(to_node.key()) + + def print(self): + for node_key in self.nodes: + print("Node", self.nodes[node_key]['node']) + for to in self.nodes[node_key]['next_nodes']: + print(" ->", to) + + def to_graph(self): + sb = [] + sb.append('digraph "mongod+lock-status" {') + for node_key in self.nodes: + for next_node_key in self.nodes[node_key]['next_nodes']: + sb.append(' "%s" -> "%s";' % (node_key, next_node_key)) + for node_key in self.nodes: + sb.append(' "%s" [label="%s"]' % (node_key, self.nodes[node_key]['node'])) + sb.append("}") + return "\n".join(sb) + + +def find_lwpid(thread_dict, search_thread_id): + for (lwpid, thread_id) in thread_dict.items(): + if thread_id == search_thread_id: + return lwpid + return None + + +def find_func_block(block): + while block: + if block.function: + return block + block = block.superblock + return None + + +def find_frame(function_name_pattern): + frame = gdb.newest_frame() + while frame: + block = None + try: + block = frame.block() + except RuntimeError as err: + if err.args[0] != "Cannot locate block for frame.": + raise + + block = find_func_block(block) + if block and re.match(function_name_pattern, block.function.name): + return frame + frame = frame.older() + return None + + +def find_mutex_holder(graph, thread_dict, show): + frame = find_frame(r'std::mutex::lock\(\)') + if frame is None: + return + + frame.select() + + # Waiting for mutex locking! + mutex_this, _ = gdb.lookup_symbol("this", frame.block()) + mutex_value = mutex_this.value(frame) + # The mutex holder is a LWPID + mutex_holder = int(mutex_value["_M_mutex"]["__data"]["__owner"]) + mutex_holder_id = thread_dict[mutex_holder] + + (_, mutex_waiter_lwpid, _) = gdb.selected_thread().ptid + mutex_waiter_id = thread_dict[mutex_waiter_lwpid] + if show: + print("Mutex at {} held by thread 0x{:x} (LWP {}) " + " waited on by thread 0x{:x} (LWP {})".format(mutex_value, + mutex_holder_id, + mutex_holder, + mutex_waiter_id, + mutex_waiter_lwpid)) + if graph: + graph.add_edge(Thread(mutex_waiter_id, mutex_waiter_lwpid), + Lock(long(mutex_value), "Mutex")) + graph.add_edge(Lock(long(mutex_value), "Mutex"), Thread(mutex_holder_id, mutex_holder)) + + +def find_lock_manager_holders(graph, thread_dict, show): + frame = find_frame(r'mongo::LockerImpl\<.*\>::') + if not frame: + return + + frame.select() + + (_, lwpid, _) = gdb.selected_thread().ptid + + locker_ptr_type = gdb.lookup_type("mongo::LockerImpl<false>").pointer() + lock_head = gdb.parse_and_eval( + "mongo::getGlobalLockManager()->_getBucket(resId)->findOrInsert(resId)") + + grantedList = lock_head.dereference()["grantedList"] + lock_request_ptr = grantedList["_front"] + while lock_request_ptr: + lock_request = lock_request_ptr.dereference() + locker_ptr = lock_request["locker"] + locker_ptr = locker_ptr.cast(locker_ptr_type) + locker = locker_ptr.dereference() + lock_thread_id = int(locker["_threadId"]["_M_thread"]) + lock_thread_lwpid = find_lwpid(thread_dict, lock_thread_id) + if show: + print("MongoDB Lock at {} ({}) held by thread id 0x{:x} (LWP {})".format( + lock_head, lock_request["mode"], lock_thread_id, lock_thread_lwpid) + + " waited on by thread 0x{:x} (LWP {})".format(thread_dict[lwpid], lwpid)) + if graph: + graph.add_edge(Thread(thread_dict[lwpid], lwpid), Lock(long(lock_head), "MongoDB lock")) + graph.add_edge(Lock(long(lock_head), "MongoDB lock"), + Thread(lock_thread_id, lock_thread_lwpid)) + lock_request_ptr = lock_request["next"] + + +def get_locks(graph, thread_dict, show=False): + for thread in gdb.selected_inferior().threads(): + try: + if not thread.is_valid(): + continue + thread.switch() + find_mutex_holder(graph, thread_dict, show) + find_lock_manager_holders(graph, thread_dict, show) + except gdb.error as err: + print("Ignoring GDB error '%s' in get_locks" % str(err)) + + +def get_threads_info(graph=None): + thread_dict = {} + for thread in gdb.selected_inferior().threads(): + try: + if not thread.is_valid(): + continue + thread.switch() + # PTID is a tuple: Process ID (PID), Lightweight Process ID (LWPID), Thread ID (TID) + (_, lwpid, _) = thread.ptid + thread_num = thread.num + thread_id = get_thread_id() + if not thread_id: + print("Unable to retrieve thread_info for thread %d" % thread_num) + continue + thread_dict[lwpid] = thread_id + except gdb.error as err: + print("Ignoring GDB error '%s' in get_threads_info" % str(err)) + + return thread_dict + + +class MongoDBShowLocks(gdb.Command): + """Show MongoDB locks""" + def __init__(self): + register_mongo_command(self, "mongodb-show-locks", gdb.COMMAND_DATA) + + def invoke(self, arg, _from_tty): + main_binary_name = get_process_name() + if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'): + self.mongodb_show_locks() + else: + print("No show MongoDB locks done for: %s" % (main_binary_name)) + + def mongodb_show_locks(self): + """GDB in-process python supplement""" + try: + thread_dict = get_threads_info() + get_locks(graph=None, thread_dict=thread_dict, show=True) + except gdb.error as err: + print("Ignoring GDB error '%s' in mongodb_show_locks" % str(err)) + +MongoDBShowLocks() + + +class MongoDBWaitsForGraph(gdb.Command): + """Create MongoDB WaitsFor graph [graph_file]""" + def __init__(self): + register_mongo_command(self, "mongodb-waitsfor-graph", gdb.COMMAND_DATA) + + def invoke(self, arg, _from_tty): + main_binary_name = get_process_name() + if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'): + self.mongodb_waitsfor_graph(arg) + else: + print("No create MongoDB WaitsFor graph done for: %s" % (main_binary_name)) + + def mongodb_waitsfor_graph(self, file=None): + """GDB in-process python supplement""" + + graph = Graph() + try: + thread_dict = get_threads_info(graph=graph) + get_locks(graph=graph, thread_dict=thread_dict, show=False) + graph.remove_nodes_without_edge() + if file: + print("Saving digraph to %s" % file) + with open(file, 'w') as f: + f.write(graph.to_graph()) + else: + print(graph.to_graph()) + + except gdb.error as err: + print("Ignoring GDB error '%s' in mongod_deadlock_graph" % str(err)) + + +MongoDBWaitsForGraph() + +print("MongoDB Lock analysis commands loaded") diff --git a/buildscripts/hang_analyzer.py b/buildscripts/hang_analyzer.py index b1c049dcf74..ec2316082e4 100755 --- a/buildscripts/hang_analyzer.py +++ b/buildscripts/hang_analyzer.py @@ -301,8 +301,12 @@ class GDBDumper(object): root_logger.info("dir %s" % script_dir) gdb_dir = os.path.join(script_dir, "gdb") printers_script = os.path.join(gdb_dir, "mongo.py") + mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py") bt_command = "mongodb-uniqstack bt" + mongodb_show_locks = "mongodb-show-locks" + mongodb_deadlock = "mongodb-waitsfor-graph debugger_deadlock_%s_%d.gv" % \ + (process_name, pid) if sys.platform.startswith("sunos"): ''' On Solaris, currently calling mongo-uniqstack leads to an error: @@ -318,20 +322,29 @@ class GDBDumper(object): When the function is done executing, GDB will silently stop. ''' bt_command = "thread apply all bt" + mongodb_show_locks = "" + mongodb_deadlock = "" cmds = [ - "set pagination off", + "set interactive-mode off", + "set print thread-events off", # Python calls to gdb.parse_and_eval may cause threads + # to start and finish. This suppresses those messages + # from appearing in the return output. + "file %s" % process_name, # Solaris must load the process to read the symbols. "attach %d" % pid, "info sharedlibrary", "info threads", # Dump a simple list of commands to get the thread name "set python print-stack full", - "source " + printers_script, + "source %s" % printers_script, + "source %s" % mongo_lock_script, bt_command, dump_command, - "mongodb-analyze", + "mongodb-dump-locks", + mongodb_show_locks, + mongodb_deadlock, "mongodb-javascript-stack", # The mongodb-javascript-stack command executes code in # order to dump JavaScript backtraces and should therefore - # should be one of the last analysis commands. + # be one of the last analysis commands. "set confirm off", "quit", ] diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 8b62492de91..8bd122096bb 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -1116,7 +1116,7 @@ post: target: "src/mongo-hanganalyzer.tgz" source_dir: "src" include: - - "./debugger*.log" + - "./debugger*.*" - command: s3.put params: aws_key: ${aws_key} |