summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJonathan Abrahams <jonathan@mongodb.com>2017-03-16 14:39:09 -0400
committerJonathan Abrahams <jonathan@mongodb.com>2017-03-16 14:39:09 -0400
commit5fe822f53e4bb28e15af2541c0ca931fa05a0e20 (patch)
tree2a6645be6030f990bfd946afc14c65ae773ff68b
parent38a2af996ae67f3c65194ea38a2d6e3a64682839 (diff)
downloadmongo-5fe822f53e4bb28e15af2541c0ca931fa05a0e20.tar.gz
SERVER-27874 Display locks and generate digraph for threads using LockManager locks and/or pthread_mutexes
-rw-r--r--.gdbinit4
-rw-r--r--buildscripts/gdb/mongo.py45
-rw-r--r--buildscripts/gdb/mongo_lock.py311
-rwxr-xr-xbuildscripts/hang_analyzer.py21
-rw-r--r--etc/evergreen.yml2
5 files changed, 365 insertions, 18 deletions
diff --git a/.gdbinit b/.gdbinit
index d82d81f02ba..66d56149410 100644
--- a/.gdbinit
+++ b/.gdbinit
@@ -2,5 +2,7 @@
set python print-stack full
# Load the mongodb pretty printers
-#
source buildscripts/gdb/mongo.py
+
+# Load the mongodb lock analysis
+source buildscripts/gdb/mongo_lock.py
diff --git a/buildscripts/gdb/mongo.py b/buildscripts/gdb/mongo.py
index ecf2f1469fe..ce0835c4fbd 100644
--- a/buildscripts/gdb/mongo.py
+++ b/buildscripts/gdb/mongo.py
@@ -4,6 +4,7 @@ from __future__ import print_function
import gdb.printing
import os
+import re
import sys
try:
@@ -29,6 +30,27 @@ def get_process_name():
return os.path.splitext(os.path.basename(main_binary_name))[0]
+def get_thread_id():
+ """Returns the thread_id of the current GDB thread"""
+ # GDB thread example:
+ # RHEL
+ # [Current thread is 1 (Thread 0x7f072426cca0 (LWP 12867))]
+ thread_info = gdb.execute("thread", from_tty=False, to_string=True)
+
+ if sys.platform.startswith("linux"):
+ match = re.search(r"Thread (?P<pthread_id>0x[0-9a-f]+)", thread_info)
+ if match:
+ return int(match.group("pthread_id"), 16)
+ elif sys.platform.startswith("sunos"):
+ match = re.search(r"Thread (?P<pthread_id>[0-9]+)", thread_info)
+ if match:
+ return int(match.group("pthread_id"), 10)
+ lwpid = gdb.selected_thread().ptid[1]
+ if lwpid != 0:
+ return lwpid
+ raise ValueError("Failed to find thread id in {}".format(thread_info))
+
+
###################################################################################################
#
# Pretty-Printers
@@ -336,35 +358,34 @@ class DumpGlobalServiceContext(gdb.Command):
DumpGlobalServiceContext()
-class MongoDBAnalyze(gdb.Command):
- """Analyze MongoDB process"""
+class MongoDBDumpLocks(gdb.Command):
+ """Dump locks in mongod process"""
def __init__(self):
- register_mongo_command(self, "mongodb-analyze", gdb.COMMAND_STATUS)
+ register_mongo_command(self, "mongodb-dump-locks", gdb.COMMAND_DATA)
def invoke(self, arg, _from_tty):
print("Running Hang Analyzer Supplement")
main_binary_name = get_process_name()
- if main_binary_name.endswith('mongod'):
- self.analyze_mongod()
+ if main_binary_name == 'mongod':
+ self.dump_mongod_locks()
else:
- print("No process specific analysis done for: %s" % (main_binary_name))
+ print("Not invoking mongod lock dump for: %s" % (main_binary_name))
- def analyze_mongod(self):
+ def dump_mongod_locks(self):
"""GDB in-process python supplement"""
try:
-
# Call into mongod, and dump the state of lock manager
# Note that output will go to mongod's standard output, not the debugger output window
gdb.execute("call ('mongo::(anonymous namespace)::globalLockManager').dump()",
from_tty=False, to_string=False)
except gdb.error as gdberr:
- print("Ignoring error '%s'" % str(gdberr))
+ print("Ignoring error '%s' in dump_mongod_locks" % str(gdberr))
# Register command
-MongoDBAnalyze()
+MongoDBDumpLocks()
class MongoDBUniqueStack(gdb.Command):
@@ -394,7 +415,7 @@ class MongoDBUniqueStack(gdb.Command):
thread_info = {} # thread dict to hold per thread data
thread_info['frames'] = [] # the complete backtrace per thread from gdb
thread_info['functions'] = [] # list of function names from frames
- thread_info['pthread'] = hex(int(gdb.parse_and_eval("(uint64_t)pthread_self()")))
+ thread_info['pthread'] = hex(get_thread_id())
frame = gdb.newest_frame()
while frame:
@@ -480,7 +501,7 @@ class MongoDBHelp(gdb.Command):
"""Dump list of mongodb commands"""
def __init__(self):
- gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_DATA)
+ gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_SUPPORT)
def invoke(self, arg, _from_tty):
print("Command - Description")
diff --git a/buildscripts/gdb/mongo_lock.py b/buildscripts/gdb/mongo_lock.py
new file mode 100644
index 00000000000..5fbde9c3e8f
--- /dev/null
+++ b/buildscripts/gdb/mongo_lock.py
@@ -0,0 +1,311 @@
+from __future__ import print_function
+
+import gdb
+import gdb.printing
+import re
+import sys
+
+if sys.version_info[0] >= 3:
+ # GDB only permits converting a gdb.Value instance to its numerical address when using the
+ # long() constructor in Python 2 and not when using the int() constructor. We define the
+ # 'long' class as an alias for the 'int' class in Python 3 for compatibility.
+ long = int
+
+
+class Thread(object):
+ def __init__(self, thread_id, lwpid):
+ self.thread_id = thread_id
+ self.lwpid = lwpid
+
+ def __eq__(self, other):
+ if isinstance(other, Thread):
+ return self.thread_id == other.thread_id
+ return NotImplemented
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __str__(self):
+ return "Thread 0x{:012x} (LWP {})".format(self.thread_id, self.lwpid)
+
+ def key(self):
+ return "Thread 0x{:012x}".format(self.thread_id)
+
+
+class Lock(object):
+ def __init__(self, addr, resource):
+ self.addr = addr
+ self.resource = resource
+
+ def __eq__(self, other):
+ if isinstance(other, Lock):
+ return self.addr == other.addr
+ return NotImplemented
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __str__(self):
+ return "Lock 0x{:012x} ({})".format(self.addr, self.resource)
+
+ def key(self):
+ return "Lock 0x{:012x}".format(self.addr)
+
+
+class Graph(object):
+ # The Graph is a dict with the following structure:
+ # {'node_key': {'node': {id: val}, 'next_nodes': [node_key_1, ...]}}
+ # Example graph:
+ # {
+ # 'Lock 1': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 1']},
+ # 'Lock 2': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 2']},
+ # 'Thread 1': {'node': {1: 123}, 'next_nodes': ['Lock 2']},
+ # 'Thread 2': {'node': {2: 456}, 'next_nodes': ['Lock 1']}
+ # }
+ def __init__(self):
+ self.nodes = {}
+
+ def add_node(self, node):
+ if not self.find_node(node):
+ self.nodes[node.key()] = {'node': node, 'next_nodes': []}
+
+ def find_node(self, node):
+ if node.key() in self.nodes:
+ return self.nodes[node.key()]
+ return None
+
+ def find_from_node(self, from_node):
+ for node_key in self.nodes:
+ node = self.nodes[node_key]
+ for next_node in node['next_nodes']:
+ if next_node == from_node['node'].key():
+ return node
+ return None
+
+ def remove_nodes_without_edge(self):
+ # Rebuild graph by removing any nodes which do not have any incoming or outgoing any edges.
+ temp_nodes = {}
+ for node_key in self.nodes:
+ node = self.nodes[node_key]
+ if node['next_nodes'] or self.find_from_node(node) is not None:
+ temp_nodes[node_key] = self.nodes[node_key]
+ self.nodes = temp_nodes
+
+ def add_edge(self, from_node, to_node):
+ f = self.find_node(from_node)
+ if f is None:
+ self.add_node(from_node)
+ f = self.nodes[from_node.key()]
+
+ t = self.find_node(to_node)
+ if t is None:
+ self.add_node(to_node)
+ t = self.nodes[to_node.key()]
+
+ for n in f['next_nodes']:
+ if n == to_node.key():
+ return
+ self.nodes[from_node.key()]['next_nodes'].append(to_node.key())
+
+ def print(self):
+ for node_key in self.nodes:
+ print("Node", self.nodes[node_key]['node'])
+ for to in self.nodes[node_key]['next_nodes']:
+ print(" ->", to)
+
+ def to_graph(self):
+ sb = []
+ sb.append('digraph "mongod+lock-status" {')
+ for node_key in self.nodes:
+ for next_node_key in self.nodes[node_key]['next_nodes']:
+ sb.append(' "%s" -> "%s";' % (node_key, next_node_key))
+ for node_key in self.nodes:
+ sb.append(' "%s" [label="%s"]' % (node_key, self.nodes[node_key]['node']))
+ sb.append("}")
+ return "\n".join(sb)
+
+
+def find_lwpid(thread_dict, search_thread_id):
+ for (lwpid, thread_id) in thread_dict.items():
+ if thread_id == search_thread_id:
+ return lwpid
+ return None
+
+
+def find_func_block(block):
+ while block:
+ if block.function:
+ return block
+ block = block.superblock
+ return None
+
+
+def find_frame(function_name_pattern):
+ frame = gdb.newest_frame()
+ while frame:
+ block = None
+ try:
+ block = frame.block()
+ except RuntimeError as err:
+ if err.args[0] != "Cannot locate block for frame.":
+ raise
+
+ block = find_func_block(block)
+ if block and re.match(function_name_pattern, block.function.name):
+ return frame
+ frame = frame.older()
+ return None
+
+
+def find_mutex_holder(graph, thread_dict, show):
+ frame = find_frame(r'std::mutex::lock\(\)')
+ if frame is None:
+ return
+
+ frame.select()
+
+ # Waiting for mutex locking!
+ mutex_this, _ = gdb.lookup_symbol("this", frame.block())
+ mutex_value = mutex_this.value(frame)
+ # The mutex holder is a LWPID
+ mutex_holder = int(mutex_value["_M_mutex"]["__data"]["__owner"])
+ mutex_holder_id = thread_dict[mutex_holder]
+
+ (_, mutex_waiter_lwpid, _) = gdb.selected_thread().ptid
+ mutex_waiter_id = thread_dict[mutex_waiter_lwpid]
+ if show:
+ print("Mutex at {} held by thread 0x{:x} (LWP {}) "
+ " waited on by thread 0x{:x} (LWP {})".format(mutex_value,
+ mutex_holder_id,
+ mutex_holder,
+ mutex_waiter_id,
+ mutex_waiter_lwpid))
+ if graph:
+ graph.add_edge(Thread(mutex_waiter_id, mutex_waiter_lwpid),
+ Lock(long(mutex_value), "Mutex"))
+ graph.add_edge(Lock(long(mutex_value), "Mutex"), Thread(mutex_holder_id, mutex_holder))
+
+
+def find_lock_manager_holders(graph, thread_dict, show):
+ frame = find_frame(r'mongo::LockerImpl\<.*\>::')
+ if not frame:
+ return
+
+ frame.select()
+
+ (_, lwpid, _) = gdb.selected_thread().ptid
+
+ locker_ptr_type = gdb.lookup_type("mongo::LockerImpl<false>").pointer()
+ lock_head = gdb.parse_and_eval(
+ "mongo::getGlobalLockManager()->_getBucket(resId)->findOrInsert(resId)")
+
+ grantedList = lock_head.dereference()["grantedList"]
+ lock_request_ptr = grantedList["_front"]
+ while lock_request_ptr:
+ lock_request = lock_request_ptr.dereference()
+ locker_ptr = lock_request["locker"]
+ locker_ptr = locker_ptr.cast(locker_ptr_type)
+ locker = locker_ptr.dereference()
+ lock_thread_id = int(locker["_threadId"]["_M_thread"])
+ lock_thread_lwpid = find_lwpid(thread_dict, lock_thread_id)
+ if show:
+ print("MongoDB Lock at {} ({}) held by thread id 0x{:x} (LWP {})".format(
+ lock_head, lock_request["mode"], lock_thread_id, lock_thread_lwpid) +
+ " waited on by thread 0x{:x} (LWP {})".format(thread_dict[lwpid], lwpid))
+ if graph:
+ graph.add_edge(Thread(thread_dict[lwpid], lwpid), Lock(long(lock_head), "MongoDB lock"))
+ graph.add_edge(Lock(long(lock_head), "MongoDB lock"),
+ Thread(lock_thread_id, lock_thread_lwpid))
+ lock_request_ptr = lock_request["next"]
+
+
+def get_locks(graph, thread_dict, show=False):
+ for thread in gdb.selected_inferior().threads():
+ try:
+ if not thread.is_valid():
+ continue
+ thread.switch()
+ find_mutex_holder(graph, thread_dict, show)
+ find_lock_manager_holders(graph, thread_dict, show)
+ except gdb.error as err:
+ print("Ignoring GDB error '%s' in get_locks" % str(err))
+
+
+def get_threads_info(graph=None):
+ thread_dict = {}
+ for thread in gdb.selected_inferior().threads():
+ try:
+ if not thread.is_valid():
+ continue
+ thread.switch()
+ # PTID is a tuple: Process ID (PID), Lightweight Process ID (LWPID), Thread ID (TID)
+ (_, lwpid, _) = thread.ptid
+ thread_num = thread.num
+ thread_id = get_thread_id()
+ if not thread_id:
+ print("Unable to retrieve thread_info for thread %d" % thread_num)
+ continue
+ thread_dict[lwpid] = thread_id
+ except gdb.error as err:
+ print("Ignoring GDB error '%s' in get_threads_info" % str(err))
+
+ return thread_dict
+
+
+class MongoDBShowLocks(gdb.Command):
+ """Show MongoDB locks"""
+ def __init__(self):
+ register_mongo_command(self, "mongodb-show-locks", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, _from_tty):
+ main_binary_name = get_process_name()
+ if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'):
+ self.mongodb_show_locks()
+ else:
+ print("No show MongoDB locks done for: %s" % (main_binary_name))
+
+ def mongodb_show_locks(self):
+ """GDB in-process python supplement"""
+ try:
+ thread_dict = get_threads_info()
+ get_locks(graph=None, thread_dict=thread_dict, show=True)
+ except gdb.error as err:
+ print("Ignoring GDB error '%s' in mongodb_show_locks" % str(err))
+
+MongoDBShowLocks()
+
+
+class MongoDBWaitsForGraph(gdb.Command):
+ """Create MongoDB WaitsFor graph [graph_file]"""
+ def __init__(self):
+ register_mongo_command(self, "mongodb-waitsfor-graph", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, _from_tty):
+ main_binary_name = get_process_name()
+ if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'):
+ self.mongodb_waitsfor_graph(arg)
+ else:
+ print("No create MongoDB WaitsFor graph done for: %s" % (main_binary_name))
+
+ def mongodb_waitsfor_graph(self, file=None):
+ """GDB in-process python supplement"""
+
+ graph = Graph()
+ try:
+ thread_dict = get_threads_info(graph=graph)
+ get_locks(graph=graph, thread_dict=thread_dict, show=False)
+ graph.remove_nodes_without_edge()
+ if file:
+ print("Saving digraph to %s" % file)
+ with open(file, 'w') as f:
+ f.write(graph.to_graph())
+ else:
+ print(graph.to_graph())
+
+ except gdb.error as err:
+ print("Ignoring GDB error '%s' in mongod_deadlock_graph" % str(err))
+
+
+MongoDBWaitsForGraph()
+
+print("MongoDB Lock analysis commands loaded")
diff --git a/buildscripts/hang_analyzer.py b/buildscripts/hang_analyzer.py
index b1c049dcf74..ec2316082e4 100755
--- a/buildscripts/hang_analyzer.py
+++ b/buildscripts/hang_analyzer.py
@@ -301,8 +301,12 @@ class GDBDumper(object):
root_logger.info("dir %s" % script_dir)
gdb_dir = os.path.join(script_dir, "gdb")
printers_script = os.path.join(gdb_dir, "mongo.py")
+ mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py")
bt_command = "mongodb-uniqstack bt"
+ mongodb_show_locks = "mongodb-show-locks"
+ mongodb_deadlock = "mongodb-waitsfor-graph debugger_deadlock_%s_%d.gv" % \
+ (process_name, pid)
if sys.platform.startswith("sunos"):
'''
On Solaris, currently calling mongo-uniqstack leads to an error:
@@ -318,20 +322,29 @@ class GDBDumper(object):
When the function is done executing, GDB will silently stop.
'''
bt_command = "thread apply all bt"
+ mongodb_show_locks = ""
+ mongodb_deadlock = ""
cmds = [
- "set pagination off",
+ "set interactive-mode off",
+ "set print thread-events off", # Python calls to gdb.parse_and_eval may cause threads
+ # to start and finish. This suppresses those messages
+ # from appearing in the return output.
+ "file %s" % process_name, # Solaris must load the process to read the symbols.
"attach %d" % pid,
"info sharedlibrary",
"info threads", # Dump a simple list of commands to get the thread name
"set python print-stack full",
- "source " + printers_script,
+ "source %s" % printers_script,
+ "source %s" % mongo_lock_script,
bt_command,
dump_command,
- "mongodb-analyze",
+ "mongodb-dump-locks",
+ mongodb_show_locks,
+ mongodb_deadlock,
"mongodb-javascript-stack", # The mongodb-javascript-stack command executes code in
# order to dump JavaScript backtraces and should therefore
- # should be one of the last analysis commands.
+ # be one of the last analysis commands.
"set confirm off",
"quit",
]
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 8b62492de91..8bd122096bb 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -1116,7 +1116,7 @@ post:
target: "src/mongo-hanganalyzer.tgz"
source_dir: "src"
include:
- - "./debugger*.log"
+ - "./debugger*.*"
- command: s3.put
params:
aws_key: ${aws_key}