#!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import re from collections import defaultdict from enum import Enum import gdb class DiGraph: """ Adapted from networkx: http://networkx.github.io/ Represents a directed graph. Edges can store (key, value) attributes. """ def __init__(self): # Map of node -> set of nodes self.adjacency_map = {} # Map of (node1, node2) -> map string -> arbitrary attribute # This will not be copied in subgraph() self.attributes_map = {} def neighbors(self, node): return self.adjacency_map.get(node, set()) def edges(self): edges = [] for node, neighbors in self.adjacency_map.items(): for neighbor in neighbors: edges.append((node, neighbor)) return edges def nodes(self): return self.adjacency_map.keys() def attributes(self, node1, node2): return self.attributes_map[(node1, node2)] def add_edge(self, node1, node2, **kwargs): if node1 not in self.adjacency_map: self.adjacency_map[node1] = set() if node2 not in self.adjacency_map: self.adjacency_map[node2] = set() self.adjacency_map[node1].add(node2) self.attributes_map[(node1, node2)] = kwargs def remove_node(self, node): self.adjacency_map.pop(node, None) for _, neighbors in self.adjacency_map.items(): neighbors.discard(node) def subgraph(self, nodes): graph = DiGraph() for node in nodes: for neighbor in self.neighbors(node): if neighbor in nodes: graph.add_edge(node, neighbor) return graph def node_link_data(self): """ Returns the graph as a dictionary in a format that can be serialized. """ data = { "directed": True, "multigraph": False, "graph": {}, "links": [], "nodes": [], } # Do one pass to build a map of node -> position in nodes node_to_number = {} for node in self.adjacency_map.keys(): node_to_number[node] = len(data["nodes"]) data["nodes"].append({"id": node}) # Do another pass to build the link information for node, neighbors in self.adjacency_map.items(): for neighbor in neighbors: link = self.attributes_map[(node, neighbor)].copy() link["source"] = node_to_number[node] link["target"] = node_to_number[neighbor] data["links"].append(link) return data def strongly_connected_components(G): # noqa: C901 """ Adapted from networkx: http://networkx.github.io/ Parameters ---------- G : DiGraph Returns ------- comp : generator of sets A generator of sets of nodes, one for each strongly connected component of G. """ preorder = {} lowlink = {} scc_found = {} scc_queue = [] i = 0 # Preorder counter for source in G.nodes(): if source not in scc_found: queue = [source] while queue: v = queue[-1] if v not in preorder: i = i + 1 preorder[v] = i done = 1 v_nbrs = G.neighbors(v) for w in v_nbrs: if w not in preorder: queue.append(w) done = 0 break if done == 1: lowlink[v] = preorder[v] for w in v_nbrs: if w not in scc_found: if preorder[w] > preorder[v]: lowlink[v] = min([lowlink[v], lowlink[w]]) else: lowlink[v] = min([lowlink[v], preorder[w]]) queue.pop() if lowlink[v] == preorder[v]: scc_found[v] = True scc = {v} while scc_queue and preorder[scc_queue[-1]] > preorder[v]: k = scc_queue.pop() scc_found[k] = True scc.add(k) yield scc else: scc_queue.append(v) def simple_cycles(G): # noqa: C901 """ Adapted from networkx: http://networkx.github.io/ Parameters ---------- G : DiGraph Returns ------- cycle_generator: generator A generator that produces elementary cycles of the graph. Each cycle is represented by a list of nodes along the cycle. """ def _unblock(thisnode, blocked, B): stack = {thisnode} while stack: node = stack.pop() if node in blocked: blocked.remove(node) stack.update(B[node]) B[node].clear() # Johnson's algorithm requires some ordering of the nodes. # We assign the arbitrary ordering given by the strongly connected comps # There is no need to track the ordering as each node removed as processed. # save the actual graph so we can mutate it here # We only take the edges because we do not want to # copy edge and node attributes here. subG = G.subgraph(G.nodes()) sccs = list(strongly_connected_components(subG)) while sccs: scc = sccs.pop() # order of scc determines ordering of nodes startnode = scc.pop() # Processing node runs 'circuit' routine from recursive version path = [startnode] blocked = set() # vertex: blocked from search? closed = set() # nodes involved in a cycle blocked.add(startnode) B = defaultdict(set) # graph portions that yield no elementary circuit stack = [(startnode, list(subG.neighbors(startnode)))] while stack: thisnode, nbrs = stack[-1] if nbrs: nextnode = nbrs.pop() if nextnode == startnode: yield path[:] closed.update(path) elif nextnode not in blocked: path.append(nextnode) stack.append((nextnode, list(subG.neighbors(nextnode)))) closed.discard(nextnode) blocked.add(nextnode) continue # done with nextnode... look for more neighbors if not nbrs: # no more nbrs if thisnode in closed: _unblock(thisnode, blocked, B) else: for nbr in subG.neighbors(thisnode): if thisnode not in B[nbr]: B[nbr].add(thisnode) stack.pop() path.pop() # done processing this node subG.remove_node(startnode) H = subG.subgraph(scc) # make smaller to avoid work in SCC routine sccs.extend(list(strongly_connected_components(H))) def find_cycle(graph): """ Looks for a cycle in the graph. If found, returns the first cycle. If nodes a1, a2, ..., an are in a cycle, then this returns: [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)] Otherwise returns an empty list. """ cycles = list(simple_cycles(graph)) if cycles: nodes = cycles[0] nodes.append(nodes[0]) edges = [] prev = nodes[0] for node in nodes[1:]: edges.append((prev, node)) prev = node return edges else: return [] def get_stacktrace(thread_id): """ Returns the stack trace for the thread id as a list of strings. """ gdb.execute("thread %d" % thread_id, from_tty=False, to_string=True) output = gdb.execute("bt", from_tty=False, to_string=True) stacktrace_lines = output.strip().split("\n") return stacktrace_lines def is_thread_blocked_with_frame( thread_id, top_line, expected_top_lines, expected_frame ): """ Returns True if we found expected_top_line in top_line, and we found the expected_frame in the thread's stack trace. """ if all(expected not in top_line for expected in expected_top_lines): return False stacktrace_lines = get_stacktrace(thread_id) return any(expected_frame in line for line in stacktrace_lines) class MutexType(Enum): """Types of mutexes that we can detect deadlocks.""" PTHREAD_MUTEX_T = "pthread_mutex_t" PTHREAD_RWLOCK_T = "pthread_rwlock_t" @staticmethod def get_mutex_type(thread_id, top_line): """ Returns the probable mutex type, based on the first line of the thread's stack. Returns None if not found. """ WAITLIST = [ "__lll_lock_wait", "futex_abstimed_wait", "futex_abstimed_wait_cancelable", "futex_reltimed_wait", "futex_reltimed_wait_cancelable", "futex_wait", "futex_wait_cancelable", ] if is_thread_blocked_with_frame(thread_id, top_line, WAITLIST, "pthread_mutex"): return MutexType.PTHREAD_MUTEX_T if is_thread_blocked_with_frame( thread_id, top_line, WAITLIST, "pthread_rwlock" ): return MutexType.PTHREAD_RWLOCK_T return None @staticmethod def get_mutex_owner_and_address_func_for_type(mutex_type): """ Returns a function to resolve the mutex owner and address for the given type. The returned function f has the following signature: f: args: (map of thread lwp -> thread id), blocked thread lwp returns: (lwp of thread owning mutex, mutex address) or (None, None) if not found. Returns None if there is no function for this mutex_type. """ if mutex_type == MutexType.PTHREAD_MUTEX_T: return get_pthread_mutex_t_owner_and_address if mutex_type == MutexType.PTHREAD_RWLOCK_T: return get_pthread_rwlock_t_owner_and_address return None def print_cycle(graph, lwp_to_thread_id, cycle): """Prints the threads and mutexes involved in the deadlock.""" for m, n in cycle: print( "Thread %d (LWP %d) is waiting on %s (0x%016x) held by " "Thread %d (LWP %d)" % ( lwp_to_thread_id[m], m, graph.attributes(m, n)["mutex_type"].value, graph.attributes(m, n)["mutex"], lwp_to_thread_id[n], n, ) ) def get_thread_info(): """ Returns a pair of: - map of LWP -> thread ID - map of blocked threads LWP -> potential mutex type """ # LWP -> thread ID lwp_to_thread_id = {} # LWP -> potential mutex type it is blocked on blocked_threads = {} output = gdb.execute("info threads", from_tty=False, to_string=True) lines = output.strip().split("\n")[1:] regex = re.compile(r"[\s\*]*(\d+).*Thread.*\(LWP (\d+)\).*") for line in lines: try: thread_id = int(regex.match(line).group(1)) thread_lwp = int(regex.match(line).group(2)) lwp_to_thread_id[thread_lwp] = thread_id mutex_type = MutexType.get_mutex_type(thread_id, line) if mutex_type: blocked_threads[thread_lwp] = mutex_type except Exception: continue return (lwp_to_thread_id, blocked_threads) def get_pthread_mutex_t_owner_and_address(lwp_to_thread_id, thread_lwp): """ Finds the thread holding the mutex that this thread is blocked on. Returns a pair of (lwp of thread owning mutex, mutex address), or (None, None) if not found. """ # Go up the stack to the pthread_mutex_lock frame gdb.execute( "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True ) gdb.execute("frame 1", from_tty=False, to_string=True) # Get the owner of the mutex by inspecting the internal # fields of the mutex. try: mutex_info = gdb.parse_and_eval("mutex").dereference() mutex_owner_lwp = int(mutex_info["__data"]["__owner"]) return (mutex_owner_lwp, int(mutex_info.address)) except gdb.error: return (None, None) def get_pthread_rwlock_t_owner_and_address(lwp_to_thread_id, thread_lwp): """ If the thread is waiting on a write-locked pthread_rwlock_t, this will return the pair of: (lwp of thread that is write-owning the mutex, mutex address) or (None, None) if not found, or if the mutex is read-locked. """ # Go up the stack to the pthread_rwlock_{rd|wr}lock frame gdb.execute( "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True ) gdb.execute("frame 2", from_tty=False, to_string=True) # Get the owner of the mutex by inspecting the internal # fields of the mutex. try: rwlock_info = gdb.parse_and_eval("rwlock").dereference() rwlock_data = rwlock_info["__data"] field_names = ["__cur_writer", "__writer"] fields = rwlock_data.type.fields() field = [f for f in fields if f.name in field_names][0] rwlock_owner_lwp = int(rwlock_data[field]) # We can only track the owner if it is currently write-locked. # If it is not write-locked or if it is currently read-locked, # possibly by multiple threads, we cannot find the owner. if rwlock_owner_lwp != 0: return (rwlock_owner_lwp, int(rwlock_info.address)) else: return (None, None) except gdb.error: return (None, None) class Deadlock(gdb.Command): """Detects deadlocks""" def __init__(self): super(Deadlock, self).__init__("deadlock", gdb.COMMAND_NONE) def invoke(self, arg, from_tty): """Prints the threads and mutexes in a deadlock, if it exists.""" lwp_to_thread_id, blocked_threads = get_thread_info() # Nodes represent threads. Edge (A,B) exists if thread A # is waiting on a mutex held by thread B. graph = DiGraph() # Go through all the blocked threads and see which threads # they are blocked on, and build the thread wait graph. for thread_lwp, mutex_type in blocked_threads.items(): get_owner_and_address_func = ( MutexType.get_mutex_owner_and_address_func_for_type(mutex_type) ) if not get_owner_and_address_func: continue mutex_owner_lwp, mutex_address = get_owner_and_address_func( lwp_to_thread_id, thread_lwp ) if mutex_owner_lwp and mutex_address: graph.add_edge( thread_lwp, mutex_owner_lwp, mutex=mutex_address, mutex_type=mutex_type, ) # A deadlock exists if there is a cycle in the graph. cycle = find_cycle(graph) if cycle: print("Found deadlock!") print_cycle(graph, lwp_to_thread_id, cycle) else: print("No deadlock detected. " "Do you have debug symbols installed?") def load(): # instantiate the Deadlock command Deadlock() print('Type "deadlock" to detect deadlocks.') def info(): return "Detect deadlocks" if __name__ == "__main__": load()