123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474 |
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import re
- from collections import defaultdict
- from enum import Enum
- import gdb
- class DiGraph:
- """
- Adapted from networkx: http://networkx.github.io/
- Represents a directed graph. Edges can store (key, value) attributes.
- """
- def __init__(self):
- # Map of node -> set of nodes
- self.adjacency_map = {}
- # Map of (node1, node2) -> map string -> arbitrary attribute
- # This will not be copied in subgraph()
- self.attributes_map = {}
- def neighbors(self, node):
- return self.adjacency_map.get(node, set())
- def edges(self):
- edges = []
- for node, neighbors in self.adjacency_map.items():
- for neighbor in neighbors:
- edges.append((node, neighbor))
- return edges
- def nodes(self):
- return self.adjacency_map.keys()
- def attributes(self, node1, node2):
- return self.attributes_map[(node1, node2)]
- def add_edge(self, node1, node2, **kwargs):
- if node1 not in self.adjacency_map:
- self.adjacency_map[node1] = set()
- if node2 not in self.adjacency_map:
- self.adjacency_map[node2] = set()
- self.adjacency_map[node1].add(node2)
- self.attributes_map[(node1, node2)] = kwargs
- def remove_node(self, node):
- self.adjacency_map.pop(node, None)
- for _, neighbors in self.adjacency_map.items():
- neighbors.discard(node)
- def subgraph(self, nodes):
- graph = DiGraph()
- for node in nodes:
- for neighbor in self.neighbors(node):
- if neighbor in nodes:
- graph.add_edge(node, neighbor)
- return graph
- def node_link_data(self):
- """
- Returns the graph as a dictionary in a format that can be
- serialized.
- """
- data = {
- "directed": True,
- "multigraph": False,
- "graph": {},
- "links": [],
- "nodes": [],
- }
- # Do one pass to build a map of node -> position in nodes
- node_to_number = {}
- for node in self.adjacency_map.keys():
- node_to_number[node] = len(data["nodes"])
- data["nodes"].append({"id": node})
- # Do another pass to build the link information
- for node, neighbors in self.adjacency_map.items():
- for neighbor in neighbors:
- link = self.attributes_map[(node, neighbor)].copy()
- link["source"] = node_to_number[node]
- link["target"] = node_to_number[neighbor]
- data["links"].append(link)
- return data
- def strongly_connected_components(G): # noqa: C901
- """
- Adapted from networkx: http://networkx.github.io/
- Parameters
- ----------
- G : DiGraph
- Returns
- -------
- comp : generator of sets
- A generator of sets of nodes, one for each strongly connected
- component of G.
- """
- preorder = {}
- lowlink = {}
- scc_found = {}
- scc_queue = []
- i = 0 # Preorder counter
- for source in G.nodes():
- if source not in scc_found:
- queue = [source]
- while queue:
- v = queue[-1]
- if v not in preorder:
- i = i + 1
- preorder[v] = i
- done = 1
- v_nbrs = G.neighbors(v)
- for w in v_nbrs:
- if w not in preorder:
- queue.append(w)
- done = 0
- break
- if done == 1:
- lowlink[v] = preorder[v]
- for w in v_nbrs:
- if w not in scc_found:
- if preorder[w] > preorder[v]:
- lowlink[v] = min([lowlink[v], lowlink[w]])
- else:
- lowlink[v] = min([lowlink[v], preorder[w]])
- queue.pop()
- if lowlink[v] == preorder[v]:
- scc_found[v] = True
- scc = {v}
- while scc_queue and preorder[scc_queue[-1]] > preorder[v]:
- k = scc_queue.pop()
- scc_found[k] = True
- scc.add(k)
- yield scc
- else:
- scc_queue.append(v)
- def simple_cycles(G): # noqa: C901
- """
- Adapted from networkx: http://networkx.github.io/
- Parameters
- ----------
- G : DiGraph
- Returns
- -------
- cycle_generator: generator
- A generator that produces elementary cycles of the graph.
- Each cycle is represented by a list of nodes along the cycle.
- """
- def _unblock(thisnode, blocked, B):
- stack = {thisnode}
- while stack:
- node = stack.pop()
- if node in blocked:
- blocked.remove(node)
- stack.update(B[node])
- B[node].clear()
- # Johnson's algorithm requires some ordering of the nodes.
- # We assign the arbitrary ordering given by the strongly connected comps
- # There is no need to track the ordering as each node removed as processed.
- # save the actual graph so we can mutate it here
- # We only take the edges because we do not want to
- # copy edge and node attributes here.
- subG = G.subgraph(G.nodes())
- sccs = list(strongly_connected_components(subG))
- while sccs:
- scc = sccs.pop()
- # order of scc determines ordering of nodes
- startnode = scc.pop()
- # Processing node runs 'circuit' routine from recursive version
- path = [startnode]
- blocked = set() # vertex: blocked from search?
- closed = set() # nodes involved in a cycle
- blocked.add(startnode)
- B = defaultdict(set) # graph portions that yield no elementary circuit
- stack = [(startnode, list(subG.neighbors(startnode)))]
- while stack:
- thisnode, nbrs = stack[-1]
- if nbrs:
- nextnode = nbrs.pop()
- if nextnode == startnode:
- yield path[:]
- closed.update(path)
- elif nextnode not in blocked:
- path.append(nextnode)
- stack.append((nextnode, list(subG.neighbors(nextnode))))
- closed.discard(nextnode)
- blocked.add(nextnode)
- continue
- # done with nextnode... look for more neighbors
- if not nbrs: # no more nbrs
- if thisnode in closed:
- _unblock(thisnode, blocked, B)
- else:
- for nbr in subG.neighbors(thisnode):
- if thisnode not in B[nbr]:
- B[nbr].add(thisnode)
- stack.pop()
- path.pop()
- # done processing this node
- subG.remove_node(startnode)
- H = subG.subgraph(scc) # make smaller to avoid work in SCC routine
- sccs.extend(list(strongly_connected_components(H)))
- def find_cycle(graph):
- """
- Looks for a cycle in the graph. If found, returns the first cycle.
- If nodes a1, a2, ..., an are in a cycle, then this returns:
- [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
- Otherwise returns an empty list.
- """
- cycles = list(simple_cycles(graph))
- if cycles:
- nodes = cycles[0]
- nodes.append(nodes[0])
- edges = []
- prev = nodes[0]
- for node in nodes[1:]:
- edges.append((prev, node))
- prev = node
- return edges
- else:
- return []
- def get_stacktrace(thread_id):
- """
- Returns the stack trace for the thread id as a list of strings.
- """
- gdb.execute("thread %d" % thread_id, from_tty=False, to_string=True)
- output = gdb.execute("bt", from_tty=False, to_string=True)
- stacktrace_lines = output.strip().split("\n")
- return stacktrace_lines
- def is_thread_blocked_with_frame(
- thread_id, top_line, expected_top_lines, expected_frame
- ):
- """
- Returns True if we found expected_top_line in top_line, and
- we found the expected_frame in the thread's stack trace.
- """
- if all(expected not in top_line for expected in expected_top_lines):
- return False
- stacktrace_lines = get_stacktrace(thread_id)
- return any(expected_frame in line for line in stacktrace_lines)
- class MutexType(Enum):
- """Types of mutexes that we can detect deadlocks."""
- PTHREAD_MUTEX_T = "pthread_mutex_t"
- PTHREAD_RWLOCK_T = "pthread_rwlock_t"
- @staticmethod
- def get_mutex_type(thread_id, top_line):
- """
- Returns the probable mutex type, based on the first line
- of the thread's stack. Returns None if not found.
- """
- WAITLIST = [
- "__lll_lock_wait",
- "futex_abstimed_wait",
- "futex_abstimed_wait_cancelable",
- "futex_reltimed_wait",
- "futex_reltimed_wait_cancelable",
- "futex_wait",
- "futex_wait_cancelable",
- ]
- if is_thread_blocked_with_frame(thread_id, top_line, WAITLIST, "pthread_mutex"):
- return MutexType.PTHREAD_MUTEX_T
- if is_thread_blocked_with_frame(
- thread_id, top_line, WAITLIST, "pthread_rwlock"
- ):
- return MutexType.PTHREAD_RWLOCK_T
- return None
- @staticmethod
- def get_mutex_owner_and_address_func_for_type(mutex_type):
- """
- Returns a function to resolve the mutex owner and address for
- the given type. The returned function f has the following
- signature:
- f: args: (map of thread lwp -> thread id), blocked thread lwp
- returns: (lwp of thread owning mutex, mutex address)
- or (None, None) if not found.
- Returns None if there is no function for this mutex_type.
- """
- if mutex_type == MutexType.PTHREAD_MUTEX_T:
- return get_pthread_mutex_t_owner_and_address
- if mutex_type == MutexType.PTHREAD_RWLOCK_T:
- return get_pthread_rwlock_t_owner_and_address
- return None
- def print_cycle(graph, lwp_to_thread_id, cycle):
- """Prints the threads and mutexes involved in the deadlock."""
- for m, n in cycle:
- print(
- "Thread %d (LWP %d) is waiting on %s (0x%016x) held by "
- "Thread %d (LWP %d)"
- % (
- lwp_to_thread_id[m],
- m,
- graph.attributes(m, n)["mutex_type"].value,
- graph.attributes(m, n)["mutex"],
- lwp_to_thread_id[n],
- n,
- )
- )
- def get_thread_info():
- """
- Returns a pair of:
- - map of LWP -> thread ID
- - map of blocked threads LWP -> potential mutex type
- """
- # LWP -> thread ID
- lwp_to_thread_id = {}
- # LWP -> potential mutex type it is blocked on
- blocked_threads = {}
- output = gdb.execute("info threads", from_tty=False, to_string=True)
- lines = output.strip().split("\n")[1:]
- regex = re.compile(r"[\s\*]*(\d+).*Thread.*\(LWP (\d+)\).*")
- for line in lines:
- try:
- thread_id = int(regex.match(line).group(1))
- thread_lwp = int(regex.match(line).group(2))
- lwp_to_thread_id[thread_lwp] = thread_id
- mutex_type = MutexType.get_mutex_type(thread_id, line)
- if mutex_type:
- blocked_threads[thread_lwp] = mutex_type
- except Exception:
- continue
- return (lwp_to_thread_id, blocked_threads)
- def get_pthread_mutex_t_owner_and_address(lwp_to_thread_id, thread_lwp):
- """
- Finds the thread holding the mutex that this thread is blocked on.
- Returns a pair of (lwp of thread owning mutex, mutex address),
- or (None, None) if not found.
- """
- # Go up the stack to the pthread_mutex_lock frame
- gdb.execute(
- "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
- )
- gdb.execute("frame 1", from_tty=False, to_string=True)
- # Get the owner of the mutex by inspecting the internal
- # fields of the mutex.
- try:
- mutex_info = gdb.parse_and_eval("mutex").dereference()
- mutex_owner_lwp = int(mutex_info["__data"]["__owner"])
- return (mutex_owner_lwp, int(mutex_info.address))
- except gdb.error:
- return (None, None)
- def get_pthread_rwlock_t_owner_and_address(lwp_to_thread_id, thread_lwp):
- """
- If the thread is waiting on a write-locked pthread_rwlock_t, this will
- return the pair of:
- (lwp of thread that is write-owning the mutex, mutex address)
- or (None, None) if not found, or if the mutex is read-locked.
- """
- # Go up the stack to the pthread_rwlock_{rd|wr}lock frame
- gdb.execute(
- "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
- )
- gdb.execute("frame 2", from_tty=False, to_string=True)
- # Get the owner of the mutex by inspecting the internal
- # fields of the mutex.
- try:
- rwlock_info = gdb.parse_and_eval("rwlock").dereference()
- rwlock_data = rwlock_info["__data"]
- field_names = ["__cur_writer", "__writer"]
- fields = rwlock_data.type.fields()
- field = [f for f in fields if f.name in field_names][0]
- rwlock_owner_lwp = int(rwlock_data[field])
- # We can only track the owner if it is currently write-locked.
- # If it is not write-locked or if it is currently read-locked,
- # possibly by multiple threads, we cannot find the owner.
- if rwlock_owner_lwp != 0:
- return (rwlock_owner_lwp, int(rwlock_info.address))
- else:
- return (None, None)
- except gdb.error:
- return (None, None)
- class Deadlock(gdb.Command):
- """Detects deadlocks"""
- def __init__(self):
- super(Deadlock, self).__init__("deadlock", gdb.COMMAND_NONE)
- def invoke(self, arg, from_tty):
- """Prints the threads and mutexes in a deadlock, if it exists."""
- lwp_to_thread_id, blocked_threads = get_thread_info()
- # Nodes represent threads. Edge (A,B) exists if thread A
- # is waiting on a mutex held by thread B.
- graph = DiGraph()
- # Go through all the blocked threads and see which threads
- # they are blocked on, and build the thread wait graph.
- for thread_lwp, mutex_type in blocked_threads.items():
- get_owner_and_address_func = (
- MutexType.get_mutex_owner_and_address_func_for_type(mutex_type)
- )
- if not get_owner_and_address_func:
- continue
- mutex_owner_lwp, mutex_address = get_owner_and_address_func(
- lwp_to_thread_id, thread_lwp
- )
- if mutex_owner_lwp and mutex_address:
- graph.add_edge(
- thread_lwp,
- mutex_owner_lwp,
- mutex=mutex_address,
- mutex_type=mutex_type,
- )
- # A deadlock exists if there is a cycle in the graph.
- cycle = find_cycle(graph)
- if cycle:
- print("Found deadlock!")
- print_cycle(graph, lwp_to_thread_id, cycle)
- else:
- print("No deadlock detected. " "Do you have debug symbols installed?")
- def load():
- # instantiate the Deadlock command
- Deadlock()
- print('Type "deadlock" to detect deadlocks.')
- def info():
- return "Detect deadlocks"
- if __name__ == "__main__":
- load()
|