deadlock.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474
  1. #!/usr/bin/env python3
  2. # Copyright (c) Meta Platforms, Inc. and affiliates.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import re
  16. from collections import defaultdict
  17. from enum import Enum
  18. import gdb
  19. class DiGraph:
  20. """
  21. Adapted from networkx: http://networkx.github.io/
  22. Represents a directed graph. Edges can store (key, value) attributes.
  23. """
  24. def __init__(self):
  25. # Map of node -> set of nodes
  26. self.adjacency_map = {}
  27. # Map of (node1, node2) -> map string -> arbitrary attribute
  28. # This will not be copied in subgraph()
  29. self.attributes_map = {}
  30. def neighbors(self, node):
  31. return self.adjacency_map.get(node, set())
  32. def edges(self):
  33. edges = []
  34. for node, neighbors in self.adjacency_map.items():
  35. for neighbor in neighbors:
  36. edges.append((node, neighbor))
  37. return edges
  38. def nodes(self):
  39. return self.adjacency_map.keys()
  40. def attributes(self, node1, node2):
  41. return self.attributes_map[(node1, node2)]
  42. def add_edge(self, node1, node2, **kwargs):
  43. if node1 not in self.adjacency_map:
  44. self.adjacency_map[node1] = set()
  45. if node2 not in self.adjacency_map:
  46. self.adjacency_map[node2] = set()
  47. self.adjacency_map[node1].add(node2)
  48. self.attributes_map[(node1, node2)] = kwargs
  49. def remove_node(self, node):
  50. self.adjacency_map.pop(node, None)
  51. for _, neighbors in self.adjacency_map.items():
  52. neighbors.discard(node)
  53. def subgraph(self, nodes):
  54. graph = DiGraph()
  55. for node in nodes:
  56. for neighbor in self.neighbors(node):
  57. if neighbor in nodes:
  58. graph.add_edge(node, neighbor)
  59. return graph
  60. def node_link_data(self):
  61. """
  62. Returns the graph as a dictionary in a format that can be
  63. serialized.
  64. """
  65. data = {
  66. "directed": True,
  67. "multigraph": False,
  68. "graph": {},
  69. "links": [],
  70. "nodes": [],
  71. }
  72. # Do one pass to build a map of node -> position in nodes
  73. node_to_number = {}
  74. for node in self.adjacency_map.keys():
  75. node_to_number[node] = len(data["nodes"])
  76. data["nodes"].append({"id": node})
  77. # Do another pass to build the link information
  78. for node, neighbors in self.adjacency_map.items():
  79. for neighbor in neighbors:
  80. link = self.attributes_map[(node, neighbor)].copy()
  81. link["source"] = node_to_number[node]
  82. link["target"] = node_to_number[neighbor]
  83. data["links"].append(link)
  84. return data
  85. def strongly_connected_components(G): # noqa: C901
  86. """
  87. Adapted from networkx: http://networkx.github.io/
  88. Parameters
  89. ----------
  90. G : DiGraph
  91. Returns
  92. -------
  93. comp : generator of sets
  94. A generator of sets of nodes, one for each strongly connected
  95. component of G.
  96. """
  97. preorder = {}
  98. lowlink = {}
  99. scc_found = {}
  100. scc_queue = []
  101. i = 0 # Preorder counter
  102. for source in G.nodes():
  103. if source not in scc_found:
  104. queue = [source]
  105. while queue:
  106. v = queue[-1]
  107. if v not in preorder:
  108. i = i + 1
  109. preorder[v] = i
  110. done = 1
  111. v_nbrs = G.neighbors(v)
  112. for w in v_nbrs:
  113. if w not in preorder:
  114. queue.append(w)
  115. done = 0
  116. break
  117. if done == 1:
  118. lowlink[v] = preorder[v]
  119. for w in v_nbrs:
  120. if w not in scc_found:
  121. if preorder[w] > preorder[v]:
  122. lowlink[v] = min([lowlink[v], lowlink[w]])
  123. else:
  124. lowlink[v] = min([lowlink[v], preorder[w]])
  125. queue.pop()
  126. if lowlink[v] == preorder[v]:
  127. scc_found[v] = True
  128. scc = {v}
  129. while scc_queue and preorder[scc_queue[-1]] > preorder[v]:
  130. k = scc_queue.pop()
  131. scc_found[k] = True
  132. scc.add(k)
  133. yield scc
  134. else:
  135. scc_queue.append(v)
  136. def simple_cycles(G): # noqa: C901
  137. """
  138. Adapted from networkx: http://networkx.github.io/
  139. Parameters
  140. ----------
  141. G : DiGraph
  142. Returns
  143. -------
  144. cycle_generator: generator
  145. A generator that produces elementary cycles of the graph.
  146. Each cycle is represented by a list of nodes along the cycle.
  147. """
  148. def _unblock(thisnode, blocked, B):
  149. stack = {thisnode}
  150. while stack:
  151. node = stack.pop()
  152. if node in blocked:
  153. blocked.remove(node)
  154. stack.update(B[node])
  155. B[node].clear()
  156. # Johnson's algorithm requires some ordering of the nodes.
  157. # We assign the arbitrary ordering given by the strongly connected comps
  158. # There is no need to track the ordering as each node removed as processed.
  159. # save the actual graph so we can mutate it here
  160. # We only take the edges because we do not want to
  161. # copy edge and node attributes here.
  162. subG = G.subgraph(G.nodes())
  163. sccs = list(strongly_connected_components(subG))
  164. while sccs:
  165. scc = sccs.pop()
  166. # order of scc determines ordering of nodes
  167. startnode = scc.pop()
  168. # Processing node runs 'circuit' routine from recursive version
  169. path = [startnode]
  170. blocked = set() # vertex: blocked from search?
  171. closed = set() # nodes involved in a cycle
  172. blocked.add(startnode)
  173. B = defaultdict(set) # graph portions that yield no elementary circuit
  174. stack = [(startnode, list(subG.neighbors(startnode)))]
  175. while stack:
  176. thisnode, nbrs = stack[-1]
  177. if nbrs:
  178. nextnode = nbrs.pop()
  179. if nextnode == startnode:
  180. yield path[:]
  181. closed.update(path)
  182. elif nextnode not in blocked:
  183. path.append(nextnode)
  184. stack.append((nextnode, list(subG.neighbors(nextnode))))
  185. closed.discard(nextnode)
  186. blocked.add(nextnode)
  187. continue
  188. # done with nextnode... look for more neighbors
  189. if not nbrs: # no more nbrs
  190. if thisnode in closed:
  191. _unblock(thisnode, blocked, B)
  192. else:
  193. for nbr in subG.neighbors(thisnode):
  194. if thisnode not in B[nbr]:
  195. B[nbr].add(thisnode)
  196. stack.pop()
  197. path.pop()
  198. # done processing this node
  199. subG.remove_node(startnode)
  200. H = subG.subgraph(scc) # make smaller to avoid work in SCC routine
  201. sccs.extend(list(strongly_connected_components(H)))
  202. def find_cycle(graph):
  203. """
  204. Looks for a cycle in the graph. If found, returns the first cycle.
  205. If nodes a1, a2, ..., an are in a cycle, then this returns:
  206. [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
  207. Otherwise returns an empty list.
  208. """
  209. cycles = list(simple_cycles(graph))
  210. if cycles:
  211. nodes = cycles[0]
  212. nodes.append(nodes[0])
  213. edges = []
  214. prev = nodes[0]
  215. for node in nodes[1:]:
  216. edges.append((prev, node))
  217. prev = node
  218. return edges
  219. else:
  220. return []
  221. def get_stacktrace(thread_id):
  222. """
  223. Returns the stack trace for the thread id as a list of strings.
  224. """
  225. gdb.execute("thread %d" % thread_id, from_tty=False, to_string=True)
  226. output = gdb.execute("bt", from_tty=False, to_string=True)
  227. stacktrace_lines = output.strip().split("\n")
  228. return stacktrace_lines
  229. def is_thread_blocked_with_frame(
  230. thread_id, top_line, expected_top_lines, expected_frame
  231. ):
  232. """
  233. Returns True if we found expected_top_line in top_line, and
  234. we found the expected_frame in the thread's stack trace.
  235. """
  236. if all(expected not in top_line for expected in expected_top_lines):
  237. return False
  238. stacktrace_lines = get_stacktrace(thread_id)
  239. return any(expected_frame in line for line in stacktrace_lines)
  240. class MutexType(Enum):
  241. """Types of mutexes that we can detect deadlocks."""
  242. PTHREAD_MUTEX_T = "pthread_mutex_t"
  243. PTHREAD_RWLOCK_T = "pthread_rwlock_t"
  244. @staticmethod
  245. def get_mutex_type(thread_id, top_line):
  246. """
  247. Returns the probable mutex type, based on the first line
  248. of the thread's stack. Returns None if not found.
  249. """
  250. WAITLIST = [
  251. "__lll_lock_wait",
  252. "futex_abstimed_wait",
  253. "futex_abstimed_wait_cancelable",
  254. "futex_reltimed_wait",
  255. "futex_reltimed_wait_cancelable",
  256. "futex_wait",
  257. "futex_wait_cancelable",
  258. ]
  259. if is_thread_blocked_with_frame(thread_id, top_line, WAITLIST, "pthread_mutex"):
  260. return MutexType.PTHREAD_MUTEX_T
  261. if is_thread_blocked_with_frame(
  262. thread_id, top_line, WAITLIST, "pthread_rwlock"
  263. ):
  264. return MutexType.PTHREAD_RWLOCK_T
  265. return None
  266. @staticmethod
  267. def get_mutex_owner_and_address_func_for_type(mutex_type):
  268. """
  269. Returns a function to resolve the mutex owner and address for
  270. the given type. The returned function f has the following
  271. signature:
  272. f: args: (map of thread lwp -> thread id), blocked thread lwp
  273. returns: (lwp of thread owning mutex, mutex address)
  274. or (None, None) if not found.
  275. Returns None if there is no function for this mutex_type.
  276. """
  277. if mutex_type == MutexType.PTHREAD_MUTEX_T:
  278. return get_pthread_mutex_t_owner_and_address
  279. if mutex_type == MutexType.PTHREAD_RWLOCK_T:
  280. return get_pthread_rwlock_t_owner_and_address
  281. return None
  282. def print_cycle(graph, lwp_to_thread_id, cycle):
  283. """Prints the threads and mutexes involved in the deadlock."""
  284. for m, n in cycle:
  285. print(
  286. "Thread %d (LWP %d) is waiting on %s (0x%016x) held by "
  287. "Thread %d (LWP %d)"
  288. % (
  289. lwp_to_thread_id[m],
  290. m,
  291. graph.attributes(m, n)["mutex_type"].value,
  292. graph.attributes(m, n)["mutex"],
  293. lwp_to_thread_id[n],
  294. n,
  295. )
  296. )
  297. def get_thread_info():
  298. """
  299. Returns a pair of:
  300. - map of LWP -> thread ID
  301. - map of blocked threads LWP -> potential mutex type
  302. """
  303. # LWP -> thread ID
  304. lwp_to_thread_id = {}
  305. # LWP -> potential mutex type it is blocked on
  306. blocked_threads = {}
  307. output = gdb.execute("info threads", from_tty=False, to_string=True)
  308. lines = output.strip().split("\n")[1:]
  309. regex = re.compile(r"[\s\*]*(\d+).*Thread.*\(LWP (\d+)\).*")
  310. for line in lines:
  311. try:
  312. thread_id = int(regex.match(line).group(1))
  313. thread_lwp = int(regex.match(line).group(2))
  314. lwp_to_thread_id[thread_lwp] = thread_id
  315. mutex_type = MutexType.get_mutex_type(thread_id, line)
  316. if mutex_type:
  317. blocked_threads[thread_lwp] = mutex_type
  318. except Exception:
  319. continue
  320. return (lwp_to_thread_id, blocked_threads)
  321. def get_pthread_mutex_t_owner_and_address(lwp_to_thread_id, thread_lwp):
  322. """
  323. Finds the thread holding the mutex that this thread is blocked on.
  324. Returns a pair of (lwp of thread owning mutex, mutex address),
  325. or (None, None) if not found.
  326. """
  327. # Go up the stack to the pthread_mutex_lock frame
  328. gdb.execute(
  329. "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
  330. )
  331. gdb.execute("frame 1", from_tty=False, to_string=True)
  332. # Get the owner of the mutex by inspecting the internal
  333. # fields of the mutex.
  334. try:
  335. mutex_info = gdb.parse_and_eval("mutex").dereference()
  336. mutex_owner_lwp = int(mutex_info["__data"]["__owner"])
  337. return (mutex_owner_lwp, int(mutex_info.address))
  338. except gdb.error:
  339. return (None, None)
  340. def get_pthread_rwlock_t_owner_and_address(lwp_to_thread_id, thread_lwp):
  341. """
  342. If the thread is waiting on a write-locked pthread_rwlock_t, this will
  343. return the pair of:
  344. (lwp of thread that is write-owning the mutex, mutex address)
  345. or (None, None) if not found, or if the mutex is read-locked.
  346. """
  347. # Go up the stack to the pthread_rwlock_{rd|wr}lock frame
  348. gdb.execute(
  349. "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
  350. )
  351. gdb.execute("frame 2", from_tty=False, to_string=True)
  352. # Get the owner of the mutex by inspecting the internal
  353. # fields of the mutex.
  354. try:
  355. rwlock_info = gdb.parse_and_eval("rwlock").dereference()
  356. rwlock_data = rwlock_info["__data"]
  357. field_names = ["__cur_writer", "__writer"]
  358. fields = rwlock_data.type.fields()
  359. field = [f for f in fields if f.name in field_names][0]
  360. rwlock_owner_lwp = int(rwlock_data[field])
  361. # We can only track the owner if it is currently write-locked.
  362. # If it is not write-locked or if it is currently read-locked,
  363. # possibly by multiple threads, we cannot find the owner.
  364. if rwlock_owner_lwp != 0:
  365. return (rwlock_owner_lwp, int(rwlock_info.address))
  366. else:
  367. return (None, None)
  368. except gdb.error:
  369. return (None, None)
  370. class Deadlock(gdb.Command):
  371. """Detects deadlocks"""
  372. def __init__(self):
  373. super(Deadlock, self).__init__("deadlock", gdb.COMMAND_NONE)
  374. def invoke(self, arg, from_tty):
  375. """Prints the threads and mutexes in a deadlock, if it exists."""
  376. lwp_to_thread_id, blocked_threads = get_thread_info()
  377. # Nodes represent threads. Edge (A,B) exists if thread A
  378. # is waiting on a mutex held by thread B.
  379. graph = DiGraph()
  380. # Go through all the blocked threads and see which threads
  381. # they are blocked on, and build the thread wait graph.
  382. for thread_lwp, mutex_type in blocked_threads.items():
  383. get_owner_and_address_func = (
  384. MutexType.get_mutex_owner_and_address_func_for_type(mutex_type)
  385. )
  386. if not get_owner_and_address_func:
  387. continue
  388. mutex_owner_lwp, mutex_address = get_owner_and_address_func(
  389. lwp_to_thread_id, thread_lwp
  390. )
  391. if mutex_owner_lwp and mutex_address:
  392. graph.add_edge(
  393. thread_lwp,
  394. mutex_owner_lwp,
  395. mutex=mutex_address,
  396. mutex_type=mutex_type,
  397. )
  398. # A deadlock exists if there is a cycle in the graph.
  399. cycle = find_cycle(graph)
  400. if cycle:
  401. print("Found deadlock!")
  402. print_cycle(graph, lwp_to_thread_id, cycle)
  403. else:
  404. print("No deadlock detected. " "Do you have debug symbols installed?")
  405. def load():
  406. # instantiate the Deadlock command
  407. Deadlock()
  408. print('Type "deadlock" to detect deadlocks.')
  409. def info():
  410. return "Detect deadlocks"
  411. if __name__ == "__main__":
  412. load()