#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script is used to analyze #include graphs.
It produces the .js file that accompanies include-analysis.html.
Usage:
$ gn gen --args="show_includes=true symbol_level=0 enable_precompiled_headers=false" out/Debug
$ autoninja -C out/Debug -v chrome | tee /tmp/build_log
$ analyze_includes.py --target=chrome --revision=$(git rev-parse --short HEAD) \
--json-out=/tmp/include-analysis.js /tmp/build_log
(If you have reclient access, add use_reclient=true to the gn args, but not on
Windows due to crbug.com/1223741#c9)
The script takes roughly half an hour on a fast machine for the chrome build
target, which is considered fast enough for batch job purposes for now. It can
be sped up significantly by using multiple processes with the --processes
option, but it will also use significantly more memory as a result (running
out of memory is a risk).
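To run the analysis with e.g. four worker processes:
$ analyze_includes.py --target=chrome --revision=$(git rev-parse --short HEAD) \
--processes=4 --json-out=/tmp/include-analysis.js /tmp/build_log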
If --json-out is not provided, the script exits after printing some statistics
to stdout. This is significantly faster than generating the full JSON data. For
example:
$ autoninja -C out/Debug -v chrome | analyze_includes.py - 2>/dev/null
build_size 270237664463
"""
import argparse
import concurrent.futures
import functools
import json
import math
import os
import pathlib
import re
import sys
import unittest
from collections import defaultdict
from datetime import datetime, timezone
from itertools import islice
def parse_build(build_log, root_filter=None):
"""Parse the build_log (generated as in the Usage note above) to capture the
include graph. Returns a (roots, includes) pair, where roots is a list of root
nodes in the graph (the source files) and includes is a dict from filename to
list of filenames included by that filename."""
build_dir = '.'
file_stack = []
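# file_stack[i] is the most recently seen file at include depth i;
# file_stack[0] is the root source file currently being compiled.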
includes = {}
roots = set()
# Note: A file might include different files for different compiler
# invocations depending on -D flags. For such cases, includes[file] will be
# the union of those includes.
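# Normalize a path from the build log: resolve it against the build
# directory, make it relative to the current working directory (assumed to
# be the source root), and use forward slashes.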
@functools.cache
def norm(fn):
x = fn.replace('\\\\', '\\')
# Use Path.resolve() rather than path.realpath() to get the canonical
# upper/lower-case version of the path on Windows.
p = pathlib.Path(os.path.join(build_dir, x)).resolve()
x = os.path.relpath(p)
return x.replace(os.path.sep, '/')
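# Parse an include-trace line of the form '. path/to/foo.h' (one dot per
# level of include depth) into a (normalized filename, depth) pair; returns
# None for lines that are not include lines.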
@functools.cache
def parse_include_line(line):
m = INCLUDE_RE.match(line)
if m:
depth = len(m.group(1))
filename = norm(m.group(2))
return filename, depth
# ninja: Entering directory `out/foo'
ENTER_DIR_RE = re.compile(r'ninja: Entering directory `(.*?)\'$')
# [M/N] clang... -c foo.cc -o foo.o ...
# [M/N] .../clang... -c foo.cc -o foo.o ...
# [M/N] clang-cl.exe /c foo.cc /Fofoo.o ...
# [M/N] ...\clang-cl.exe /c foo.cc /Fofoo.o ...
COMPILE_RE = re.compile(r'\[\d+/\d+\] (.*[/\\])?clang.* [/-]c (\S*)')
# . a.h
# .. b.h
# . c.h
INCLUDE_RE = re.compile(r'(\.+) (.*)$')
skipping_root = False
for line in build_log:
# TODO(https://crbug.com/435303792): Ignore precompiled modules (.pcm) until
# an appropriate way to calculate their size for include analysis is found.
if (parsed := parse_include_line(line)) and not parsed[0].endswith('.pcm'):
if skipping_root:
continue
prev_depth = len(file_stack) - 1
filename, depth = parsed
if filename not in includes:
includes[filename] = set()
if depth > prev_depth:
if sys.platform != 'win32':
# TODO(crbug.com/40187759): Always assert.
assert depth == prev_depth + 1
elif depth > prev_depth + 1:
# Until the bug is fixed, skip these includes.
print('missing include under', file_stack[0])
continue
else:
del file_stack[-(prev_depth - depth + 1):]
includes[file_stack[-1]].add(filename)
file_stack.append(filename)
continue
# Clang module compiles take .modulemap files as their input, so skip those
# from the include analysis.
if (m := COMPILE_RE.match(line)) and not m.group(2).endswith('.modulemap'):
skipping_root = False
filename = norm(m.group(2))
if root_filter and not root_filter.match(filename):
skipping_root = True
continue
roots.add(filename)
file_stack = [filename]
includes.setdefault(filename, set())
continue
if (m := ENTER_DIR_RE.match(line)):
build_dir = m.group(1)
continue
if line.startswith('['):
# Some tool other than clang is running. Ignore its output.
skipping_root = True
continue
return roots, includes
class TestParseBuild(unittest.TestCase):
def test_basic(self):
x = [
'ninja: Entering directory `out/foo\'',
'[1/3] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[2/3] clang -c gen/c.c -o a.o',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
self.assertEqual(set(includes.keys()),
set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['a.h'], set())
self.assertEqual(includes['out/foo/gen/c.c'], set())
def test_more(self):
x = [
'ninja: Entering directory `out/foo\'',
'[20/99] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'. ../../b.h',
'.. ../../c.h',
'... ../../d.h',
'. ../../e.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc']))
self.assertEqual(includes['a.cc'], set(['a.h', 'b.h', 'e.h']))
self.assertEqual(includes['b.h'], set(['c.h']))
self.assertEqual(includes['c.h'], set(['d.h']))
self.assertEqual(includes['d.h'], set())
self.assertEqual(includes['e.h'], set())
def test_multiple(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[124/234] clang -c ../../b.cc -o b.o',
'. ../../b.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'b.cc']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['b.cc'], set(['b.h']))
def test_root_filter(self):
x = [
'ninja: Entering directory `out/foo\'',
'[9/100] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[10/100] clang -c ../../b.cc -o b.o',
'. ../../b.h',
]
(roots, includes) = parse_build(x, re.compile(r'^a.cc$'))
self.assertEqual(roots, set(['a.cc']))
self.assertEqual(set(includes.keys()), set(['a.cc', 'a.h']))
self.assertEqual(includes['a.cc'], set(['a.h']))
def test_windows(self):
x = [
'ninja: Entering directory `out/foo\'',
'[1/3] path\\clang-cl.exe /c ../../a.cc /Foa.o',
'. ../../a.h',
'[2/3] clang-cl.exe /c gen/c.c /Foa.o',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
self.assertEqual(set(includes.keys()),
set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['a.h'], set())
self.assertEqual(includes['out/foo/gen/c.c'], set())
def test_bindgen(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[124/234] bindgen -c ../../b.cc -o b.o',
'. ../../b.h',
'[125/234] clang -c ../../c.cc -o c.o',
'. ../../c.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'c.cc']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['c.cc'], set(['c.h']))
def test_modules(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -x c++ -Xclang -emit-module -c ../../a.modulemap -o a.pcm',
'[124/234] clang -fmodule-file=a.pcm -c ../../a.cc -o a.o',
'. a.pcm',
'. ../../a.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, {'a.cc'})
self.assertEqual(includes, {'a.cc': {'a.h'}, 'a.h': set()})
def post_order_nodes(root, child_nodes):
"""Generate the nodes reachable from root (including root itself) in
post-order traversal order. child_nodes maps each node to its children."""
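# For example, with child_nodes = {'a': ['b', 'c'], 'b': ['c'], 'c': []},
# post_order_nodes('a', child_nodes) yields 'c', 'b', 'a': each node is
# yielded after its children and no node is yielded twice.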
visited = set()
def walk(n):
if n in visited:
return
visited.add(n)
for c in child_nodes[n]:
for x in walk(c):
yield x
yield n
return walk(root)
def compute_doms(root, includes):
"""Compute the dominators for all nodes reachable from root. Node A dominates
node B if all paths from the root to B go through A. Returns a dict from
filename to the set of dominators of that filename (including itself).
The implementation follows the "simple" version of Lengauer & Tarjan "A Fast
Algorithm for Finding Dominators in a Flowgraph" (TOPLAS 1979).
"""
parent = {}
ancestor = {}
vertex = []
label = {}
semi = {}
pred = defaultdict(list)
bucket = defaultdict(list)
dom = {}
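# semi[v] starts out as v's DFS number and is replaced in step 2 by the DFS
# number of v's semidominator; ancestor/label implement the forest used by
# evaluate() with path compression.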
def dfs(v):
semi[v] = len(vertex)
vertex.append(v)
label[v] = v
for w in includes[v]:
if w not in semi:
parent[w] = v
dfs(w)
pred[w].append(v)
def compress(v):
if ancestor[v] in ancestor:
compress(ancestor[v])
if semi[label[ancestor[v]]] < semi[label[v]]:
label[v] = label[ancestor[v]]
ancestor[v] = ancestor[ancestor[v]]
def evaluate(v):
if v not in ancestor:
return v
compress(v)
return label[v]
def link(v, w):
ancestor[w] = v
# Step 1: Initialization.
dfs(root)
for w in reversed(vertex[1:]):
# Step 2: Compute semidominators.
for v in pred[w]:
u = evaluate(v)
if semi[u] < semi[w]:
semi[w] = semi[u]
bucket[vertex[semi[w]]].append(w)
link(parent[w], w)
# Step 3: Implicitly define the immediate dominator for each node.
for v in bucket[parent[w]]:
u = evaluate(v)
dom[v] = u if semi[u] < semi[v] else parent[w]
bucket[parent[w]] = []
# Step 4: Explicitly define the immediate dominator for each node.
for w in vertex[1:]:
if dom[w] != vertex[semi[w]]:
dom[w] = dom[dom[w]]
# Get the full dominator set for each node.
all_doms = {}
all_doms[root] = {root}
def dom_set(node):
if node not in all_doms:
# A node's dominators are the node itself plus the dominators of its
# immediate dominator.
all_doms[node] = {node}
all_doms[node].update(dom_set(dom[node]))
return all_doms[node]
return {n: dom_set(n) for n in vertex}
def compute_added_sizes(args):
"""Helper to compute added sizes from the given root."""
roots, includes, sizes = args
added_sizes = {node: 0 for node in includes}
for root in roots:
doms = compute_doms(root, includes)
for node in doms:
if node not in sizes:
# Skip the (src,dst) pseudo nodes.
continue
for dom in doms[node]:
added_sizes[dom] += sizes[node]
return added_sizes
class TestComputeDoms(unittest.TestCase):
def test_basic(self):
includes = {}
includes[1] = [2]
includes[2] = [1]
includes[3] = [2]
includes[4] = [1]
includes[5] = [4, 3]
root = 5
doms = compute_doms(root, includes)
self.assertEqual(doms[1], set([5, 1]))
self.assertEqual(doms[2], set([5, 2]))
self.assertEqual(doms[3], set([5, 3]))
self.assertEqual(doms[4], set([5, 4]))
self.assertEqual(doms[5], set([5]))
def test_larger(self):
# Fig. 1 in the Lengauer-Tarjan paper.
includes = {}
includes['a'] = ['d']
includes['b'] = ['a', 'd', 'e']
includes['c'] = ['f', 'g']
includes['d'] = ['l']
includes['e'] = ['h']
includes['f'] = ['i']
includes['g'] = ['i', 'j']
includes['h'] = ['k', 'e']
includes['i'] = ['k']
includes['j'] = ['i']
includes['k'] = ['i', 'r']
includes['l'] = ['h']
includes['r'] = ['a', 'b', 'c']
root = 'r'
doms = compute_doms(root, includes)
# Fig. 2 in the Lengauer-Tarjan paper.
self.assertEqual(doms['a'], set(['a', 'r']))
self.assertEqual(doms['b'], set(['b', 'r']))
self.assertEqual(doms['c'], set(['c', 'r']))
self.assertEqual(doms['d'], set(['d', 'r']))
self.assertEqual(doms['e'], set(['e', 'r']))
self.assertEqual(doms['f'], set(['f', 'c', 'r']))
self.assertEqual(doms['g'], set(['g', 'c', 'r']))
self.assertEqual(doms['h'], set(['h', 'r']))
self.assertEqual(doms['i'], set(['i', 'r']))
self.assertEqual(doms['j'], set(['j', 'g', 'c', 'r']))
self.assertEqual(doms['k'], set(['k', 'r']))
self.assertEqual(doms['l'], set(['l', 'd', 'r']))
self.assertEqual(doms['r'], set(['r']))
def log(*args, **kwargs):
"""Log output to stderr."""
print(*args, file=sys.stderr, **kwargs)
# TODO: Use itertools.batched after updating the Python version on bots to 3.12.
def batched(iterable, n):
# batched('ABCDEFG', 2) → AB CD EF G
if n < 1:
raise ValueError('n must be at least one')
iterator = iter(iterable)
while batch := tuple(islice(iterator, n)):
yield batch
def analyze(target, revision, build_log_file, json_file, root_filter, processes=1):
log('Parsing build log...')
(roots, includes) = parse_build(build_log_file, root_filter)
log('Getting file sizes...')
sizes = {name: os.path.getsize(name) for name in includes}
log('Computing transitive sizes and prevalence...')
build_size = 0
trans_sizes = {name: 0 for name in includes}
prevalence = {name: 0 for name in includes}
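# prevalence[f] will count how many root files reach f in the include graph
# (a root reaches itself).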
for n in includes:
for node in post_order_nodes(n, includes):
# Compute the transitive size of a file, i.e. the size of the
# file itself and all its transitive includes.
trans_sizes[n] += sizes[node]
if n in roots:
prevalence[node] += 1
# Total build size is the sum of the transitive size of all roots.
if n in roots:
build_size += trans_sizes[n]
print('build_size', build_size)
if json_file is None:
log('--json-out not set; exiting.')
return 0
# Map from file to files that include it.
log('Building reverse include map...')
included_by = {k: set() for k in includes}
for k in includes:
for i in includes[k]:
included_by[i].add(k)
log('Computing added sizes...')
# Split each src -> dst edge in includes into src -> (src,dst) -> dst, so that
# we can compute how much each include graph edge adds to the size by doing
# dominance analysis on the (src,dst) nodes.
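# For example, {'a.cc': {'a.h'}, 'a.h': set()} becomes
# {'a.cc': {('a.cc', 'a.h')}, ('a.cc', 'a.h'): {'a.h'}, 'a.h': set()}.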
augmented_includes = {}
for src in includes:
augmented_includes[src] = set()
for dst in includes[src]:
augmented_includes[src].add((src, dst))
augmented_includes[(src, dst)] = {dst}
if processes > 1:
added_sizes = {node: 0 for node in augmented_includes}
# Break the roots into one chunk per process and pass a whole chunk to each
# worker. Giving each worker a complete chunk of roots to work on, rather
# than having workers pull roots as they go, minimizes contention and gives
# better parallelization.
chunk_size = math.ceil(float(len(roots)) / processes)
chunked = list(batched(roots, chunk_size))
with concurrent.futures.ProcessPoolExecutor(max_workers=processes) as pool:
for computed_added_sizes in pool.map(
compute_added_sizes,
((chunk, augmented_includes, sizes) for chunk in chunked),
):
for dom, size in computed_added_sizes.items():
added_sizes[dom] += size
else:
added_sizes = compute_added_sizes((roots, augmented_includes, sizes))
# Assign a number to each filename for tighter JSON representation.
names = []
name2nr = {}
for n in sorted(includes.keys()):
name2nr[n] = len(names)
names.append(n)
def nr(name):
return name2nr[name]
log('Writing output...')
# Provide a JS object for convenient inclusion in the HTML file.
# If someone really wants a proper JSON file, maybe we can reconsider this.
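# 'sizes' are the files' own sizes, 'tsizes' their transitive sizes, 'asizes'
# their added sizes, and 'esizes' the added size of each include edge (in the
# same order as 'includes').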
json_file.write('data = ')
json.dump(
{
'target': target,
'revision': revision,
'date': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
'files': names,
'roots': [nr(x) for x in sorted(roots)],
'includes': [[nr(x) for x in sorted(includes[n])] for n in names],
'included_by': [[nr(x) for x in sorted(included_by[n])] for n in names],
'sizes': [sizes[n] for n in names],
'tsizes': [trans_sizes[n] for n in names],
'asizes': [added_sizes[n] for n in names],
'esizes': [[added_sizes[(s, d)] for d in sorted(includes[s])]
for s in names],
'prevalence': [prevalence[n] for n in names],
}, json_file)
log('All done!')
def main():
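# Run this file's unit tests before the analysis and bail out if any fail.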
result = unittest.main(argv=sys.argv[:1], exit=False, verbosity=2).result
if len(result.failures) > 0 or len(result.errors) > 0:
return 1
parser = argparse.ArgumentParser(description='Analyze an #include graph.')
parser.add_argument('build_log',
type=argparse.FileType('r', errors='replace'),
help='The build log to analyze (- for stdin).')
parser.add_argument('--target',
help='The target that was built (e.g. chrome).')
parser.add_argument('--revision',
help='The revision that was built (e.g. 016588d4ee20).')
parser.add_argument(
'--json-out',
type=argparse.FileType('w'),
help='Write full analysis data to a JSON file (- for stdout).')
parser.add_argument('--root-filter',
help='Regex to filter which root files are analyzed.')
parser.add_argument('--processes',
action="store",
type=int,
default=1,
help="Use multiple processes to speed up the analysis - "
"note that this scales memory usage significantly")
args = parser.parse_args()
if args.json_out and not (args.target and args.revision):
print('error: --json-out requires both --target and --revision to be set')
return 1
try:
root_filter = re.compile(args.root_filter) if args.root_filter else None
except Exception:
print('error: --root-filter is not a valid regex')
return 1
analyze(args.target, args.revision, args.build_log, args.json_out,
root_filter, processes=args.processes)
if __name__ == '__main__':
sys.exit(main())