#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script is used to analyze #include graphs.
It produces the .js file that accompanies include-analysis.html.
Usage:
$ gn gen --args="show_includes=true symbol_level=0 enable_precompiled_headers=false" out/Debug
$ autoninja -C out/Debug -v chrome | tee /tmp/build_log
$ analyze_includes.py --target=chrome --revision=$(git rev-parse --short HEAD) \
--json-out=/tmp/include-analysis.js /tmp/build_log
(If you have goma access, add use_goma=true to the gn args, but not on Windows
due to crbug.com/1223741#c9)
The script takes roughly half an hour on a fast machine for the chrome build
target, which is considered fast enough for batch job purposes for now.
If --json-out is not provided, the script exits after printing some statistics
to stdout. This is significantly faster than generating the full JSON data. For
example:
$ autoninja -C out/Debug -v chrome | analyze_includes.py - 2>/dev/null
build_size 270237664463
"""
import argparse
import json
import os
import pathlib
import re
import sys
import unittest
from collections import defaultdict
from datetime import datetime


def parse_build(build_log, root_filter=None):
"""Parse the build_log (generated as in the Usage note above) to capture the
include graph. Returns a (roots, includes) pair, where roots is a list of root
nodes in the graph (the source files) and includes is a dict from filename to
list of filenames included by that filename."""
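  # For example (illustrative), a log containing
  #   [1/3] clang -c ../../a.cc -o a.o
  #   . ../../a.h
  # yields roots == {'a.cc'} and includes == {'a.cc': {'a.h'}, 'a.h': set()};
  # see TestParseBuild below.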
build_dir = '.'
file_stack = []
includes = {}
roots = set()
# Note: A file might include different files for different compiler
# invocations depending on -D flags. For such cases, includes[file] will be
# the union of those includes.
# Normalize paths.
normalized = {}
def norm(fn):
if fn not in normalized:
x = fn.replace('\\\\', '\\')
      # Use Path.resolve() rather than os.path.realpath() to get the canonical
      # upper/lower-case version of the path on Windows.
p = pathlib.Path(os.path.join(build_dir, x)).resolve()
x = os.path.relpath(p)
x = x.replace(os.path.sep, '/')
normalized[fn] = x
return normalized[fn]
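  # norm() example (illustrative, assuming the script runs from the source root
  # and the build dir is out/foo): norm('../../a.cc') -> 'a.cc',
  # norm('gen/c.c') -> 'out/foo/gen/c.c'.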
# ninja: Entering directory `out/foo'
ENTER_DIR_RE = re.compile(r'ninja: Entering directory `(.*?)\'$')
# [M/N] clang... -c foo.cc -o foo.o ...
# [M/N] .../clang... -c foo.cc -o foo.o ...
# [M/N] clang-cl.exe /c foo.cc /Fofoo.o ...
# [M/N] ...\clang-cl.exe /c foo.cc /Fofoo.o ...
COMPILE_RE = re.compile(r'\[\d+/\d+\] (.*[/\\])?clang.* [/-]c (\S*)')
# . a.h
# .. b.h
# . c.h
INCLUDE_RE = re.compile(r'(\.+) (.*)$')
skipping_root = False
for line in build_log:
m = INCLUDE_RE.match(line)
if m:
if skipping_root:
continue
prev_depth = len(file_stack) - 1
depth = len(m.group(1))
filename = norm(m.group(2))
includes.setdefault(filename, set())
if depth > prev_depth:
if sys.platform != 'win32':
# TODO(crbug.com/1223741): Always assert.
assert depth == prev_depth + 1
elif depth > prev_depth + 1:
# Until the bug is fixed, skip these includes.
print('missing include under', file_stack[0])
continue
else:
for _ in range(prev_depth - depth + 1):
file_stack.pop()
includes[file_stack[-1]].add(filename)
file_stack.append(filename)
continue
m = COMPILE_RE.match(line)
if m:
skipping_root = False
filename = norm(m.group(2))
if root_filter and not root_filter.match(filename):
skipping_root = True
continue
roots.add(filename)
file_stack = [filename]
includes.setdefault(filename, set())
continue
m = ENTER_DIR_RE.match(line)
if m:
build_dir = m.group(1)
continue
  return roots, includes


class TestParseBuild(unittest.TestCase):
def test_basic(self):
x = [
'ninja: Entering directory `out/foo\'',
'[1/3] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[2/3] clang -c gen/c.c -o a.o',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
self.assertEqual(set(includes.keys()),
set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['a.h'], set())
self.assertEqual(includes['out/foo/gen/c.c'], set())
def test_more(self):
x = [
'ninja: Entering directory `out/foo\'',
'[20/99] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'. ../../b.h',
'.. ../../c.h',
'... ../../d.h',
'. ../../e.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc']))
self.assertEqual(includes['a.cc'], set(['a.h', 'b.h', 'e.h']))
self.assertEqual(includes['b.h'], set(['c.h']))
self.assertEqual(includes['c.h'], set(['d.h']))
self.assertEqual(includes['d.h'], set())
self.assertEqual(includes['e.h'], set())
def test_multiple(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[124/234] clang -c ../../b.cc -o b.o',
'. ../../b.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'b.cc']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['b.cc'], set(['b.h']))
def test_root_filter(self):
x = [
'ninja: Entering directory `out/foo\'',
'[9/100] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[10/100] clang -c ../../b.cc -o b.o',
'. ../../b.h',
]
(roots, includes) = parse_build(x, re.compile(r'^a.cc$'))
self.assertEqual(roots, set(['a.cc']))
self.assertEqual(set(includes.keys()), set(['a.cc', 'a.h']))
self.assertEqual(includes['a.cc'], set(['a.h']))
def test_windows(self):
x = [
'ninja: Entering directory `out/foo\'',
'[1/3] path\\clang-cl.exe /c ../../a.cc /Foa.o',
'. ../../a.h',
'[2/3] clang-cl.exe /c gen/c.c /Foa.o',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
self.assertEqual(set(includes.keys()),
set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['a.h'], set())
    self.assertEqual(includes['out/foo/gen/c.c'], set())


def post_order_nodes(root, child_nodes):
"""Generate the nodes reachable from root (including root itself) in
post-order traversal order. child_nodes maps each node to its children."""
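  # For example (illustrative):
  #   list(post_order_nodes('a', {'a': ['b'], 'b': ['c'], 'c': []}))
  #   returns ['c', 'b', 'a'].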
visited = set()
def walk(n):
if n in visited:
return
visited.add(n)
for c in child_nodes[n]:
for x in walk(c):
yield x
yield n
  return walk(root)


def compute_doms(root, includes):
"""Compute the dominators for all nodes reachable from root. Node A dominates
node B if all paths from the root to B go through A. Returns a dict from
filename to the set of dominators of that filename (including itself).
The implementation follows the "simple" version of Lengauer & Tarjan "A Fast
Algorithm for Finding Dominators in a Flowgraph" (TOPLAS 1979).
"""
parent = {}
ancestor = {}
vertex = []
label = {}
semi = {}
pred = defaultdict(list)
bucket = defaultdict(list)
dom = {}
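  # Data structures from the paper: vertex[i] is the i-th node in DFS preorder;
  # semi[v] starts as v's DFS number and later holds the number of v's
  # semidominator; parent[v] is v's DFS tree parent; pred[v] lists v's
  # predecessors in the graph; ancestor/label implement the path-compressed
  # forest used by evaluate(); bucket[w] collects the nodes whose semidominator
  # is w; dom[v] ends up as v's immediate dominator.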
def dfs(v):
semi[v] = len(vertex)
vertex.append(v)
label[v] = v
for w in includes[v]:
if w not in semi:
parent[w] = v
dfs(w)
pred[w].append(v)
def compress(v):
if ancestor[v] in ancestor:
compress(ancestor[v])
if semi[label[ancestor[v]]] < semi[label[v]]:
label[v] = label[ancestor[v]]
ancestor[v] = ancestor[ancestor[v]]
def evaluate(v):
if v not in ancestor:
return v
compress(v)
return label[v]
def link(v, w):
ancestor[w] = v
# Step 1: Initialization.
dfs(root)
for w in reversed(vertex[1:]):
# Step 2: Compute semidominators.
for v in pred[w]:
u = evaluate(v)
if semi[u] < semi[w]:
semi[w] = semi[u]
bucket[vertex[semi[w]]].append(w)
link(parent[w], w)
# Step 3: Implicitly define the immediate dominator for each node.
for v in bucket[parent[w]]:
u = evaluate(v)
dom[v] = u if semi[u] < semi[v] else parent[w]
bucket[parent[w]] = []
# Step 4: Explicitly define the immediate dominator for each node.
for w in vertex[1:]:
if dom[w] != vertex[semi[w]]:
dom[w] = dom[dom[w]]
# Get the full dominator set for each node.
all_doms = {}
all_doms[root] = {root}
def dom_set(node):
if node not in all_doms:
      # A node's dominators are the node itself plus the dominators of its
      # immediate dominator.
all_doms[node] = {node}
all_doms[node].update(dom_set(dom[node]))
return all_doms[node]
  return {n: dom_set(n) for n in vertex}


class TestComputeDoms(unittest.TestCase):
def test_basic(self):
includes = {}
includes[1] = [2]
includes[2] = [1]
includes[3] = [2]
includes[4] = [1]
includes[5] = [4, 3]
root = 5
doms = compute_doms(root, includes)
self.assertEqual(doms[1], set([5, 1]))
self.assertEqual(doms[2], set([5, 2]))
self.assertEqual(doms[3], set([5, 3]))
self.assertEqual(doms[4], set([5, 4]))
self.assertEqual(doms[5], set([5]))
def test_larger(self):
# Fig. 1 in the Lengauer-Tarjan paper.
includes = {}
includes['a'] = ['d']
includes['b'] = ['a', 'd', 'e']
includes['c'] = ['f', 'g']
includes['d'] = ['l']
includes['e'] = ['h']
includes['f'] = ['i']
includes['g'] = ['i', 'j']
includes['h'] = ['k', 'e']
includes['i'] = ['k']
includes['j'] = ['i']
includes['k'] = ['i', 'r']
includes['l'] = ['h']
includes['r'] = ['a', 'b', 'c']
root = 'r'
doms = compute_doms(root, includes)
# Fig. 2 in the Lengauer-Tarjan paper.
self.assertEqual(doms['a'], set(['a', 'r']))
self.assertEqual(doms['b'], set(['b', 'r']))
self.assertEqual(doms['c'], set(['c', 'r']))
self.assertEqual(doms['d'], set(['d', 'r']))
self.assertEqual(doms['e'], set(['e', 'r']))
self.assertEqual(doms['f'], set(['f', 'c', 'r']))
self.assertEqual(doms['g'], set(['g', 'c', 'r']))
self.assertEqual(doms['h'], set(['h', 'r']))
self.assertEqual(doms['i'], set(['i', 'r']))
self.assertEqual(doms['j'], set(['j', 'g', 'c', 'r']))
self.assertEqual(doms['k'], set(['k', 'r']))
self.assertEqual(doms['l'], set(['l', 'd', 'r']))
    self.assertEqual(doms['r'], set(['r']))


def trans_size(root, includes, sizes):
"""Compute the transitive size of a file, i.e. the size of the file itself and
all its transitive includes."""
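  # For example (illustrative): if a.cc includes a.h and b.h, and a.h also
  # includes b.h, the transitive size of a.cc counts a.cc, a.h and b.h once
  # each.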
  return sum([sizes[n] for n in post_order_nodes(root, includes)])


def log(*args, **kwargs):
"""Log output to stderr."""
  print(*args, file=sys.stderr, **kwargs)


def analyze(target, revision, build_log_file, json_file, root_filter):
log('Parsing build log...')
(roots, includes) = parse_build(build_log_file, root_filter)
log('Getting file sizes...')
sizes = {name: os.path.getsize(name) for name in includes}
log('Computing transitive sizes...')
trans_sizes = {n: trans_size(n, includes, sizes) for n in includes}
build_size = sum([trans_sizes[n] for n in roots])
print('build_size', build_size)
if json_file is None:
log('--json-out not set; exiting.')
return 0
log('Counting prevalence...')
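  # prevalence[n] is the number of root files (i.e. compiled source files)
  # whose include graph reaches n.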
prevalence = {name: 0 for name in includes}
for r in roots:
for n in post_order_nodes(r, includes):
prevalence[n] += 1
# Map from file to files that include it.
log('Building reverse include map...')
included_by = {k: set() for k in includes}
for k in includes:
for i in includes[k]:
included_by[i].add(k)
log('Computing added sizes...')
# Split each src -> dst edge in includes into src -> (src,dst) -> dst, so that
# we can compute how much each include graph edge adds to the size by doing
# dominance analysis on the (src,dst) nodes.
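  # For example, if a.cc includes a.h, the augmented graph contains
  # a.cc -> ('a.cc', 'a.h') -> a.h.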
augmented_includes = {}
for src in includes:
augmented_includes[src] = set()
for dst in includes[src]:
augmented_includes[src].add((src, dst))
augmented_includes[(src, dst)] = {dst}
added_sizes = {node: 0 for node in augmented_includes}
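  # added_sizes[n] accumulates, over all roots, the sizes of the files that n
  # dominates: an estimate of how many bytes n (or, for a (src, dst) node, the
  # src -> dst include edge) adds to the build.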
for r in roots:
doms = compute_doms(r, augmented_includes)
for node in doms:
if node not in sizes:
# Skip the (src,dst) pseudo nodes.
continue
for dom in doms[node]:
added_sizes[dom] += sizes[node]
# Assign a number to each filename for tighter JSON representation.
names = []
name2nr = {}
for n in sorted(includes.keys()):
name2nr[n] = len(names)
names.append(n)
def nr(name):
return name2nr[name]
log('Writing output...')
# Provide a JS object for convenient inclusion in the HTML file.
# If someone really wants a proper JSON file, maybe we can reconsider this.
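  # Output fields (file references are indices into 'files'):
  #   files: sorted list of all file names
  #   roots: the compiled source files
  #   includes / included_by: forward and reverse include edges per file
  #   sizes: file size in bytes
  #   tsizes: transitive size (the file plus everything it pulls in)
  #   asizes: added size, computed from the dominance analysis above
  #   esizes: added size per include edge, aligned with 'includes'
  #   prevalence: number of roots whose include graph reaches the file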
json_file.write('data = ')
json.dump(
{
'target': target,
'revision': revision,
'date': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
'files': names,
'roots': [nr(x) for x in sorted(roots)],
'includes': [[nr(x) for x in sorted(includes[n])] for n in names],
'included_by': [[nr(x) for x in included_by[n]] for n in names],
'sizes': [sizes[n] for n in names],
'tsizes': [trans_sizes[n] for n in names],
'asizes': [added_sizes[n] for n in names],
'esizes': [[added_sizes[(s, d)] for d in sorted(includes[s])]
for s in names],
'prevalence': [prevalence[n] for n in names],
}, json_file)
  log('All done!')


def main():
result = unittest.main(argv=sys.argv[:1], exit=False, verbosity=2).result
if len(result.failures) > 0 or len(result.errors) > 0:
return 1
parser = argparse.ArgumentParser(description='Analyze an #include graph.')
parser.add_argument('build_log',
type=argparse.FileType('r'),
help='The build log to analyze (- for stdin).')
parser.add_argument('--target',
help='The target that was built (e.g. chrome).')
parser.add_argument('--revision',
help='The revision that was built (e.g. 016588d4ee20).')
parser.add_argument(
'--json-out',
type=argparse.FileType('w'),
help='Write full analysis data to a JSON file (- for stdout).')
parser.add_argument('--root-filter',
help='Regex to filter which root files are analyzed.')
args = parser.parse_args()
if args.json_out and not (args.target and args.revision):
print('error: --json-out requires both --target and --revision to be set')
return 1
try:
root_filter = re.compile(args.root_filter) if args.root_filter else None
except Exception:
print('error: --root-filter is not a valid regex')
return 1
analyze(args.target, args.revision, args.build_log, args.json_out,
          root_filter)


if __name__ == '__main__':
sys.exit(main())