Merge pull request #2 from jfbastien/namespaces

Add namespaces, some unmangling, symbol colors, dominant symbols.
diff --git a/bloat.py b/bloat.py
index d7fc126..c5cbb31 100755
--- a/bloat.py
+++ b/bloat.py
@@ -1,10 +1,12 @@
 #!/usr/bin/python
 
 import fileinput
+import operator
 import optparse
 import os
 import pprint
 import re
+import subprocess
 import sys
 import json
 
@@ -77,66 +79,130 @@
         print >>sys.stderr, 'unparsed:', repr(line)
 
 
-def filter_syms(types, symbols):
-    for sym, type, size, path in symbols:
-        if type in types:
-            yield sym, type, size, path
+def parse_cpp_name(name):
+    # Turn prefixes into suffixes so namespacing works.
+    prefixes = [
+        ['bool ',                         ''],
+        ['construction vtable for ',      ' [construction vtable]'],
+        ['global constructors keyed to ', ' [global constructors]'],
+        ['int ',                          ''],
+        ['non-virtual thunk to ',         ' [non-virtual thunk]'],
+        ['typeinfo for ',                 ' [typeinfo]'],
+        ['typeinfo name for ',            ' [typeinfo name]'],
+        ['virtual thunk to ',             ' [virtual thunk]'],
+        ['void ',                         ''],
+        ['vtable for ',                   ' [vtable]'],
+    ]
+    for prefix, replacement in prefixes:
+        if name.startswith(prefix):
+            name = name[len(prefix):] + replacement
+    # Simplify parenthesis parsing.
+    replacements = [
+        ['(anonymous namespace)', '[anonymous namespace]'],
+    ]
+    for value, replacement in replacements:
+        name = name.replace(value, replacement)
+
+    def parse_one(val):
+        """Returns (leftmost-part, remaining); remaining is '' when done."""
+        if (val.startswith('operator') and len(val) > 8 and
+            not (val[8].isalnum() or val[8] == '_')):
+            # Operator overload function (not e.g. `operators_x`), terminate.
+            return (val, '')
+        co = val.find('::')
+        lt = val.find('<')
+        pa = val.find('(')
+        co = len(val) if co == -1 else co
+        lt = len(val) if lt == -1 else lt
+        pa = len(val) if pa == -1 else pa
+        if co < lt and co < pa:
+            # Namespace or type name.
+            return (val[:co], val[co+2:])
+        if lt < pa:
+            # Template. Track nesting depth; stop at the end if unbalanced.
+            open_tmpl = 1
+            gt = lt
+            while open_tmpl != 0 and gt + 1 < len(val):
+                gt = gt + 1
+                if val[gt] == '<':
+                    open_tmpl = open_tmpl + 1
+                if val[gt] == '>':
+                    open_tmpl = open_tmpl - 1
+            ret = val[gt+1:]
+            if ret.startswith('::'):
+                ret = ret[2:]
+            if ret.startswith('('):
+                # Template function, terminate.
+                return (val, '')
+            return (val[:gt+1], ret)
+        # Terminate with any function name, identifier, or unmangled name.
+        return (val, '')
+
+    parts = []
+    while len(name) > 0:
+        (part, name) = parse_one(name)
+        assert len(part) > 0
+        parts.append(part)
+    return parts
 
 
-def treeify_syms(symbols, strip_prefix=None):
+def treeify_syms(symbols, strip_prefix=None, cppfilt=None):
     dirs = {}
     for sym, type, size, path in symbols:
         if path:
             path = os.path.normpath(path)
             if strip_prefix and path.startswith(strip_prefix):
                 path = path[len(strip_prefix):]
-            elif path.startswith('/usr/include'):
-                path = path.replace('/usr/include', 'usrinclude')
             elif path.startswith('/'):
                 path = path[1:]
+            path = ['[path]'] + [p for p in path.split('/') if p]
 
-        parts = None
-        # TODO: make segmenting by namespace work.
-        if False and '::' in sym:
-            if sym.startswith('vtable for '):
-                sym = sym[len('vtable for '):]
-                parts = sym.split('::')
-                parts.append('[vtable]')
-            else:
-                parts = sym.split('::')
-            parts[0] = '::' + parts[0]
-        elif path and '/' in path:
-            parts = path.split('/')
+        parts = parse_cpp_name(sym)
+        if len(parts) == 1:
+          if path:
+            # No namespaces, group with path.
+            parts = path + parts
+          else:
+            new_prefix = ['[ungrouped]']
+            regroups = [
+                ['.L.str',                 '[str]'],
+                ['.L__PRETTY_FUNCTION__.', '[__PRETTY_FUNCTION__]'],
+                ['.L__func__.',            '[__func__]'],
+                ['.Lswitch.table',         '[switch table]'],
+            ]
+            for prefix, group in regroups:
+                if parts[0].startswith(prefix):
+                    parts[0] = parts[0][len(prefix):]
+                    if cppfilt and parts[0].startswith('_Z'):
+                        # Demangle names when possible.
+                        # Mangled names all start with _Z.
+                        parts[0] = subprocess.check_output(
+                            [cppfilt, parts[0]]).strip()
+                    new_prefix += [group]
+                    break
+            parts = new_prefix + parts
 
-        if parts:
-            key = parts.pop()
-            tree = dirs
-            try:
-                for part in parts:
-                    assert part != '', path
-                    if part not in tree:
-                        tree[part] = {}
-                    tree = tree[part]
-                tree[key] = tree.get(key, 0) + size
-            except:
-                print >>sys.stderr, sym, parts, key
-                raise
-        else:
-            key = 'symbols without paths'
-            if key not in dirs:
-                dirs[key] = {}
-            tree = dirs[key]
-            subkey = 'misc'
-            if (sym.endswith('::__FUNCTION__') or
-                sym.endswith('::__PRETTY_FUNCTION__')):
-                subkey = '__FUNCTION__'
-            elif sym.startswith('CSWTCH.'):
-                subkey = 'CSWTCH'
-            elif '::' in sym:
-                subkey = sym[0:sym.find('::') + 2]
-            else:
-                print >>sys.stderr, 'unbucketed (no path?):', sym, type, size, path
-            tree[subkey] = tree.get(subkey, 0) + size
+        key = parts.pop()
+        tree = dirs
+        try:
+            depth = 0
+            for part in parts:
+                depth = depth + 1
+                assert part != '', path
+                if part not in tree:
+                    tree[part] = {'$bloat_symbols':{}}
+                if type not in tree[part]['$bloat_symbols']:
+                    tree[part]['$bloat_symbols'][type] = 0
+                tree[part]['$bloat_symbols'][type] += 1
+                tree = tree[part]
+            old_size, old_symbols = tree.get(key, (0, {}))
+            if type not in old_symbols:
+                old_symbols[type] = 0
+            old_symbols[type] += 1
+            tree[key] = (old_size + size, old_symbols)
+        except:
+            print >>sys.stderr, 'sym `%s`\tparts `%s`\tkey `%s`' % (sym, parts, key)
+            raise
     return dirs
 
 
@@ -146,31 +212,45 @@
     files = 0
 
     for key, val in tree.iteritems():
+        if key == '$bloat_symbols':
+            continue
         if isinstance(val, dict):
             subtree = jsonify_tree(val, key)
             total += subtree['data']['$area']
             children.append(subtree)
         else:
-            total += val
+            (size, symbols) = val
+            total += size
+            assert len(symbols) == 1, symbols.values()[0] == 1
+            symbol = symbol_type_to_human(symbols.keys()[0])
             children.append({
-                    'name': key + ' ' + format_bytes(val),
-                    'data': { '$area': val }
-                    })
+                    'name': key + ' ' + format_bytes(size),
+                    'data': {
+                        '$area': size,
+                        '$symbol': symbol,
+                    }
+            })
 
     children.sort(key=lambda child: -child['data']['$area'])
-
+    dominant_symbol = ''
+    if '$bloat_symbols' in tree:
+        dominant_symbol = symbol_type_to_human(
+            max(tree['$bloat_symbols'].iteritems(),
+                key=operator.itemgetter(1))[0])
     return {
         'name': name + ' ' + format_bytes(total),
         'data': {
             '$area': total,
+            '$dominant_symbol': dominant_symbol,
             },
         'children': children,
         }
 
 
-def dump_nm(nmfile, strip_prefix):
-    dirs = treeify_syms(parse_nm(nmfile), strip_prefix)
-    print 'var kTree = ' + json.dumps(jsonify_tree(dirs, '/'), indent=2)
+def dump_nm(nmfile, strip_prefix, cppfilt):
+    dirs = treeify_syms(parse_nm(nmfile), strip_prefix, cppfilt)
+    print ('var kTree = ' +
+           json.dumps(jsonify_tree(dirs, '[everything]'), indent=2))
 
 
 def parse_objdump(input):
@@ -214,14 +294,14 @@
         }
 
 
-def dump_sections():
-    sections, debug_sections = parse_objdump(open('objdump.out'))
+def dump_sections(objdump):
+    sections, debug_sections = parse_objdump(objdump)
     sections = jsonify_sections('sections', sections)
     debug_sections = jsonify_sections('debug', debug_sections)
+    size = sections['data']['$area'] + debug_sections['data']['$area']
     print 'var kTree = ' + json.dumps({
-            'name': 'top',
-            'data': { '$area': sections['data']['$area'] +
-                               debug_sections['data']['$area'] },
+            'name': 'top ' + format_bytes(size),
+            'data': { '$area': size },
             'children': [ debug_sections, sections ]})
 
 
@@ -243,13 +323,17 @@
 parser.add_option('--nm-output', action='store', dest='nmpath',
                   metavar='PATH', default='nm.out',
                   help='path to nm output [default=nm.out]')
-parser.add_option('--objdump-output', action='store', dest='objdump',
+parser.add_option('--objdump-output', action='store', dest='objdumppath',
                   metavar='PATH', default='objdump.out',
                   help='path to objdump output [default=objdump.out]')
 parser.add_option('--strip-prefix', metavar='PATH', action='store',
                   help='strip PATH prefix from paths; e.g. /path/to/src/root')
 parser.add_option('--filter', action='store',
                   help='include only symbols/files matching FILTER')
+parser.add_option('--c++filt', action='store', metavar='PATH', dest='cppfilt',
+                  default='c++filt', help="Path to c++filt, used to demangle "
+                  "symbols that weren't handled by nm. Set to an invalid path "
+                  "to disable.")
 opts, args = parser.parse_args()
 
 if len(args) != 1:
@@ -259,9 +343,20 @@
 mode = args[0]
 if mode == 'syms':
     nmfile = open(opts.nmpath, 'r')
-    dump_nm(nmfile, strip_prefix=opts.strip_prefix)
+    try:
+        res = subprocess.check_output([opts.cppfilt, 'main'])
+        if res.strip() != 'main':
+            print >>sys.stderr, ("%s failed demangling, "
+                                 "output won't be demangled." % opts.cppfilt)
+            opts.cppfilt = None
+    except Exception:
+        print >>sys.stderr, ("Could not find c++filt at %s, "
+                             "output won't be demangled." % opts.cppfilt)
+        opts.cppfilt = None
+    dump_nm(nmfile, strip_prefix=opts.strip_prefix, cppfilt=opts.cppfilt)
 elif mode == 'sections':
-    dump_sections()
+    objdumpfile = open(opts.objdumppath, 'r')
+    dump_sections(objdumpfile)
 elif mode == 'dump':
     nmfile = open(opts.nmpath, 'r')
     syms = list(parse_nm(nmfile))
diff --git a/index.html b/index.html
index e31958f..4529e8b 100644
--- a/index.html
+++ b/index.html
@@ -28,6 +28,11 @@
   cursor: pointer;
   -webkit-user-select: none;
 }
+
+#legend {
+  width: 800px;
+  padding-top: 10px;
+}
 </style>
 
 <h1>binary size</h1>
@@ -35,6 +40,18 @@
 <p>Click on a box to zoom in.  Click on the outermost box to zoom out.</p>
 
 <div id='map'></div>
+<div id='legend'>
+<div class='webtreemap-symbol-bss'>bss</div>
+<div class='webtreemap-symbol-bss webtreemap-aggregate'>bss aggregate</div>
+<div class='webtreemap-symbol-data'>data</div>
+<div class='webtreemap-symbol-data webtreemap-aggregate'>data aggregate</div>
+<div class='webtreemap-symbol-read-only_data'>read-only data</div>
+<div class='webtreemap-symbol-read-only_data webtreemap-aggregate'>read-only data aggregate</div>
+<div class='webtreemap-symbol-code'>code</div>
+<div class='webtreemap-symbol-code webtreemap-aggregate'>code aggregate</div>
+<div class='webtreemap-symbol-weak_symbol'>weak symbol</div>
+<div class='webtreemap-symbol-weak_symbol webtreemap-aggregate'>weak symbol aggregate</div>
+</div>
 
 <script src='webtreemap/webtreemap.js'></script>