import
diff --git a/README.markdown b/README.markdown
new file mode 100644
index 0000000..aab7cbe
--- /dev/null
+++ b/README.markdown
@@ -0,0 +1,25 @@
+# bloat
+
+Generate [webtreemap][]-compatible JSON summaries of binary size.
+
+[webtreemap]: http://github.com/martine/webtreemap
+
+## Setup
+
+1) Check out a copy of webtreemap in a `webtreemap` subdirectory:
+
+        git clone git://github.com/martine/webtreemap.git
+
+2) Build your binary with the `-g` flag to get symbols.
+
+3) Run `./bloat.py --help` and generate `nm.out` as instructed there.
+
+4) Example command line:
+
+        ./bloat.py --strip-prefix=/path/to/src syms > bloat.json
+
+## Other features
+
+Dump large symbols:
+
+    $ ./bloat.py dump | head -20
diff --git a/bloat.py b/bloat.py
new file mode 100755
index 0000000..e9d58f6
--- /dev/null
+++ b/bloat.py
@@ -0,0 +1,284 @@
+#!/usr/bin/python
+
+import fileinput
+import optparse
+import os
+import pprint
+import re
+import sys
+import json
+
+def format_bytes(bytes):
+    """Pretty-print a byte count using decimal units: '1.5m', '2.3k', '512'."""
+    # Use >= so exact multiples roll over (1000 -> '1.0k', not '1000').
+    if bytes >= 1e6:
+        return '%.1fm' % (bytes / 1.0e6)
+    if bytes >= 1e3:
+        return '%.1fk' % (bytes / 1.0e3)
+    # Small values are printed verbatim, without a unit suffix.
+    return str(bytes)
+
+
+def symbol_type_to_human(type):
+    """Convert an nm symbol type letter into a human-readable name.
+
+    Letters with no known name (e.g. 'a', 'n') come back as
+    "other (<letter>)" rather than raising KeyError mid-listing.
+    """
+    names = {
+        'b': 'bss', 'd': 'data', 'r': 'read-only data', 't': 'code',
+        'w': 'weak symbol', 'v': 'weak symbol'}
+    return names.get(type, 'other (%s)' % type)
+
+
+def parse_nm(input):
+    """Parse nm output.
+
+    Argument: an iterable over lines of nm output.
+
+    Yields: (symbol name, symbol type, symbol size, source file path).
+    The type is lowercased, with 'v' folded into 'w'; BSS symbols are
+    skipped.  Path may be None if nm couldn't figure out the source file.
+    Lines that are not recognized are reported on stderr.
+    """
+
+    # Addresses and sizes are 8 hex digits wide for 32-bit binaries and 16
+    # for 64-bit ones, so accept "8 or more" rather than exactly 8.
+
+    # Match lines with size + symbol + optional filename.
+    sym_re = re.compile(
+        r'^[0-9a-f]{8,} ([0-9a-f]{8,}) (.) ([^\t]+)(?:\t(.*):\d+)?$')
+
+    # Match lines with addr but no size.
+    addr_re = re.compile(r'^[0-9a-f]{8,} (.) ([^\t]+)(?:\t.*)?$')
+    # Match lines that don't have an address at all -- typically external
+    # symbols.  The blank address column is as wide as a real address.
+    noaddr_re = re.compile(r'^ {8,} (.) (.*)$')
+
+    for line in input:
+        line = line.rstrip()
+        match = sym_re.match(line)
+        if match:
+            size, type, sym, path = match.groups()
+            type = type.lower()
+            if type == 'v':
+                type = 'w'  # just call them all weak
+            if type == 'b':
+                continue  # skip all BSS for now
+            yield sym, type, int(size, 16), path
+            continue
+        match = addr_re.match(line)
+        if match:
+            continue  # No size == we don't care.
+        match = noaddr_re.match(line)
+        if match and match.group(1) in ('U', 'w'):
+            continue  # external or weak symbol
+
+        print >>sys.stderr, 'unparsed:', repr(line)
+
+
+def filter_syms(types, symbols):
+    """Yield the (sym, type, size, path) tuples whose type is in `types`."""
+    for entry in ((s, t, n, p) for s, t, n, p in symbols if t in types):
+        yield entry
+
+
+def treeify_syms(symbols, strip_prefix=None):
+    """Fold (sym, type, size, path) tuples into a nested dict of sizes.
+
+    Directories become nested dicts and files map to summed sizes; symbols
+    with no path, or no '/' in it, land under 'symbols without paths'.
+    """
+    dirs = {}
+    for sym, type, size, path in symbols:
+        if path:
+            path = os.path.normpath(path)
+            if strip_prefix and path.startswith(strip_prefix):
+                # A prefix without a trailing slash would leave a leading '/'.
+                path = path[len(strip_prefix):].lstrip('/')
+            elif path.startswith('/usr/include'):
+                path = path.replace('/usr/include', 'usrinclude')
+            elif path.startswith('/'):
+                path = path[1:]
+
+        parts = None
+        # TODO: make segmenting by namespace work.
+        if False and '::' in sym:
+            if sym.startswith('vtable for '):
+                sym = sym[len('vtable for '):]
+                parts = sym.split('::')
+                parts.append('[vtable]')
+            else:
+                parts = sym.split('::')
+            parts[0] = '::' + parts[0]
+        elif path and '/' in path:
+            parts = path.split('/')
+
+        if parts:
+            key = parts.pop()
+            tree = dirs
+            try:
+                for part in parts:
+                    assert part != '', path
+                    tree = tree.setdefault(part, {})
+                tree[key] = tree.get(key, 0) + size
+            except:
+                print >>sys.stderr, sym, parts, key
+                raise
+        else:
+            tree = dirs.setdefault('symbols without paths', {})
+            subkey = 'misc'
+            if sym.endswith(('::__FUNCTION__', '::__PRETTY_FUNCTION__')):
+                subkey = '__FUNCTION__'
+            elif sym.startswith('CSWTCH.'):
+                subkey = 'CSWTCH'
+            elif '::' in sym:
+                subkey = sym[0:sym.find('::') + 2]
+            else:
+                print >>sys.stderr, 'unbucketed (no path?):', sym, type, size, path
+            tree[subkey] = tree.get(subkey, 0) + size
+    return dirs
+
+
+def jsonify_tree(tree, name):
+    """Convert a nested size dict into a treemap node labelled `name`.
+
+    tree maps each key to a byte count (leaf) or to another such dict.
+    The result has a 'name' (label plus formatted total), a 'data' dict
+    holding the total as '$area', and 'children' sorted largest first.
+    """
+    def area_of(node):
+        return node['data']['$area']
+
+    children = []
+    for label, value in tree.iteritems():
+        if isinstance(value, dict):
+            node = jsonify_tree(value, label)
+        else:
+            node = {
+                'name': label + ' ' + format_bytes(value),
+                'data': {'$area': value},
+                }
+        children.append(node)
+
+    total = sum(area_of(child) for child in children)
+    children.sort(key=lambda child: -area_of(child))
+
+    heading = name + ' ' + format_bytes(total)
+    return {'name': heading, 'data': {'$area': total}, 'children': children}
+
+
+def dump_nm(nmfile, strip_prefix):
+    tree = jsonify_tree(treeify_syms(parse_nm(nmfile), strip_prefix), '/')
+    print 'var kTree = ' + json.dumps(tree, indent=2)  # JS loaded by index.html
+
+
+def parse_objdump(input):
+    """Parse objdump -h output into (sections, debug_sections).
+
+    Both are lists of (name, size); names lose '.' and 'debug_' prefixes.
+    """
+    # Section rows look like "<index> <name> <hex size> ...".
+    sec_re = re.compile(r'^\d+ (\S+) +([0-9a-f]+)\b')
+    sections, debug_sections = [], []
+    for line in input:
+        match = sec_re.match(line.strip())
+        if not match:
+            continue
+        name, size = match.group(1), int(match.group(2), 16)
+        if name.startswith('.'):
+            name = name[1:]
+        if name.startswith('debug_'):
+            debug_sections.append((name[len('debug_'):], size))
+        else:
+            sections.append((name, size))
+    return sections, debug_sections
+
+
+def jsonify_sections(name, sections):
+    """Build a treemap node labelled `name` from (section, size) pairs.
+
+    The node's '$area' is the sum of all sizes; its children are one leaf
+    per section, sorted largest first.
+    """
+    leaves = [{'name': label + ' ' + format_bytes(nbytes),
+               'data': {'$area': nbytes}}
+              for label, nbytes in sections]
+    total = sum(leaf['data']['$area'] for leaf in leaves)
+    leaves.sort(key=lambda leaf: -leaf['data']['$area'])
+
+    return {
+        'name': name + ' ' + format_bytes(total),
+        'data': {'$area': total},
+        'children': leaves,
+        }
+
+
+def dump_sections():
+    """Print sizes from the opts.objdump (--objdump-output) file as kTree JS."""
+    sections, debug_sections = parse_objdump(open(opts.objdump))
+    sections = jsonify_sections('sections', sections)
+    debug_sections = jsonify_sections('debug', debug_sections)
+    total = sections['data']['$area'] + debug_sections['data']['$area']
+    top = {'name': 'top', 'data': {'$area': total},
+           'children': [debug_sections, sections]}
+    print 'var kTree = ' + json.dumps(top)
+
+
+usage="""%prog [options] MODE
+
+Modes are:
+  syms: output symbols json suitable for a treemap
+  dump: print symbols sorted by size (pipe to head for best output)
+  sections: output binary sections json suitable for a treemap
+
+nm output passed to --nm-output should be from running a command
+like the following (note, can take a long time -- 30 minutes):
+  nm -C -S -l /path/to/binary > nm.out
+
+objdump output passed to --objdump-output should be from a command
+like:
+  objdump -h /path/to/binary > objdump.out"""
+parser = optparse.OptionParser(usage=usage)
+parser.add_option('--nm-output', action='store', dest='nmpath',
+                  metavar='PATH', default='nm.out',
+                  help='path to nm output [default=nm.out]')
+parser.add_option('--objdump-output', action='store', dest='objdump',
+                  metavar='PATH', default='objdump.out',
+                  help='path to objdump output [default=objdump.out]')
+parser.add_option('--strip-prefix', metavar='PATH', action='store',
+                  help='strip PATH prefix from paths; e.g. /path/to/src/root')
+parser.add_option('--filter', action='store',
+                  help='include only symbols/files matching FILTER')
+opts, args = parser.parse_args()  # args must hold exactly one MODE word
+
+if len(args) != 1:
+    parser.print_usage()
+    sys.exit(1)
+
+# Dispatch on the single positional MODE argument (see usage above).
+mode = args[0]
+if mode == 'syms':
+    dump_nm(open(opts.nmpath, 'r'), strip_prefix=opts.strip_prefix)
+elif mode == 'sections':
+    dump_sections()
+elif mode == 'dump':
+    syms = list(parse_nm(open(opts.nmpath, 'r')))
+    # a list of (sym, type, size, path); sort by size, largest first.
+    syms.sort(key=lambda x: -x[2])
+    total = 0
+    for sym, type, size, path in syms:
+        if type in ('b', 'w'):
+            continue  # skip bss and weak symbols
+        path = path or ''  # parse_nm yields None when nm found no source file
+        if opts.filter and not (opts.filter in sym or opts.filter in path):
+            continue
+        print '%6s %s (%s) %s' % (format_bytes(size), sym,
+                                  symbol_type_to_human(type), path)
+        total += size
+    print '%6s %s' % (format_bytes(total), 'total'),
+else:
+    # Fail like the wrong-argument-count case so callers can detect it.
+    print 'unknown mode'
+    parser.print_usage()
+    sys.exit(1)
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..e31958f
--- /dev/null
+++ b/index.html
@@ -0,0 +1,44 @@
+<!DOCTYPE html>
+<title>binary size</title>
+<script src=bloat.json></script>
+<link rel=stylesheet href=webtreemap/webtreemap.css>
+<style>
+body {
+  font-family: sans-serif;
+  font-size: 0.8em;
+  margin: 2ex 4ex;
+}
+
+tt, pre {
+  font-family: WebKitWorkaround, monospace;
+}
+
+h1, h2 {
+  font-weight: normal;
+}
+h2 {
+  margin-top: 4ex;
+}
+
+#map {
+  width: 800px;
+  height: 600px;
+
+  position: relative;
+  cursor: pointer;
+  -webkit-user-select: none;
+}
+</style>
+
+<h1>binary size</h1>
+
+<p>Click on a box to zoom in.  Click on the outermost box to zoom out.</p>
+
+<div id='map'></div>
+
+<script src='webtreemap/webtreemap.js'></script>
+
+<script>
+var map = document.getElementById('map');  // the #map <div> declared above
+appendTreemap(map, kTree);  // kTree is the global assigned in bloat.json
+</script>