| #!/usr/bin/env python |
| # Copyright 2011 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Process a History database and dump a .dot file suitable for GraphViz. |
| |
| This is useful for debugging history redirect flows. |
| |
| An example run of this program: |
| python /src/history-viz.py History > foo.dot |
| /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png |
| """ |
| |
| import struct |
| import subprocess |
| import sys |
| import urlparse |
| |
| |
| # Some transition types, copied from page_transition_types.h. |
| TRANS_TYPES = { |
| 0: 'link', |
| 1: 'typed', |
| 2: 'most-visited', |
| 3: 'auto subframe', |
| 7: 'form', |
| } |
| |
| |
| class URL(object): |
| """Represents a broken-down URL from our most visited database.""" |
| |
| def __init__(self, id, url): |
| """Initialize a new URL object. |id| is the database id of the URL.""" |
| self.id = id |
| self.url = url |
| scheme, loc, path, query, fragment = urlparse.urlsplit(url) |
| if scheme == 'http': |
| scheme = '' # Shorten for display purposes. |
| if len(scheme) > 0: |
| scheme += '://' |
| self.host = scheme + loc |
| self.path = path |
| |
| extra = '' |
| if len(query) > 0: |
| extra += '?' + query |
| if len(fragment) > 0 or url.find('#') > 0: |
| extra += '#' + fragment |
| self.extra = extra |
| |
| def PrettyPrint(self, include_host=True, include_path=True): |
| """Pretty-print this URL in a form more suitable for the graph. |
| |
| This will elide very long paths and potentially puts newlines between parts |
| of long components. include_host and include_path determine whether to |
| include the host and path in the output. |
| |
| Returns: the pretty-printed string.""" |
| MAX_LEN = 30 # Maximum length of a line in the output. |
| parts = [] |
| if include_host: |
| parts.append(self.host) |
| if include_path: |
| parts.append(self.path) |
| parts.append(self.extra) |
| lines = [] |
| line = '' |
| for part in parts: |
| if len(part) > MAX_LEN: |
| part = part[0:MAX_LEN-3] + '...' |
| if len(line)+len(part) > MAX_LEN: |
| lines.append(line) |
| line = '' |
| line += part |
| if len(line) > 0: |
| lines.append(line) |
| return '\n'.join(lines) |
| |
| |
| class Edge(object): |
| """Represents an edge in the history graph, connecting two pages. |
| |
| If a link is traversed twice, it is one Edge with two entries in |
| the .transitions array.""" |
| def __init__(self, src, dst): |
| self.src = src |
| self.dst = dst |
| self.transitions = [] |
| |
| def Transitions(self): |
| """Return a dictionary mapping transition type -> occurences.""" |
| all = {} |
| for trans in self.transitions: |
| all[trans] = all.get(trans, 0) + 1 |
| # We currently don't use the chain type. |
| # TODO(evanm): make this a command-line option. |
| # if trans & 0x30000000 != 0: |
| # chain = '' |
| # if trans & 0x10000000: |
| # chain = 'start' |
| # if trans & 0x20000000: |
| # if len(chain) == 0: |
| # chain = 'end' |
| # else: |
| # chain = '' |
| # if len(chain) > 0: |
| # edge['chain'] = chain |
| return all |
| |
| |
| def ClusterBy(objs, pred): |
| """Group a list of objects by a predicate. |
| |
| Given a list of objects and a predicate over the objects, return a |
| dictionary mapping pred(obj) -> all objs with the same pred(obj).""" |
| clusters = {} |
| for obj in objs: |
| cluster = pred(obj) |
| clusters[cluster] = clusters.get(cluster, []) |
| clusters[cluster].append(obj) |
| return clusters |
| |
| |
| def EscapeDot(string): |
| """Escape a string suitable for embedding in a graphviz graph.""" |
| # TODO(evanm): this is likely not sufficient. |
| return string.replace('\n', '\\n') |
| |
| |
| class SQLite(object): |
| """Trivial interface to executing SQLite queries. |
| Spawns a new process with each call.""" |
| def __init__(self, file=None): |
| self.file = file |
| |
| def Run(self, sql): |
| """Execute |sql|, yielding each row of results as an array.""" |
| subproc = subprocess.Popen(['sqlite', self.file], |
| stdin=subprocess.PIPE, |
| stdout=subprocess.PIPE) |
| subproc.stdin.write('.mode tabs\n') |
| subproc.stdin.write(sql + ';') |
| subproc.stdin.close() |
| for line in subproc.stdout: |
| row = line.strip().split('\t') |
| yield row |
| |
| |
| def LoadHistory(filename): |
| db = SQLite(filename) |
| |
| urls = {} # Map of urlid => url. |
| urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node. |
| for id, url in db.Run('SELECT id, url FROM urls'): |
| urls[id] = URL(id, url) |
| |
| visiturlids = {} # Map of visitid => urlid. |
| visiturlids['0'] = '0' # '0' is our special 'start' node. |
| edges = {} # Map of urlid->urlid->Edge. |
| for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition ' |
| 'FROM visits ORDER BY id'): |
| visiturlids[dst] = url |
| src = visiturlids[src] |
| dst = visiturlids[dst] |
| edges[src] = edges.get(src, {}) |
| edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst)) |
| # SQLite outputs transition values as signed integers, but they're really |
| # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)". |
| trans = struct.unpack('I', struct.pack('i', int(trans)))[0] |
| edge.transitions.append(trans) |
| |
| return urls, edges |
| |
| |
| def main(): |
| urls, edges = LoadHistory(sys.argv[1]) |
| print 'digraph G {' |
| print ' graph [rankdir=LR]' # Display left to right. |
| print ' node [shape=box]' # Display nodes as boxes. |
| print ' subgraph { rank=source; 0 [label="start"] }' |
| |
| # Output all the nodes within graph clusters. |
| hosts = ClusterBy(urls.values(), lambda url: url.host) |
| for i, (host, urls) in enumerate(hosts.items()): |
| # Cluster all URLs under this host if it has more than one entry. |
| host_clustered = len(urls) > 1 |
| if host_clustered: |
| print 'subgraph clusterhost%d {' % i |
| print ' label="%s"' % host |
| paths = ClusterBy(urls, lambda url: url.path) |
| for j, (path, urls) in enumerate(paths.items()): |
| # Cluster all URLs under this host if it has more than one entry. |
| path_clustered = host_clustered and len(urls) > 1 |
| if path_clustered: |
| print ' subgraph cluster%d%d {' % (i, j) |
| print ' label="%s"' % path |
| for url in urls: |
| if url.id == '0': continue # We already output the special start node. |
| pretty = url.PrettyPrint(include_host=not host_clustered, |
| include_path=not path_clustered) |
| print ' %s [label="%s"]' % (url.id, EscapeDot(pretty)) |
| if path_clustered: |
| print ' }' |
| if host_clustered: |
| print '}' |
| |
| # Output all the edges between nodes. |
| for src, dsts in edges.items(): |
| for dst, edge in dsts.items(): |
| # Gather up all the transitions into the label. |
| label = [] # Label for the edge. |
| transitions = edge.Transitions() |
| for trans, count in transitions.items(): |
| text = '' |
| if count > 1: |
| text = '%dx ' % count |
| base_type = trans & 0xFF |
| redir = (trans & 0xC0000000) != 0 |
| start = (trans & 0x10000000) != 0 |
| end = (trans & 0x20000000) != 0 |
| if start or end: |
| if start: |
| text += '<' |
| if end: |
| text += '>' |
| text += ' ' |
| if redir: |
| text += 'R ' |
| text += TRANS_TYPES.get(base_type, 'trans%d' % base_type) |
| label.append(text) |
| if len(label) == 0: |
| continue |
| |
| edgeattrs = [] # Graphviz attributes for the edge. |
| # If the edge is from the start and the transitions are fishy, make it |
| # display as a dotted line. |
| if src == '0' and len(transitions.keys()) == 1 and 0 in transitions: |
| edgeattrs.append('style=dashed') |
| if len(label) > 0: |
| edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label))) |
| |
| out = '%s -> %s' % (src, dst) |
| if len(edgeattrs) > 0: |
| out += ' [%s]' % ','.join(edgeattrs) |
| print out |
| print '}' |
| return 0 |
| |
| |
| if __name__ == '__main__': |
| sys.exit(main()) |