chrome/tools/history-viz.py - chromium/src.git - Git at Google

 #!/usr/bin/env python
 # Copyright 2011 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Process a History database and dump a .dot file suitable for GraphViz.

 This is useful for debugging history redirect flows.

 An example run of this program:
   python /src/history-viz.py History > foo.dot
   /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
 """

 import struct
 import subprocess
 import sys
 import urlparse


 # Some transition types, copied from page_transition_types.h.
 TRANS_TYPES = {
   0: 'link',
   1: 'typed',
   2: 'most-visited',
   3: 'auto subframe',
   7: 'form',
 }


 class URL(object):
   """Represents a broken-down URL from our most visited database."""

   def __init__(self, id, url):
     """Initialize a new URL object.  |id| is the database id of the URL."""
     self.id = id
     self.url = url
     scheme, loc, path, query, fragment = urlparse.urlsplit(url)
     if scheme == 'http':
       scheme = ''  # Shorten for display purposes.
     if len(scheme) > 0:
       scheme += '://'
     self.host = scheme + loc
     self.path = path

     extra = ''
     if len(query) > 0:
       extra += '?' + query
     if len(fragment) > 0 or url.find('#') > 0:
       extra += '#' + fragment
     self.extra = extra

   def PrettyPrint(self, include_host=True, include_path=True):
     """Pretty-print this URL in a form more suitable for the graph.

     This will elide very long paths and potentially puts newlines between parts
     of long components.  include_host and include_path determine whether to
     include the host and path in the output.

     Returns: the pretty-printed string."""
     MAX_LEN = 30  # Maximum length of a line in the output.
     parts = []
     if include_host:
       parts.append(self.host)
     if include_path:
       parts.append(self.path)
     parts.append(self.extra)
     lines = []
     line = ''
     for part in parts:
       if len(part) > MAX_LEN:
         part = part[0:MAX_LEN-3] + '...'
       if len(line)+len(part) > MAX_LEN:
         lines.append(line)
         line = ''
       line += part
     if len(line) > 0:
       lines.append(line)
     return '\n'.join(lines)


 class Edge(object):
   """Represents an edge in the history graph, connecting two pages.

   If a link is traversed twice, it is one Edge with two entries in
   the .transitions array."""
   def __init__(self, src, dst):
     self.src = src
     self.dst = dst
     self.transitions = []

   def Transitions(self):
     """Return a dictionary mapping transition type -> occurences."""
     all = {}
     for trans in self.transitions:
       all[trans] = all.get(trans, 0) + 1
       # We currently don't use the chain type.
       # TODO(evanm): make this a command-line option.
       # if trans & 0x30000000 != 0:
       #   chain = ''
       #   if trans & 0x10000000:
       #     chain = 'start'
       #   if trans & 0x20000000:
       #     if len(chain) == 0:
       #       chain = 'end'
       #     else:
       #       chain = ''
       #   if len(chain) > 0:
       #     edge['chain'] = chain
     return all


 def ClusterBy(objs, pred):
   """Group a list of objects by a predicate.

   Given a list of objects and a predicate over the objects, return a
   dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
   clusters = {}
   for obj in objs:
     cluster = pred(obj)
     clusters[cluster] = clusters.get(cluster, [])
     clusters[cluster].append(obj)
   return clusters


 def EscapeDot(string):
   """Escape a string suitable for embedding in a graphviz graph."""
   # TODO(evanm): this is likely not sufficient.
   return string.replace('\n', '\\n')


 class SQLite(object):
   """Trivial interface to executing SQLite queries.
   Spawns a new process with each call."""
   def __init__(self, file=None):
     self.file = file

   def Run(self, sql):
     """Execute |sql|, yielding each row of results as an array."""
     subproc = subprocess.Popen(['sqlite', self.file],
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
     subproc.stdin.write('.mode tabs\n')
     subproc.stdin.write(sql + ';')
     subproc.stdin.close()
     for line in subproc.stdout:
       row = line.strip().split('\t')
       yield row


 def LoadHistory(filename):
   db = SQLite(filename)

   urls = {}  # Map of urlid => url.
   urls['0'] = URL('0', 'start')  # Node name '0' is our special 'start' node.
   for id, url in db.Run('SELECT id, url FROM urls'):
     urls[id] = URL(id, url)

   visiturlids = {}  # Map of visitid => urlid.
   visiturlids['0'] = '0'  # '0' is our special 'start' node.
   edges = {}  # Map of urlid->urlid->Edge.
   for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
                                      'FROM visits ORDER BY id'):
     visiturlids[dst] = url
     src = visiturlids[src]
     dst = visiturlids[dst]
     edges[src] = edges.get(src, {})
     edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
     # SQLite outputs transition values as signed integers, but they're really
     # a bitfield.  Below does "unsigned trans = static_cast<unsigned>(trans)".
     trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
     edge.transitions.append(trans)

   return urls, edges


 def main():
   urls, edges = LoadHistory(sys.argv[1])
   print 'digraph G {'
   print '  graph [rankdir=LR]'  # Display left to right.
   print '  node [shape=box]'    # Display nodes as boxes.
   print '  subgraph { rank=source; 0 [label="start"] }'

   # Output all the nodes within graph clusters.
   hosts = ClusterBy(urls.values(), lambda url: url.host)
   for i, (host, urls) in enumerate(hosts.items()):
     # Cluster all URLs under this host if it has more than one entry.
     host_clustered = len(urls) > 1
     if host_clustered:
       print 'subgraph clusterhost%d {' % i
       print '  label="%s"' % host
     paths = ClusterBy(urls, lambda url: url.path)
     for j, (path, urls) in enumerate(paths.items()):
       # Cluster all URLs under this host if it has more than one entry.
       path_clustered = host_clustered and len(urls) > 1
       if path_clustered:
         print '  subgraph cluster%d%d {' % (i, j)
         print '    label="%s"' % path
       for url in urls:
         if url.id == '0': continue  # We already output the special start node.
         pretty = url.PrettyPrint(include_host=not host_clustered,
                                 include_path=not path_clustered)
         print '    %s [label="%s"]' % (url.id, EscapeDot(pretty))
       if path_clustered:
         print '  }'
     if host_clustered:
       print '}'

   # Output all the edges between nodes.
   for src, dsts in edges.items():
     for dst, edge in dsts.items():
       # Gather up all the transitions into the label.
       label = []      # Label for the edge.
       transitions = edge.Transitions()
       for trans, count in transitions.items():
         text = ''
         if count > 1:
           text = '%dx ' % count
         base_type = trans & 0xFF
         redir = (trans & 0xC0000000) != 0
         start = (trans & 0x10000000) != 0
         end = (trans & 0x20000000) != 0
         if start or end:
           if start:
             text += '<'
           if end:
             text += '>'
           text += ' '
         if redir:
           text += 'R '
         text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
         label.append(text)
       if len(label) == 0:
         continue

       edgeattrs = []  # Graphviz attributes for the edge.
       # If the edge is from the start and the transitions are fishy, make it
       # display as a dotted line.
       if src == '0' and len(transitions.keys()) == 1 and 0 in transitions:
         edgeattrs.append('style=dashed')
       if len(label) > 0:
         edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))

       out = '%s -> %s' % (src, dst)
       if len(edgeattrs) > 0:
         out += ' [%s]' % ','.join(edgeattrs)
       print out
   print '}'
   return 0


 if __name__ == '__main__':
   sys.exit(main())
	#!/usr/bin/env python
	# Copyright 2011 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Process a History database and dump a .dot file suitable for GraphViz.

	This is useful for debugging history redirect flows.

	An example run of this program:
	python /src/history-viz.py History > foo.dot
	/c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
	"""

	import struct
	import subprocess
	import sys
	import urlparse


	# Some transition types, copied from page_transition_types.h.
	TRANS_TYPES = {
	0: 'link',
	1: 'typed',
	2: 'most-visited',
	3: 'auto subframe',
	7: 'form',
	}


	class URL(object):
	"""Represents a broken-down URL from our most visited database."""

	def __init__(self, id, url):
	"""Initialize a new URL object. \|id\| is the database id of the URL."""
	self.id = id
	self.url = url
	scheme, loc, path, query, fragment = urlparse.urlsplit(url)
	if scheme == 'http':
	scheme = '' # Shorten for display purposes.
	if len(scheme) > 0:
	scheme += '://'
	self.host = scheme + loc
	self.path = path

	extra = ''
	if len(query) > 0:
	extra += '?' + query
	if len(fragment) > 0 or url.find('#') > 0:
	extra += '#' + fragment
	self.extra = extra

	def PrettyPrint(self, include_host=True, include_path=True):
	"""Pretty-print this URL in a form more suitable for the graph.

	This will elide very long paths and potentially puts newlines between parts
	of long components. include_host and include_path determine whether to
	include the host and path in the output.

	Returns: the pretty-printed string."""
	MAX_LEN = 30 # Maximum length of a line in the output.
	parts = []
	if include_host:
	parts.append(self.host)
	if include_path:
	parts.append(self.path)
	parts.append(self.extra)
	lines = []
	line = ''
	for part in parts:
	if len(part) > MAX_LEN:
	part = part[0:MAX_LEN-3] + '...'
	if len(line)+len(part) > MAX_LEN:
	lines.append(line)
	line = ''
	line += part
	if len(line) > 0:
	lines.append(line)
	return '\n'.join(lines)


	class Edge(object):
	"""Represents an edge in the history graph, connecting two pages.

	If a link is traversed twice, it is one Edge with two entries in
	the .transitions array."""
	def __init__(self, src, dst):
	self.src = src
	self.dst = dst
	self.transitions = []

	def Transitions(self):
	"""Return a dictionary mapping transition type -> occurences."""
	all = {}
	for trans in self.transitions:
	all[trans] = all.get(trans, 0) + 1
	# We currently don't use the chain type.
	# TODO(evanm): make this a command-line option.
	# if trans & 0x30000000 != 0:
	# chain = ''
	# if trans & 0x10000000:
	# chain = 'start'
	# if trans & 0x20000000:
	# if len(chain) == 0:
	# chain = 'end'
	# else:
	# chain = ''
	# if len(chain) > 0:
	# edge['chain'] = chain
	return all


	def ClusterBy(objs, pred):
	"""Group a list of objects by a predicate.

	Given a list of objects and a predicate over the objects, return a
	dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
	clusters = {}
	for obj in objs:
	cluster = pred(obj)
	clusters[cluster] = clusters.get(cluster, [])
	clusters[cluster].append(obj)
	return clusters


	def EscapeDot(string):
	"""Escape a string suitable for embedding in a graphviz graph."""
	# TODO(evanm): this is likely not sufficient.
	return string.replace('\n', '\\n')


	class SQLite(object):
	"""Trivial interface to executing SQLite queries.
	Spawns a new process with each call."""
	def __init__(self, file=None):
	self.file = file

	def Run(self, sql):
	"""Execute \|sql\|, yielding each row of results as an array."""
	subproc = subprocess.Popen(['sqlite', self.file],
	stdin=subprocess.PIPE,
	stdout=subprocess.PIPE)
	subproc.stdin.write('.mode tabs\n')
	subproc.stdin.write(sql + ';')
	subproc.stdin.close()
	for line in subproc.stdout:
	row = line.strip().split('\t')
	yield row


	def LoadHistory(filename):
	db = SQLite(filename)

	urls = {} # Map of urlid => url.
	urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node.
	for id, url in db.Run('SELECT id, url FROM urls'):
	urls[id] = URL(id, url)

	visiturlids = {} # Map of visitid => urlid.
	visiturlids['0'] = '0' # '0' is our special 'start' node.
	edges = {} # Map of urlid->urlid->Edge.
	for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
	'FROM visits ORDER BY id'):
	visiturlids[dst] = url
	src = visiturlids[src]
	dst = visiturlids[dst]
	edges[src] = edges.get(src, {})
	edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
	# SQLite outputs transition values as signed integers, but they're really
	# a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)".
	trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
	edge.transitions.append(trans)

	return urls, edges


	def main():
	urls, edges = LoadHistory(sys.argv[1])
	print 'digraph G {'
	print ' graph [rankdir=LR]' # Display left to right.
	print ' node [shape=box]' # Display nodes as boxes.
	print ' subgraph { rank=source; 0 [label="start"] }'

	# Output all the nodes within graph clusters.
	hosts = ClusterBy(urls.values(), lambda url: url.host)
	for i, (host, urls) in enumerate(hosts.items()):
	# Cluster all URLs under this host if it has more than one entry.
	host_clustered = len(urls) > 1
	if host_clustered:
	print 'subgraph clusterhost%d {' % i
	print ' label="%s"' % host
	paths = ClusterBy(urls, lambda url: url.path)
	for j, (path, urls) in enumerate(paths.items()):
	# Cluster all URLs under this host if it has more than one entry.
	path_clustered = host_clustered and len(urls) > 1
	if path_clustered:
	print ' subgraph cluster%d%d {' % (i, j)
	print ' label="%s"' % path
	for url in urls:
	if url.id == '0': continue # We already output the special start node.
	pretty = url.PrettyPrint(include_host=not host_clustered,
	include_path=not path_clustered)
	print ' %s [label="%s"]' % (url.id, EscapeDot(pretty))
	if path_clustered:
	print ' }'
	if host_clustered:
	print '}'

	# Output all the edges between nodes.
	for src, dsts in edges.items():
	for dst, edge in dsts.items():
	# Gather up all the transitions into the label.
	label = [] # Label for the edge.
	transitions = edge.Transitions()
	for trans, count in transitions.items():
	text = ''
	if count > 1:
	text = '%dx ' % count
	base_type = trans & 0xFF
	redir = (trans & 0xC0000000) != 0
	start = (trans & 0x10000000) != 0
	end = (trans & 0x20000000) != 0
	if start or end:
	if start:
	text += '<'
	if end:
	text += '>'
	text += ' '
	if redir:
	text += 'R '
	text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
	label.append(text)
	if len(label) == 0:
	continue

	edgeattrs = [] # Graphviz attributes for the edge.
	# If the edge is from the start and the transitions are fishy, make it
	# display as a dotted line.
	if src == '0' and len(transitions.keys()) == 1 and 0 in transitions:
	edgeattrs.append('style=dashed')
	if len(label) > 0:
	edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))

	out = '%s -> %s' % (src, dst)
	if len(edgeattrs) > 0:
	out += ' [%s]' % ','.join(edgeattrs)
	print out
	print '}'
	return 0


	if __name__ == '__main__':
	sys.exit(main())