# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Models for loading in chrome.
(Redirect the following to the general model module once we have one)
A model is an object with the following methods.
CostMs(): return the cost of the model in milliseconds.
Set(): set model-specific parameters.
ResourceGraph
This creates a DAG of resource dependencies from loading.log_requests to model
loading time. The model may be parameterized by changing the loading time of
a particular or all resources.
"""
import logging
import os
import sys
import urlparse
import dag
import loading_trace
import request_dependencies_lens
class ResourceGraph(object):
"""A model of loading by a DAG (tree?) of resource dependancies.
Set parameters:
cache_all: if true, assume zero loading time for all resources.
"""
def __init__(self, trace, content_lens=None):
"""Create from a LoadingTrace (or json of a trace).
Args:
trace: (LoadingTrace/JSON) Loading trace or JSON of a trace.
content_lens: (ContentClassificationLens) Lens used to annotate the
nodes, or None.
"""
    if isinstance(trace, dict):
trace = loading_trace.LoadingTrace.FromJsonDict(trace)
self._content_lens = content_lens
self._BuildDag(trace)
self._global_start = min([n.StartTime() for n in self._node_info])
    # Sort before splitting children so that we can correctly detect if a
# reparented child is actually a dependency for a child of its new parent.
try:
for n in dag.TopologicalSort(self._nodes):
self._SplitChildrenByTime(self._node_info[n.Index()])
except AssertionError as exc:
sys.stderr.write('Bad topological sort: %s\n'
'Skipping child split\n' % str(exc))
self._cache_all = False
self._node_filter = lambda _: True
@classmethod
def CheckImageLoadConsistency(cls, g1, g2):
"""Check that images have the same dependancies between ResourceGraphs.
Image resources are identified by their short names.
Args:
g1: a ResourceGraph instance
g2: a ResourceGraph instance
Returns:
A list of discrepancy tuples. If this list is empty, g1 and g2 are
consistent with respect to image load dependencies. Otherwise, each tuple
is of the form:
( g1 resource short name or str(list of short names),
g2 resource short name or str(list of short names),
human-readable discrepancy reason )
      Either or both of the g1 and g2 image resource short names may be None
      if not applicable to the discrepancy reason.
"""
discrepancies = []
g1_image_to_info = g1._ExtractImages()
g2_image_to_info = g2._ExtractImages()
for image in set(g1_image_to_info.keys()) - set(g2_image_to_info.keys()):
discrepancies.append((image, None, 'Missing in g2'))
for image in set(g2_image_to_info.keys()) - set(g1_image_to_info.keys()):
discrepancies.append((None, image, 'Missing in g1'))
for image in set(g1_image_to_info.keys()) & set(g2_image_to_info.keys()):
def PredecessorInfo(g, n):
info = [g._ShortName(p) for p in n.Node().Predecessors()]
info.sort()
return str(info)
g1_pred = PredecessorInfo(g1, g1_image_to_info[image])
g2_pred = PredecessorInfo(g2, g2_image_to_info[image])
if g1_pred != g2_pred:
discrepancies.append((g1_pred, g2_pred,
'Predecessor mismatch for ' + image))
return discrepancies
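  # Sketch of how CheckImageLoadConsistency might be used (assumption: two
  # traces of the same page, `trace_a` and `trace_b`, loaded elsewhere):
  #
  #   discrepancies = ResourceGraph.CheckImageLoadConsistency(
  #       ResourceGraph(trace_a), ResourceGraph(trace_b))
  #   for g1_name, g2_name, reason in discrepancies:
  #     print g1_name, g2_name, reason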
def Set(self, cache_all=None, node_filter=None):
"""Set model parameters.
    TODO(mattcary): add parameters for caching certain types of resources (just
    scripts, just cacheable, etc).
    Args:
      cache_all: boolean; if True, ignore empirical resource load times for
        all resources.
node_filter: a Node->boolean used to restrict the graph for most
operations.
"""
    if cache_all is not None:
self._cache_all = cache_all
if node_filter is not None:
self._node_filter = node_filter
def Nodes(self):
"""Return iterable of all nodes via their _NodeInfos.
Returns:
Iterable of node infos in arbitrary order.
"""
for n in self._node_info:
if self._node_filter(n.Node()):
yield n
def EdgeCosts(self, node_filter=None):
"""Edge costs.
Args:
      node_filter: if not None, a Node->boolean filter to use instead of the
current one from Set.
Returns:
The total edge costs of our graph.
"""
node_filter = self._node_filter if node_filter is None else node_filter
total = 0
for n in self._node_info:
if not node_filter(n.Node()):
continue
for s in n.Node().Successors():
if node_filter(s):
total += self._EdgeCost(n.Node(), s)
return total
def Intersect(self, other_nodes):
"""Return iterable of nodes that intersect with another graph.
Args:
      other_nodes: iterable of the nodes of another graph, e.g. from Nodes().
Returns:
an iterable of (mine, other) pairs for all nodes for which the URL is
identical.
"""
other_map = {n.Url(): n for n in other_nodes}
for n in self._node_info:
if self._node_filter(n.Node()) and n.Url() in other_map:
        yield (n, other_map[n.Url()])
def Cost(self, path_list=None):
"""Compute cost of current model.
Args:
      path_list: if not None, will be filled with the list of _NodeInfos along
        the longest path.
Returns:
Cost of the longest path.
"""
costs = [0] * len(self._nodes)
for n in dag.TopologicalSort(self._nodes, self._node_filter):
cost = 0
if n.Predecessors():
cost = max([costs[p.Index()] + self._EdgeCost(p, n)
for p in n.Predecessors()])
if not self._cache_all:
cost += self._NodeCost(n)
costs[n.Index()] = cost
max_cost = max(costs)
assert max_cost > 0 # Otherwise probably the filter went awry.
if path_list is not None:
      del path_list[:]  # Clear path_list in place before filling it.
n = (i for i in self._nodes if costs[i.Index()] == max_cost).next()
path_list.append(self._node_info[n.Index()])
while n.Predecessors():
n = reduce(lambda costliest, next:
next if (self._node_filter(next) and
costs[next.Index()] > costs[costliest.Index()])
else costliest,
n.Predecessors())
path_list.insert(0, self._node_info[n.Index()])
return max_cost
def FilterAds(self, node):
"""A filter for use in eg, Cost, to remove advertising nodes.
Args:
node: A dag.Node.
Returns:
      True if the node is neither ad-related nor tracking-related.
"""
node_info = self._node_info[node.Index()]
return not (node_info.IsAd() or node_info.IsTracking())
def MakeGraphviz(self, output, highlight=None):
"""Output a graphviz representation of our DAG.
Args:
      output: a file-like output stream which receives a graphviz dot.
highlight: a list of node items to emphasize. Any resource url which
contains any highlight text will be distinguished in the output.
"""
output.write("""digraph dependencies {
rankdir = LR;
""")
orphans = set()
try:
sorted_nodes = dag.TopologicalSort(self._nodes,
node_filter=self._node_filter)
except AssertionError as exc:
sys.stderr.write('Bad topological sort: %s\n'
'Writing children in order\n' % str(exc))
sorted_nodes = self._nodes
for n in sorted_nodes:
if not n.Successors() and not n.Predecessors():
orphans.add(n)
if orphans:
output.write("""subgraph cluster_orphans {
color=black;
label="Orphans";
""")
for n in orphans:
output.write(self._GraphvizNode(n.Index(), highlight))
output.write('}\n')
output.write("""subgraph cluster_nodes {
color=invis;
""")
for n in sorted_nodes:
if not n.Successors() and not n.Predecessors():
continue
output.write(self._GraphvizNode(n.Index(), highlight))
for n in sorted_nodes:
for s in n.Successors():
style = 'color = orange'
annotations = self._EdgeAnnotation(n, s)
if 'redirect' in annotations:
style = 'color = black'
elif 'parser' in annotations:
style = 'color = red'
elif 'stack' in annotations:
style = 'color = blue'
elif 'script_inferred' in annotations:
style = 'color = purple'
if 'timing' in annotations:
style += '; style=dashed'
arrow = '[%s; label="%s"]' % (style, self._EdgeCost(n, s))
output.write('%d -> %d %s;\n' % (n.Index(), s.Index(), arrow))
output.write('}\n}\n')
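  # Sketch of rendering MakeGraphviz output with graphviz (assumes the `dot`
  # binary is installed; file names are illustrative):
  #
  #   with open('graph.dot', 'w') as out:
  #     graph.MakeGraphviz(out, highlight=['.js'])
  #   # Then, from a shell: dot -Tpng graph.dot -o graph.png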
def ResourceInfo(self):
"""Get resource info.
Returns:
A list of _NodeInfo objects that describe the resources fetched.
"""
return self._node_info
def DebugString(self):
"""Graph structure for debugging.
TODO(mattcary): this fails for graphs with more than one component or where
self._nodes[0] is not a root.
Returns:
A human-readable string of the graph.
"""
output = []
queue = [self._nodes[0]]
visited = set()
while queue:
n = queue.pop(0)
assert n not in visited
visited.add(n)
children = n.SortedSuccessors()
output.append('%d -> [%s]' %
(n.Index(), ' '.join([str(c.Index()) for c in children])))
for c in children:
assert n in c.Predecessors() # Integrity checking
queue.append(c)
assert len(visited) == len(self._nodes)
return '\n'.join(output)
##
## Internal items
##
_CONTENT_KIND_TO_COLOR = {
'application': 'blue', # Scripts.
'font': 'grey70',
'image': 'orange', # This probably catches gifs?
'video': 'hotpink1',
}
_CONTENT_TYPE_TO_COLOR = {
'html': 'red',
'css': 'green',
'script': 'blue',
'javascript': 'blue',
'json': 'purple',
'gif': 'grey',
'image': 'orange',
'jpeg': 'orange',
'png': 'orange',
'plain': 'brown3',
'octet-stream': 'brown3',
'other': 'white',
}
# This resource type may induce a timing dependency. See _SplitChildrenByTime
# for details.
# TODO(mattcary): are these right?
_CAN_BE_TIMING_PARENT = set(['script', 'magic-debug-content'])
_CAN_MAKE_TIMING_DEPENDENCE = set(['json', 'other', 'magic-debug-content'])
class _NodeInfo(object):
"""Our internal class that adds cost and other information to nodes.
    Costs are stored on nodes as well as edges. Edge information is stored only
    on successor edges and not on predecessor edges; that is, you get it from
    the parent and not the child.
We also store the request on the node, and expose request-derived
information like content type.
"""
def __init__(self, node, request):
"""Create a new node info.
Args:
node: The node to augment.
request: The request associated with this node.
"""
self._request = request
self._is_ad = False
self._is_tracking = False
self._node = node
self._edge_costs = {}
self._edge_annotations = {}
# All fields in timing are millis relative to request_time, which is epoch
# seconds.
self._node_cost = max(
[0] + [t for f, t in request.timing._asdict().iteritems()
if f != 'request_time'])
def __str__(self):
return self.ShortName()
def Node(self):
return self._node
def Index(self):
return self._node.Index()
def SetRequestContent(self, is_ad, is_tracking):
"""Sets the kind of content the request relates to.
Args:
is_ad: (bool) Whether the request is an Ad.
is_tracking: (bool) Whether the request is related to tracking.
"""
(self._is_ad, self._is_tracking) = (is_ad, is_tracking)
def IsAd(self):
return self._is_ad
def IsTracking(self):
return self._is_tracking
def Request(self):
return self._request
def NodeCost(self):
return self._node_cost
def EdgeCost(self, s):
return self._edge_costs[s]
def StartTime(self):
return self._request.timing.request_time * 1000
def EndTime(self):
return self._request.timing.request_time * 1000 + self._node_cost
def EdgeAnnotation(self, s):
assert s.Node() in self.Node().Successors()
return self._edge_annotations.get(s, [])
def ContentType(self):
return self._request.GetContentType()
def ShortName(self):
"""Returns either the hostname of the resource, or the filename,
or the end of the path. Tries to include the domain as much as possible.
"""
parsed = urlparse.urlparse(self._request.url)
path = parsed.path
hostname = parsed.hostname if parsed.hostname else '?.?.?'
if path != '' and path != '/':
last_path = parsed.path.split('/')[-1]
if len(last_path) < 10:
if len(path) < 10:
return hostname + '/' + path
else:
return hostname + '/..' + parsed.path[-10:]
elif len(last_path) > 10:
return hostname + '/..' + last_path[:5]
else:
return hostname + '/..' + last_path
else:
return hostname
def Url(self):
return self._request.url
def SetEdgeCost(self, child, cost):
assert child.Node() in self._node.Successors()
self._edge_costs[child] = cost
def AddEdgeAnnotation(self, s, annotation):
assert s.Node() in self._node.Successors()
self._edge_annotations.setdefault(s, []).append(annotation)
def ReparentTo(self, old_parent, new_parent):
"""Move costs and annotatations from old_parent to new_parent.
Also updates the underlying node connections, ie, do not call
old_parent.RemoveSuccessor(), etc.
Args:
old_parent: the _NodeInfo of a current parent of self. We assert this
is actually a parent.
new_parent: the _NodeInfo of the new parent. We assert it is not already
a parent.
"""
assert old_parent.Node() in self.Node().Predecessors()
assert new_parent.Node() not in self.Node().Predecessors()
edge_annotations = old_parent._edge_annotations.pop(self, [])
edge_cost = old_parent._edge_costs.pop(self)
old_parent.Node().RemoveSuccessor(self.Node())
new_parent.Node().AddSuccessor(self.Node())
new_parent.SetEdgeCost(self, edge_cost)
for a in edge_annotations:
new_parent.AddEdgeAnnotation(self, a)
def __eq__(self, o):
return self.Node().Index() == o.Node().Index()
def __hash__(self):
return hash(self.Node().Index())
def _ShortName(self, node):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[node.Index()].ShortName()
def _Url(self, node):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[node.Index()].Url()
def _NodeCost(self, node):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[node.Index()].NodeCost()
def _EdgeCost(self, parent, child):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[parent.Index()].EdgeCost(
self._node_info[child.Index()])
def _EdgeAnnotation(self, parent, child):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[parent.Index()].EdgeAnnotation(
self._node_info[child.Index()])
def _BuildDag(self, trace):
"""Build DAG of resources.
Build a DAG from our requests and augment with _NodeInfo (see above) in a
parallel array indexed by Node.Index().
Creates self._nodes and self._node_info.
Args:
trace: A LoadingTrace.
"""
self._nodes = []
self._node_info = []
index_by_request = {}
for request in trace.request_track.GetEvents():
next_index = len(self._nodes)
assert request not in index_by_request
index_by_request[request] = next_index
node = dag.Node(next_index)
node_info = self._NodeInfo(node, request)
if self._content_lens:
node_info.SetRequestContent(
self._content_lens.IsAdRequest(request),
self._content_lens.IsTrackingRequest(request))
self._nodes.append(node)
self._node_info.append(node_info)
dependencies = request_dependencies_lens.RequestDependencyLens(
trace).GetRequestDependencies()
for parent_rq, child_rq, reason in dependencies:
parent = self._node_info[index_by_request[parent_rq]]
child = self._node_info[index_by_request[child_rq]]
edge_cost = child.StartTime() - parent.EndTime()
if edge_cost < 0:
edge_cost = 0
if child.StartTime() < parent.StartTime():
logging.error('Inverted dependency: %s->%s',
parent.ShortName(), child.ShortName())
# Note that child.StartTime() < parent.EndTime() appears to happen a
# fair amount in practice.
parent.Node().AddSuccessor(child.Node())
parent.SetEdgeCost(child, edge_cost)
parent.AddEdgeAnnotation(child, reason)
def _SplitChildrenByTime(self, parent):
"""Split children of a node by request times.
    The initiator of a request may not be its true dependency. For example, a
    script may appear to load several resources independently, but in fact one
    of them may be a JSON data file and the remaining resources may be assets
    described in that JSON. The assets should then depend upon the JSON data
    file, and not the original script.
    This function approximates that by rearranging the children of a node
    according to their request times. The predecessor of each child is made to
    be the sibling with the greatest end time that is still before the start
    time of the child.
We do this by sorting the nodes twice, once by start time and once by end
time. We mark the earliest end time, and then we walk the start time list,
advancing the end time mark when it is less than our current start time.
This is refined by only considering assets which we believe actually create
a dependency. We only split if the original parent is a script, and the new
parent a data file. We confirm these relationships heuristically by loading
    pages multiple times and ensuring that dependencies do not change; see
CheckImageLoadConsistency() for details.
We incorporate this heuristic by skipping over any non-script/json resources
when moving the end mark.
    TODO(mattcary): More heuristics, like incorporating cacheability somehow, and
not just picking arbitrarily if there are two nodes with the same end time
(does that ever really happen?)
Args:
parent: the NodeInfo whose children we are going to rearrange.
"""
if parent.ContentType() not in self._CAN_BE_TIMING_PARENT:
return # No dependency changes.
children_by_start_time = [self._node_info[s.Index()]
for s in parent.Node().Successors()]
children_by_start_time.sort(key=lambda c: c.StartTime())
children_by_end_time = [self._node_info[s.Index()]
for s in parent.Node().Successors()]
children_by_end_time.sort(key=lambda c: c.EndTime())
end_mark = 0
for current in children_by_start_time:
if current.StartTime() < parent.EndTime() - 1e-5:
logging.warning('Child loaded before parent finished: %s -> %s',
parent.ShortName(), current.ShortName())
go_to_next_child = False
while end_mark < len(children_by_end_time):
if children_by_end_time[end_mark] == current:
go_to_next_child = True
break
elif (children_by_end_time[end_mark].ContentType() not in
self._CAN_MAKE_TIMING_DEPENDENCE):
end_mark += 1
elif (end_mark < len(children_by_end_time) - 1 and
children_by_end_time[end_mark + 1].EndTime() <
current.StartTime()):
end_mark += 1
else:
break
if end_mark >= len(children_by_end_time):
break # It's not possible to rearrange any more children.
if go_to_next_child:
continue # We can't rearrange this child, but the next child may be
# eligible.
if children_by_end_time[end_mark].EndTime() <= current.StartTime():
current.ReparentTo(parent, children_by_end_time[end_mark])
children_by_end_time[end_mark].AddEdgeAnnotation(current, 'timing')
def _ContentTypeToColor(self, content_type):
if not content_type:
type_str = 'other'
elif '/' in content_type:
kind, type_str = content_type.split('/', 1)
if kind in self._CONTENT_KIND_TO_COLOR:
return self._CONTENT_KIND_TO_COLOR[kind]
else:
type_str = content_type
return self._CONTENT_TYPE_TO_COLOR[type_str]
def _GraphvizNode(self, index, highlight):
"""Returns a graphviz node description for a given node.
Args:
index: index of the node.
highlight: a list of node items to emphasize. Any resource url which
contains any highlight text will be distinguished in the output.
Returns:
A string describing the resource in graphviz format.
The resource is color-coded according to its content type, and its shape
      is oval if its max-age is at most 300s (or if it is not cacheable).
"""
node_info = self._node_info[index]
color = self._ContentTypeToColor(node_info.ContentType())
max_age = node_info.Request().MaxAge()
shape = 'polygon' if max_age > 300 else 'oval'
styles = ['filled']
if highlight:
for fragment in highlight:
if fragment in node_info.Url():
styles.append('dotted')
break
if node_info.IsAd() or node_info.IsTracking():
styles += ['bold', 'diagonals']
return ('%d [label = "%s\\n%.2f->%.2f (%.2f)"; style = "%s"; '
'fillcolor = %s; shape = %s];\n'
% (index, node_info.ShortName(),
node_info.StartTime() - self._global_start,
node_info.EndTime() - self._global_start,
node_info.EndTime() - node_info.StartTime(),
','.join(styles), color, shape))
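  # A node line emitted by _GraphvizNode looks roughly like (hypothetical
  # values):
  #   3 [label = "example.com/..veryl\n12.00->45.00 (33.00)"; style = "filled";
  #   fillcolor = blue; shape = polygon];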
def _ExtractImages(self):
"""Return interesting image resources.
Uninteresting image resources are things like ads that we don't expect to be
constant across fetches.
Returns:
Dict of image url + short name to NodeInfo.
"""
image_to_info = {}
for n in self._node_info:
if (n.ContentType().startswith('image') and
not self._IsAdUrl(n.Url())):
key = str((n.Url(), n.ShortName(), n.StartTime()))
assert key not in image_to_info, n.Url()
image_to_info[key] = n
return image_to_info